CWB
|
Macros | |
#define | popc(s, p) s[p++] |
#define | pushc(s, c, p, m) s[p++] = c; if (p>=m) goto endloop; |
Functions | |
void | maptable_init_identity (unsigned char *maptable) |
Initialise an "identity" mapping table. More... | |
void | maptable_init_both (unsigned char *maptable, const unsigned char *nocasetable, const unsigned char *nodiactable) |
Initialise a "fold both case and diacritics" mapping table. More... | |
unsigned char * | cl_string_maptable (CorpusCharset charset, int flags) |
Gets a specified character mapping table for use in regular expressions. More... | |
int | cl_string_zap_controls (char *s, CorpusCharset charset, char replace, int zap_tabs, int zap_newlines) |
Replaces any invalid control characters in a string. More... | |
int | cl_string_utf8_continuation_byte (unsigned char byte) |
Checks whether a given byte is a UTF-8 continuation byte. More... | |
size_t | cl_charset_strlen (CorpusCharset charset, char *s) |
int | cl_string_validate_encoding (char *s, CorpusCharset charset, int repair) |
Checks the encoding of a string. More... | |
char * | cl_string_reverse (const char *s, CorpusCharset charset) |
Creates a "backwards" version of the specified string. More... | |
int | cl_string_qsort_compare (const char *s1, const char *s2, CorpusCharset charset, int flags, int reverse) |
Compares two strings in a qsort-style. More... | |
int | cl_id_validate (char *s) |
Checks a string to see if it is a valid CWB identifier. More... | |
void | cl_id_toupper (char *s) |
Converts a lowercase corpus name to an equivalent uppercase form. More... | |
void | cl_id_tolower (char *s) |
Converts an uppercase corpus name to an equivalent lowercase form. More... | |
void | cl_string_canonical (char *s, CorpusCharset charset, int flags) |
Converts a string to canonical form. More... | |
int | cl_iso_char_is_alphanumeric (unsigned char c, CorpusCharset charset) |
Checks whether a character is alphanumeric in the given ISO-8859 character set. More... | |
void | cl_path_adjust_os (char *path) |
Standardises subdirectory-dividers in a string that represents a path, in an OS-sensitive way. More... | |
void | cl_path_adjust_independent (char *path) |
Standardises subdirectory-dividers in a string that represents a path into Unix-like form (ie with forward-slash), regardless of what OS we are in. More... | |
char * | cl_path_registry_quote (char *path) |
Add quotes and escape slashes to a file path if necessary. More... | |
char * | cl_path_get_component (char *s) |
Tokenises a string into components split by ':' (or ';' under Win32). More... | |
char * | cl_string_latex2iso (char *str, char *result, int target_len) |
Converts ASCII strings with latex-style backslash escapes for accented characters to ISO-8859-1 (Latin-1). More... | |
char * | cl_xml_entity_decode (char *s) |
Decode XML entities in a string. More... | |
char * | cl_strcpy (char *buf, const char *src) |
Replacement for strcpy that won't copy more than CL_MAX_LINE_LENGTH characters. More... | |
ClAutoString | cl_autostring_new (const char *data, size_t init_bytes) |
Creates a new autostring object. More... | |
void | cl_autostring_delete (ClAutoString string) |
Delete an autostring object. More... | |
void | cl_autostring_set_increment (ClAutoString string, size_t new_increment) |
Changes the increment size (measured in bytes). More... | |
char * | cl_autostring_ptr (ClAutoString string) |
Get a pointer to the string data inside the AutoString (or NULL if the object is NULL). More... | |
size_t | cl_autostring_len (ClAutoString string) |
Get the length of the currently-stored string (or negative value in case NULL object is passed). More... | |
void | cl_autostring_reclaim_mem (ClAutoString string) |
Tries to free up unused memory by making the AutoString use only as many increments of size as necessary. More... | |
void | cl_autostring_copy (ClAutoString dst, const char *src) |
Copy the string in src into the AutoString in dst, automatically reallocating memory if necessary. More... | |
void | cl_autostring_concat (ClAutoString dst, const char *src) |
Concatenate the string src onto the end of the AutoString in dst, automatically reallocating memory if necessary. More... | |
void | cl_autostring_truncate (ClAutoString string, int new_length) |
Truncates the AutoString to the length specified. More... | |
void | cl_autostring_dump (ClAutoString string) |
Debug function: dumps the contents of an AutoString to stderr. More... | |
Variables | |
unsigned char | identity_tab [unknown_charset][256] |
Array of mapping tables used when NEITHER case NOR diacritics are to be stripped. More... | |
int | identity_tab_init [unknown_charset] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} |
unsigned char | nocase_nodiac_tab [unknown_charset][256] |
Array of mapping tables used when BOTH case AND diacritics are to be stripped. More... | |
int | nocase_nodiac_tab_init [unknown_charset] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} |
unsigned char | nodiac_tab [unknown_charset][256] |
Array of tables mapping a character (the index) to the equivalent character without any accents (the value). More... | |
unsigned char | nocase_tab [unknown_charset][256] |
Array of tables mapping a character (the index) to the equivalent character in lowercase (the value). More... | |
unsigned char | checktable_is_alphanum [unknown_charset][256] |
int | cl_allow_latex2iso = 0 |
Boolean switch enabling/disabling latex-style escapes. More... | |
#define popc | ( | s, | |
p | |||
) | s[p++] |
Referenced by cl_string_latex2iso().
#define pushc | ( | s, | |
c, | |||
p, | |||
m | |||
) | s[p++] = c; if (p>=m) goto endloop; |
Referenced by cl_string_latex2iso().
void cl_autostring_concat | ( | ClAutoString | dst, |
const char * | src | ||
) |
Concatenate the string src onto the end of the AutoString in dst, automatically reallocating memory if necessary.
References ClAutoString::bytes_allocated, cl_realloc(), ClAutoString::data, ClAutoString::increment, and ClAutoString::len.
Referenced by compose_kwic_line(), get_field_separators(), get_position_values(), and get_print_attribute_values().
void cl_autostring_copy | ( | ClAutoString | dst, |
const char * | src | ||
) |
Copy the string in src into the AutoString in dst, automatically reallocating memory if necessary.
References ClAutoString::bytes_allocated, cl_realloc(), ClAutoString::data, ClAutoString::increment, and ClAutoString::len.
void cl_autostring_delete | ( | ClAutoString | string | ) |
Delete an autostring object.
References cl_free, and ClAutoString::data.
Referenced by cleanup_kwic_line_memory().
void cl_autostring_dump | ( | ClAutoString | string | ) |
Debug function: dumps the contents of an AutoString to stderr.
References ClAutoString::bytes_allocated, ClAutoString::data, ClAutoString::increment, and ClAutoString::len.
size_t cl_autostring_len | ( | ClAutoString | string | ) |
Get the length of the currently-stored string (or negative value in case NULL object is passed).
Equivalent to reading the ->len member, except this function checks for a NULL!
ClAutoString cl_autostring_new | ( | const char * | data, |
size_t | init_bytes | ||
) |
Creates a new autostring object.
The string is initialised to data (or to a zero-length string if data is NULL).
Initially, init_bytes is allocated (and the increment step is the same size), unless the string is longer... in which case the length of the string becomes the initial amount of memory allocated.
Use 0 for init_bytes, and the length of the specified string is used as the initial allocation.
References ClAutoString::bytes_allocated, cl_malloc(), CL_MAX_LINE_LENGTH, ClAutoString::data, ClAutoString::increment, and ClAutoString::len.
Referenced by get_field_separators(), and setup_kwic_line_memory().
char* cl_autostring_ptr | ( | ClAutoString | string | ) |
Get a pointer to the string data inside the AutoString (or NULL if the object is NULL).
Equivalent to reading the ->data member, except this function checks for a NULL!
Referenced by compose_kwic_line().
void cl_autostring_reclaim_mem | ( | ClAutoString | string | ) |
Tries to free up unused memory by making the AutoString use only as many increments of size as necessary.
References cl_realloc(), ClAutoString::data, ClAutoString::increment, and ClAutoString::len.
void cl_autostring_set_increment | ( | ClAutoString | string, |
size_t | new_increment | ||
) |
Changes the increment size (measured in bytes).
Whenever memory reallocation is necessary, the AutoString will request a multiple of its increment value.
void cl_autostring_truncate | ( | ClAutoString | string, |
int | new_length | ||
) |
Truncates the AutoString to the length specified.
Note, does not respect UTF-8 encoding, so if the string is UTF8 you need to ascertain in advance that the cut-off does not break any UTF-8 characters into bits.
This function should be used if the character buffer is tampered with by direct access (which of course will not update the internal member of the object that tracks string length....).
References ClAutoString::len.
Referenced by compose_kwic_line(), get_field_separators(), get_position_values(), and setup_kwic_line_memory().
size_t cl_charset_strlen | ( | CorpusCharset | charset, |
char * | s | ||
) |
References utf8.
Referenced by compose_kwic_line().
void cl_id_tolower | ( | char * | s | ) |
Converts an uppercase corpus name to an equivalent lowercase form.
String is modified in situ. Only the ASCII characters are changed.
Note, this function doesn't check for what is and is not an allowed CWB-corpus-name character.
Referenced by changecase_string(), changecase_string_no_copy(), cl_new_corpus(), encode_generate_registry_file(), and main().
void cl_id_toupper | ( | char * | s | ) |
Converts a lowercase corpus name to an equivalent uppercase form.
String is modified in situ. Only the ASCII characters are changed.
Note, this function doesn't check for what is and is not an allowed CWB-corpus-name character.
The old version of this code was a line in cwb-encode that used the library toupper to cope with Latin1 characters. But these are no longer allowed in identifiers, which must be ASCII only.
Referenced by changecase_string(), changecase_string_no_copy(), encode_generate_registry_file(), and main().
int cl_id_validate | ( | char * | s | ) |
Checks a string to see if it is a valid CWB identifier.
The rules for these are as follows (see also the CQP lexer):
TODO: should the CL registry lexer be amended to reflect these restrictions? (ID there is rather laxer than this)
s | The string to check. |
Referenced by cl_new_corpus(), and encode_generate_registry_file().
int cl_iso_char_is_alphanumeric | ( | unsigned char | c, |
CorpusCharset | charset | ||
) |
Checks whether a character is alphanumeric in the given ISO-8859 character set.
This function is exported but NOT via cl.h - it is only for the use of CWB utilities. It is not part of the standard API.
Returns false if charset is utf8.
c | The character to check. |
charset | The character set to check against. |
References charset, checktable_is_alphanum, and utf8.
Referenced by scancorpus_word_is_regular().
void cl_path_adjust_independent | ( | char * | path | ) |
Standardises subdirectory-dividers in a string that represents a path into Unix-like form (ie with forward-slash), regardless of what OS we are in.
Or, to put it another way, changes backslashes into forward slashes under Windows.
This may be useful because of the need to move corpora between systems
Note that the path is modified in place.
path | The path to modify (must be Ascii-compatible) |
References SUBDIR_SEPARATOR.
void cl_path_adjust_os | ( | char * | path | ) |
Standardises subdirectory-dividers in a string that represents a path, in an OS-sensitive way.
If the CL was compiled for Unix, backslash is changed to forwardslash. If the CL was compiled for Windows, forwardslash is changed to backslash.
Note that the path is modified in place.
path | The path to modify (must be Ascii-compatible) |
References SUBDIR_SEPARATOR.
char* cl_path_get_component | ( | char * | s | ) |
Tokenises a string into components split by ':' (or ';' under Win32).
s | The string to tokenise; or, NULL if tokenisation has already been initialised. |
References last, and PATH_SEPARATOR.
char* cl_path_registry_quote | ( | char * | path | ) |
Add quotes and escape slashes to a file path if necessary.
This is for the HOME and INFO fields of the registry file.
If either field contains any characters that can't be treated as an "ID" token by the registry parser, then we make sure it is treated as a string (quoted) instead, and make all appropriate substitutions.
For consistency, this function always returns a newly allocated string, regardless of whether changes have been made.
Note that the way the registry parser works, it is quite happy with either "C:\dir\subdir" or "C:\\dir\\subdir" as a path for HOME or INFO.
path | String containing the path to quotify. |
References cl_malloc(), and cl_strdup().
Referenced by encode_generate_registry_file().
char* cl_strcpy | ( | char * | buf, |
const char * | src | ||
) |
Replacement for strcpy that won't copy more than CL_MAX_LINE_LENGTH characters.
This is intended to make it easier to evade buffer overflows. But it doesn't protect against the opposite danger of losing important data from the end of a truncated string.
Note, buffer overflow is still possible if buf is a pointer to the middle of a buffer.
So this function is not a panacea, it's just a bit of a help.
It's also implemented in a way that is safe for down-strcpying, that is, when we are erasing a section from the start/middle of the string (cl_strcpy(string, string+3); for instance). The POSIX standard states that the normal strcpy has undefined behaviour if the objects overlap. That's not the case here.
buf | A string buffer to copy to. |
src | The string pointer to copy from. |
References buf, and CL_MAX_LINE_LENGTH.
Referenced by cl_string_canonical(), create_feature_maps(), encode_get_input_line(), findcorpus(), ParsePrintOptions(), and range_declare().
void cl_string_canonical | ( | char * | s, |
CorpusCharset | charset, | ||
int | flags | ||
) |
Converts a string to canonical form.
The "canonical form" of a string is for use in comparisons where case-insensitivity and/or diacritic insensitivity is desired.
Note that the string s is modified in place. This means it must have enough memory to cope with any expansions made in Unicode case folding. Ideally, allocate double the length of the string (since case-folding doesn't include any one -> more-than-two mappings so far as I know).
Note also that the arguments of this function were changed in v3.2.1. Now, a CorpusCharset is needed. This is because string canonicalising works differently in UTF8, where case folding / accent folding is done by calling Unicode-aware functions. By contrast, the process for Latin1 just uses a straightforward mapping table for both sorts of folding.
In UTF8, an additional flag REQUIRE_NFC can be passed to normalize the string into the canonical pre-composed form (NFC) used internally by CWB. All strings that are going to be inserted into or searched for within an indexed corpus should be processed in this way.
s | The string. |
charset | The character set in which the string is encoded. If this is utf8, complex accent and/or case folding will be done, as per the Unicode standard. If it is anything else, internal byte mapping tables will be used. |
flags | The flags that specify which conversions are required. Can be IGNORE_CASE | IGNORE_DIAC | REQUIRE_NFC . |
References ascii, cl_free, cl_strcpy(), cl_string_maptable(), IGNORE_CASE, IGNORE_DIAC, REQUIRE_NFC, unknown_charset, and utf8.
Referenced by cl_new_regex(), cl_regex_match(), cl_string_qsort_compare(), create_feature_maps(), encode_get_input_line(), print_tabulation(), regopt_data_copy_to_regex_object(), sencode_parse_line(), SortExternally(), SortSubcorpus(), and VerifyVariable().
char* cl_string_latex2iso | ( | char * | str, |
char * | result, | ||
int | target_len | ||
) |
Converts ASCII strings with latex-style backslash escapes for accented characters to ISO-8859-1 (Latin-1).
Syntax:
\"[AaOoUus..] --> corresponding ISO 8859-1 character
\{octal} --> ISO 8859-1 character
Note that if cl_allow_latex2iso is FALSE, this function will simply copy the input to the output. So it is always safe to call this function.
str | The string to convert. |
result | The location to put the altered string (which should be shorter, or at least no longer than, the input string). If this parameter is NULL, space is automatically allocated for the output. result is allowed to be the same as str. |
target_len | The maximum length of the target string. If result is NULL, then this is deduced automatically. |
References cl_allow_latex2iso, cl_malloc(), cl_strdup(), popc, and pushc.
Referenced by cl_new_regex(), do_flagged_string(), do_SetVariableValue(), and do_XMLTag().
unsigned char* cl_string_maptable | ( | CorpusCharset | charset, |
int | flags | ||
) |
Gets a specified character mapping table for use in regular expressions.
Returns pointer to static mapping table for given flags (IGNORE_CASE and IGNORE_DIAC) and character set.
Removed from the public API for 3.2.0 because there's no way for it to work if the CorpusCharset is UTF8. Prototype moved to special-chars.h
Tables exist for all character sets, but for all except Latin1 and ASCII, they are currently identical to the ASCII tables (i.e. the awareness of case/accent relationships in the upper half of each character set have not yet been inserted).
charset | The character set of this corpus. Currently ignored. |
flags | The flags that specify which table is required. Can be IGNORE_CASE and/or IGNORE_DIAC. |
References ascii, charset, identity_tab, identity_tab_init, IGNORE_CASE, IGNORE_DIAC, maptable_init_both(), maptable_init_identity(), nocase_nodiac_tab, nocase_nodiac_tab_init, nocase_tab, nodiac_tab, and utf8.
Referenced by cl_string_canonical().
int cl_string_qsort_compare | ( | const char * | s1, |
const char * | s2, | ||
CorpusCharset | charset, | ||
int | flags, | ||
int | reverse | ||
) |
Compares two strings in a qsort-style.
This function is designed to be suitable for use as a callback with qsort(). As such, its return values are negative if s1 is "less than" s2; zero if the two strings are the same; and positive if s1 is "greater than" s2. But of course you can also use it on its own.
You cannot use it directly with qsort as its parameters are wrong. It needs to be wrapped in another function that (at least) provides the charset, flags and reverse arguments (e.g. from global variables or by calling other functions).
The two strings must be in the same character set. Both will be made canonical in accordance with the flags argument if it is set. Also, the comparison can be done on reverse-order strings.
Note that if either flags or reverse is non-zero, then memory allocation will be necessary. If you are calling this function in a loop, that could quickly get costly. To avoid this, a pair of one-time-allocated buffers are used - but this doesn't dispense with all need for allocation. [Another option would be to allow a buffer to be optionally supplied....]
If charset == utf8 and strings are passed in from external sources, the flag REQUIRE_NFC should always be specified to obtain consistent results.
s1 | First string to compare. |
s2 | Second string to compare. |
charset | Character set of the two strings. |
flags | IGNORE_CASE, IGNORE_DIAC, REQUIRE_NFC |
reverse | Boolean: if true, strings are compared from end to beginning, rather than beginning to end. |
References cl_free, cl_malloc(), CL_MAX_LINE_LENGTH, cl_string_canonical(), s1, s2, and utf8.
Referenced by i2compare().
char* cl_string_reverse | ( | const char * | s, |
CorpusCharset | charset | ||
) |
Creates a "backwards" version of the specified string.
The memory for the reversed string is newly allocated. (This is potentially wasteful, but it occurs in the depths of GLib, so short of reinventing the wheel we have to live with it.)
s | String to reverse. |
charset | The character set of the string. |
References cl_strdup(), and utf8.
Referenced by SortExternally(), and SortSubcorpus().
int cl_string_utf8_continuation_byte | ( | unsigned char | byte | ) |
Checks whether a given byte is a UTF-8 continuation byte.
Byte to check.
Referenced by compose_kwic_line().
int cl_string_validate_encoding | ( | char * | s, |
CorpusCharset | charset, | ||
int | repair | ||
) |
Checks the encoding of a string.
This function looks for bad bytes (or byte sequences in the case of UTF8); if any are present, it judges the string invalid.
The string can optionally be "repaired" in-place by replacing bad bytes with '?' characters. If the "repair" is successful, the function returns True.
What counts as "bad" is of course relative to the character set that the string is encoded in - so this must be specified.
s | Null-terminated string to check. |
charset | CorpusCharset of the string's encoding. |
repair | if True, replace invalid bytes by '?' |
References arabic, ascii, cyrillic, greek, hebrew, latin1, latin2, latin3, latin4, latin5, latin6, latin7, latin8, latin9, and utf8.
Referenced by create_feature_maps(), do_flagged_re_variable(), encode_get_input_line(), prepare_Query(), printAlignedStrings(), sencode_parse_line(), and VerifyVariable().
int cl_string_zap_controls | ( | char * | s, |
CorpusCharset | charset, | ||
char | replace, | ||
int | zap_tabs, | ||
int | zap_newlines | ||
) |
Replaces any invalid control characters in a string.
"Invalid" control characters are any below 0x20.
The string is modified in situ. A typical "replace" to use would be '?' to match the action of cl_string_validate_encoding.
s | The string to modify. |
charset | The character set of the string. |
replace | The replacement character to use. If this is 0, the character is deleted rather than replaced. |
zap_tabs | Whether or not tabs should be zapped (boolean). |
zap_newlines | Whether or not \n and \r should be zapped (boolean). |
Referenced by encode_get_input_line(), and sencode_parse_line().
char* cl_xml_entity_decode | ( | char * | s | ) |
Decode XML entities in a string.
This function decodes pre-defined XML entities in string s. It overwrites the input string s and also returns s for convenience.
(The entities are &lt; &gt; &amp; &quot; &apos;).
TODO – numeric entities?
If passed NULL, it will not fall over - it will just pass NULL back!
This function is safe for strings in any encoding. The returned string will be at the same memory location and will always be the same length or shorter after the decoding of entities.
s | A string to decode. |
Referenced by encode_add_wattr_line(), and range_open().
void maptable_init_both | ( | unsigned char * | maptable, |
const unsigned char * | nocasetable, | ||
const unsigned char * | nodiactable | ||
) |
Initialise a "fold both case and diacritics" mapping table.
Referenced by cl_string_maptable().
void maptable_init_identity | ( | unsigned char * | maptable | ) |
Initialise an "identity" mapping table.
Referenced by cl_string_maptable().
unsigned char checktable_is_alphanum[unknown_charset][256] |
Referenced by cl_iso_char_is_alphanumeric().
int cl_allow_latex2iso = 0 |
Boolean switch enabling/disabling latex-style escapes.
By default, it is false; if programs wish to allow these escapes they need to offer some means of changing this variable.
Note that enabling this variable may cause scrambling of the string for LatinX strings where X is not 1; and may cause undefined errors for UTF8 strings. In short, you should only activate it when you are working with a corpus whose charset is Latin1.
Referenced by cl_string_latex2iso().
unsigned char identity_tab[unknown_charset][256] |
Array of mapping tables used when NEITHER case NOR diacritics are to be stripped.
These are composite tables: they are only generated when needed (the corresponding identity_tab_init value is a boolean indicating whether this has been done yet).
Use a CorpusCharset value as the index into this array.
Referenced by cl_string_maptable().
int identity_tab_init[unknown_charset] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} |
Referenced by cl_string_maptable().
unsigned char nocase_nodiac_tab[unknown_charset][256] |
Array of mapping tables used when BOTH case AND diacritics are to be stripped.
These are composite tables: they are only generated when needed (the corresponding nocase_nodiac_tab_init value is a boolean indicating whether this has been done yet).
Use a CorpusCharset value as the index into this array.
Referenced by cl_string_maptable().
int nocase_nodiac_tab_init[unknown_charset] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} |
Referenced by cl_string_maptable().
unsigned char nocase_tab[unknown_charset][256] |
Array of tables mapping a character (the index) to the equivalent character in lowercase (the value).
There are as many tables as there are possible values of CorpusCharset. Moreover, tables must always be in the same order as the values of CorpusCharset are declared in.
This means starting at ascii == 0 and working up through the canonical order that is observable in cl.h
Use a CorpusCharset value as the index into this array.
Referenced by cl_string_maptable().
unsigned char nodiac_tab[unknown_charset][256] |
Array of tables mapping a character (the index) to the equivalent character without any accents (the value).
There are as many tables as there are possible values of CorpusCharset. Moreover, tables must always be in the same order as the values of CorpusCharset are declared in.
This means starting at ascii == 0 and working up through the canonical order that is observable in cl.h
Use a CorpusCharset value as the index into this array.
Referenced by cl_string_maptable().