CWB
Data Structures | Macros | Typedefs | Functions | Variables
cdaccess.c File Reference
#include <stdarg.h>
#include <math.h>
#include <ctype.h>
#include <sys/types.h>
#include <errno.h>
#include "globals.h"
#include "endian.h"
#include "macros.h"
#include "attributes.h"
#include "special-chars.h"
#include "bitio.h"
#include "compression.h"
#include "regopt.h"
#include "cdaccess.h"

Data Structures

struct  _position_stream_rec_
 Underlying structure for the PositionStream object. More...
 

Macros

#define COMPRESS_DEBUG   0
 If COMPRESS_DEBUG is set to a positive integer, cl_cpos2id() will print debugging messages. More...
 
#define check_arg(arg, atyp, rval)
 Checks an Attribute passed as a function argument for usability in that function. More...
 

Typedefs

typedef struct
_position_stream_rec_ 
PositionStreamRecord
 Underlying structure for the PositionStream object. More...
 

Functions

int cl_strcmp (char *s1, char *s2)
 CL internal string comparison (uses signed char on all platforms). More...
 
char * cl_error_string (int error_num)
 Gets a string describing the error identified by an error number. More...
 
void cl_error (char *message)
 Prints an error message, together with a string identifying the current error number. More...
 
char * cl_id2str (Attribute *attribute, int id)
 Gets the string that corresponds to the specified item on the given P-attribute. More...
 
int cl_str2id (Attribute *attribute, char *id_string)
 Gets the ID code that corresponds to the specified string on the given P-attribute. More...
 
int cl_id2strlen (Attribute *attribute, int id)
 Calculates the length of the string that corresponds to the specified item on the given P-attribute. More...
 
int cl_sort2id (Attribute *attribute, int sort_index_position)
 Gets the ID code of the item at the specified position in the Attribute's sorted wordlist index. More...
 
int cl_id2sort (Attribute *attribute, int id)
 Gets the position in the Attribute's sorted wordlist index of the item with the specified ID code. More...
 
int cl_sequence_compressed (Attribute *attribute)
 Checks whether the item sequence of the given P-attribute is compressed. More...
 
int cl_index_compressed (Attribute *attribute)
 Check whether the reverse-corpus index (inverted file) of the given P-attribute is compressed. More...
 
int cl_max_cpos (Attribute *attribute)
 Gets the maximum position on this P-attribute (ie the size of the attribute). More...
 
int cl_max_id (Attribute *attribute)
 Gets the maximum id on this P-attribute (ie the range of the attribute's ID codes). More...
 
int cl_id2freq (Attribute *attribute, int id)
 Gets the frequency of an item on this attribute. More...
 
int * cl_id2cpos_oldstyle (Attribute *attribute, int id, int *freq, int *restrictor_list, int restrictor_list_size)
 Gets all the corpus positions where the specified item is found on the given P-attribute. More...
 
PositionStream cl_new_stream (Attribute *attribute, int id)
 Creates a new PositionStream object. More...
 
int cl_delete_stream (PositionStream *ps)
 Deletes a PositionStream object. More...
 
int cl_read_stream (PositionStream ps, int *buffer, int buffer_size)
 Reads corpus positions from a position stream to a buffer. More...
 
int cl_cpos2id (Attribute *attribute, int position)
 Gets the integer ID of the item at the specified position on the given p-attribute. More...
 
char * cl_cpos2str (Attribute *attribute, int position)
 Gets the string of the item at the specified position on the given p-attribute. More...
 
char * cl_id2all (Attribute *attribute, int index, int *freq, int *slen)
 Gets the string of the item with the specified ID on the given p-attribute. More...
 
int * cl_regex2id (Attribute *attribute, char *pattern, int flags, int *number_of_matches)
 Gets a list of the ids of those items on a given Attribute that match a particular regular-expression pattern. More...
 
int cl_idlist2freq (Attribute *attribute, int *word_ids, int number_of_words)
 Calculates the total frequency of all items on a list of item IDs. More...
 
static int intcompare (const void *i, const void *j)
 internal function for use with qsort More...
 
int * cl_idlist2cpos_oldstyle (Attribute *attribute, int *word_ids, int number_of_words, int sort, int *size_of_table, int *restrictor_list, int restrictor_list_size)
 Gets a list of corpus positions matching a list of ids. More...
 
int * get_previous_mark (int *data, int size, int position)
 Gets a pointer to the location where a structure is stored. More...
 
int cl_cpos2struc (Attribute *a, int cpos)
 Gets the ID number of a structure (instance of an s-attribute) that is found at the given corpus position. More...
 
int cl_cpos2boundary (Attribute *a, int cpos)
 Compares the location of a corpus position to the regions of an s-attribute. More...
 
int cl_max_struc (Attribute *a)
 Gets the maximum for this S-attribute (ie the size of the S-attribute). More...
 
int cl_cpos2struc2cpos (Attribute *attribute, int position, int *struc_start, int *struc_end)
 Gets the start and end positions of the instance of the given S-attribute found at the specified corpus position. More...
 
int cl_cpos2struc_oldstyle (Attribute *attribute, int position, int *struc_num)
 Gets the ID number of a structure (instance of an s-attribute) that is found at the given corpus position. More...
 
int cl_struc2cpos (Attribute *attribute, int struc_num, int *struc_start, int *struc_end)
 Retrieves the start-and-end corpus positions of a specified structure of the given s-attribute type. More...
 
int cl_max_struc_oldstyle (Attribute *attribute, int *nr_strucs)
 Gets the number of instances of an s-attribute in the corpus. More...
 
int cl_struc_values (Attribute *attribute)
 Checks whether this s-attribute has attribute values. More...
 
int s_v_comp (const void *v1, const void *v2)
 A non-exported function used by cl_struc2str. More...
 
char * cl_struc2str (Attribute *attribute, int struc_num)
 Gets the value that is associated with the specified instance of the given s-attribute. More...
 
char * structure_value_at_position (Attribute *struc, int position)
 Gets the value associated with the instance of the given s-attribute that occurs at the specified corpus position. More...
 
int get_alignment (int *data, int size, int position)
 Gets the id number of the alignment at the specified corpus position. More...
 
int get_extended_alignment (int *data, int size, int position)
 Gets the id number of the alignment at the specified corpus position. More...
 
int cl_cpos2alg2cpos_oldstyle (Attribute *attribute, int position, int *source_corpus_start, int *source_corpus_end, int *aligned_corpus_start, int *aligned_corpus_end)
 Gets the corpus positions of an alignment on the given align-attribute. More...
 
int cl_has_extended_alignment (Attribute *attribute)
 Checks whether an attribute's XALIGN component exists, that is, whether or not it has extended alignment. More...
 
int cl_max_alg (Attribute *attribute)
 Gets the number of alignments on this align-attribute. More...
 
int cl_cpos2alg (Attribute *attribute, int cpos)
 Gets the id number of the alignment at the specified corpus position. More...
 
int cl_alg2cpos (Attribute *attribute, int alg, int *source_region_start, int *source_region_end, int *target_region_start, int *target_region_end)
 Gets the corpus positions of an alignment on the given align-attribute. More...
 
int cl_dynamic_call (Attribute *attribute, DynCallResult *dcr, DynCallResult *args, int nr_args)
 Calls a dynamic attribute. More...
 
int cl_dynamic_numargs (Attribute *attribute)
 Count the number of arguments on a dynamic attribute's argument list. More...
 

Variables

int cl_errno = CDA_OK
 Error number for CL: is set after access to any of various corpus-data-access functions. More...
 

Macro Definition Documentation

#define check_arg (   arg,
  atyp,
  rval 
)
Value:
if (arg == NULL) { \
cl_errno = CDA_ENULLATT; return rval; \
} \
else if (arg->type != atyp) { \
cl_errno = CDA_EATTTYPE; return rval; \
}
#define CDA_ENULLATT
Error code: NULL passed as attribute argument.
Definition: cl.h:147
int cl_errno
Error number for CL: is set after access to any of various corpus-data-access functions.
Definition: cdaccess.c:45
#define CDA_EATTTYPE
Error code: function was called on illegal attribute.
Definition: cl.h:148

Checks an Attribute passed as a function argument for usability in that function.

(a) arg must not be NULL. (b) arg type has to be the type specified in atyp.

If these conditions are not fulfilled, the current function returns rval, and cl_errno is set.

Referenced by cl_cpos2alg2cpos_oldstyle(), cl_cpos2id(), cl_cpos2str(), cl_cpos2struc2cpos(), cl_cpos2struc_oldstyle(), cl_dynamic_call(), cl_dynamic_numargs(), cl_has_extended_alignment(), cl_id2all(), cl_id2cpos_oldstyle(), cl_id2freq(), cl_id2sort(), cl_id2str(), cl_id2strlen(), cl_idlist2cpos_oldstyle(), cl_idlist2freq(), cl_index_compressed(), cl_max_cpos(), cl_max_id(), cl_max_struc_oldstyle(), cl_new_stream(), cl_regex2id(), cl_sequence_compressed(), cl_sort2id(), cl_str2id(), cl_struc2cpos(), cl_struc2str(), and cl_struc_values().

#define COMPRESS_DEBUG   0

If COMPRESS_DEBUG is set to a positive integer, cl_cpos2id() will print debugging messages.

(2 prints more than 1!)

Referenced by cl_cpos2id().

Typedef Documentation

Underlying structure for the PositionStream object.

PositionStreams are used for accessing Attributes. Each one represents a stream of corpus positions, representing positions where a given item occurs.

Function Documentation

int cl_alg2cpos ( Attribute attribute,
int  alg,
int *  source_region_start,
int *  source_region_end,
int *  target_region_start,
int *  target_region_end 
)

Gets the corpus positions of an alignment on the given align-attribute.

Note that four corpus positions are retrieved, into the addresses given as parameters.

Parameters
attributeThe align-attribute to look on.
algThe ID of the alignment whose positions are wanted.
source_region_startLocation to put source corpus start position.
source_region_endLocation to put source corpus end position.
target_region_startLocation to put target corpus start position.
target_region_endLocation to put target corpus end position.
Returns
Boolean: true = all OK, false = problem.

References CDA_EIDXORNG, CDA_ENODATA, CDA_OK, cl_errno, cl_has_extended_alignment(), CompAlignData, CompXAlignData, TMblob::data, TComponent::data, ensure_component(), and TComponent::size.

Referenced by check_alignment_constraints(), compose_kwic_line(), decode_print_token_sequence(), do_cqi_cl_alg2cpos(), do_translate(), main(), and printAlignedStrings().

int cl_cpos2alg ( Attribute attribute,
int  cpos 
)

Gets the id number of the alignment at the specified corpus position.

Parameters
attributeThe align-attribute to look on.
cposThe corpus position to look at.
Returns
The id number of the alignment at this position, or a negative int error code.

References CDA_EALIGN, CDA_ENODATA, CDA_EPOSORNG, CDA_OK, cl_errno, cl_has_extended_alignment(), CompAlignData, CompXAlignData, TMblob::data, TComponent::data, ensure_component(), get_alignment(), get_extended_alignment(), and TComponent::size.

Referenced by check_alignment_constraints(), compose_kwic_line(), decode_print_token_sequence(), do_cqi_cl_cpos2alg(), do_translate(), and printAlignedStrings().

int cl_cpos2alg2cpos_oldstyle ( Attribute attribute,
int  position,
int *  source_corpus_start,
int *  source_corpus_end,
int *  aligned_corpus_start,
int *  aligned_corpus_end 
)

Gets the corpus positions of an alignment on the given align-attribute.

This is for old-style alignments only: it doesn't (can't) deal with extended alignments. Deprecated: use cl_alg2cpos instead (but note its parameters are not identical).

See also
cl_alg2cpos.
Parameters
attributeThe align-attribute to look on.
positionThe corpus position {??} of the alignment whose positions are wanted.
source_corpus_startLocation to put source corpus start position.
source_corpus_endLocation to put source corpus end position.
aligned_corpus_startLocation to put target corpus start position.
aligned_corpus_endLocation to put target corpus end position.
Returns
Boolean: true = all OK, false = problem.

References ATT_ALIGN, CDA_ENODATA, CDA_EPOSORNG, CDA_OK, check_arg, cl_errno, CompAlignData, TMblob::data, TComponent::data, ensure_component(), get_alignment(), and TComponent::size.

int cl_cpos2boundary ( Attribute a,
int  cpos 
)

Compares the location of a corpus position to the regions of an s-attribute.

This determines whether the specified corpus position is within a region (i.e. a structure, an instance of that s-attribute) on the given s-attribute; and/or on a boundary; or outside a region.

See also
STRUC_INSIDE
STRUC_LBOUND
STRUC_RBOUND
Parameters
aThe s-attribute on which to search.
cposThe corpus position to look for.
Returns
0 if this position is outside a region; some combination of flags if it is within a region or on a bound; or a negative number (error code) in case of error.

References CDA_ESTRUC, cl_cpos2struc2cpos(), cl_errno, STRUC_INSIDE, STRUC_LBOUND, and STRUC_RBOUND.

int cl_cpos2id ( Attribute attribute,
int  position 
)
char* cl_cpos2str ( Attribute attribute,
int  position 
)

Gets the string of the item at the specified position on the given p-attribute.

Parameters
attributeThe P-attribute to look on.
positionThe corpus position to look at.
Returns
The string of the item at that position on this attribute (pointer to actual data within the attribute, DO NOT FREE!), or NULL if there is an error.

References ATT_POS, CDA_OK, check_arg, cl_cpos2id(), cl_errno, and cl_id2str().

Referenced by alignshow_print_next_region(), decode_print_token_sequence(), do_cqi_cl_cpos2str(), get_position_values(), print_tabulation(), SortExternally(), and SortSubcorpus().

int cl_cpos2struc ( Attribute a,
int  cpos 
)

Gets the ID number of a structure (instance of an s-attribute) that is found at the given corpus position.

This is a wrapper of the "old" function get_num_of_struc() that normalises it to standard return value behaviour.

Parameters
aThe s-attribute on which to search.
cposThe corpus position to look for.
Returns
The number of the structure that is found.

References cl_cpos2struc_oldstyle(), and cl_errno.

Referenced by compose_kwic_line(), decode_print_surrounding_s_att_values(), decode_print_token_sequence(), do_cqi_cl_cpos2lbound(), do_cqi_cl_cpos2rbound(), do_cqi_cl_cpos2struc(), eval_constraint(), get_position_values(), and main().

int cl_cpos2struc2cpos ( Attribute attribute,
int  position,
int *  struc_start,
int *  struc_end 
)

Gets the start and end positions of the instance of the given S-attribute found at the specified corpus position.

This function finds one particular instance of the S-attribute, and assigns its start and end points to the locations given as arguments.

Parameters
attributeThe s-attribute to search.
positionThe corpus position to search for.
struc_startLocation for the start position of the instance.
struc_endLocation for the end position of the instance.

References ATT_STRUC, CDA_ENODATA, CDA_ESTRUC, CDA_OK, check_arg, cl_errno, CompStrucData, TMblob::data, TComponent::data, ensure_component(), get_previous_mark(), and TComponent::size.

Referenced by cl_cpos2boundary(), and decode_print_token_sequence().

int cl_cpos2struc_oldstyle ( Attribute attribute,
int  position,
int *  struc_num 
)

Gets the ID number of a structure (instance of an s-attribute) that is found at the given corpus position.

Deprecated function: use cl_cpos2struc.

See also
cl_cpos2struc
Parameters
attributeThe s-attribute on which to search.
positionThe corpus position to look for.
struc_numLocation where the number of the structure that is found will be put.
Returns
Boolean: true for all OK, false for error.

References ATT_STRUC, CDA_ENODATA, CDA_ESTRUC, CDA_OK, check_arg, cl_errno, CompStrucData, TMblob::data, TComponent::data, ensure_component(), get_previous_mark(), and TComponent::size.

Referenced by cl_cpos2struc().

int cl_delete_stream ( PositionStream ps)

Deletes a PositionStream object.

References BSclose().

Referenced by compress_reversed_index(), and decompress_check_reversed_index().

int cl_dynamic_call ( Attribute attribute,
DynCallResult dcr,
DynCallResult args,
int  nr_args 
)

Calls a dynamic attribute.

This is the attribute access function for dynamic attributes.

Parameters
attributeThe (dynamic) attribute in question.
dcrLocation for the result (*int or *char).
argsLocation of the parameters (of *int or *char).
nr_argsNumber of parameters.
Returns
Boolean: True for all OK, false for error.

References Dynamic_Attribute::arglist, ATT_DYN, ATTAT_FLOAT, ATTAT_INT, ATTAT_NONE, ATTAT_PAREF, ATTAT_POS, ATTAT_STRING, ATTAT_VAR, Dynamic_Attribute::call, CDA_EARGS, CDA_OK, _DCR::charres, check_arg, cl_errno, CL_MAX_LINE_LENGTH, cl_strdup(), _Attribute::dyn, _DCR::floatres, _DCR::intres, _DynArg::next, Dynamic_Attribute::res_type, _DynArg::type, _DCR::type, and _DCR::value.

int cl_dynamic_numargs ( Attribute attribute)

Count the number of arguments on a dynamic attribute's argument list.

Parameters
attributepointer to the Attribute object to analyse; it must be a dynamic attribute.
Returns
integer specifying the number of arguments; a negative integer is returned if for any argument on dyn.arglist, the type is equal to ATTAT_VAR

References Dynamic_Attribute::arglist, ATT_DYN, ATTAT_VAR, CDA_OK, check_arg, cl_errno, _Attribute::dyn, _DynArg::next, and _DynArg::type.

void cl_error ( char *  message)
char* cl_error_string ( int  error_num)

Gets a string describing the error identified by an error number.

The string is a pointer to an internal constant string, i.e., do not modify or free it!

Parameters
error_numError number integer (a CDA_* constant as defined in cl.h)

References CDA_EACCESS, CDA_EALIGN, CDA_EARGS, CDA_EATTTYPE, CDA_EBADREGEX, CDA_EBUFFER, CDA_EFSETINV, CDA_EIDORNG, CDA_EIDXORNG, CDA_EINTERNAL, CDA_ENODATA, CDA_ENOMEM, CDA_ENOSTRING, CDA_ENULLATT, CDA_ENYI, CDA_EOTHER, CDA_EPATTERN, CDA_EPOSIX, CDA_EPOSORNG, CDA_EREMOTE, CDA_ESTRUC, and CDA_OK.

Referenced by cl_error(), open_input_stream(), and open_stream().

int cl_has_extended_alignment ( Attribute attribute)

Checks whether an attribute's XALIGN component exists, that is, whether or not it has extended alignment.

Parameters
attributeAn align-attribute.
Returns
Boolean.

References ATT_ALIGN, check_arg, cl_errno, component_state(), ComponentLoaded, ComponentUnloaded, and CompXAlignData.

Referenced by cl_alg2cpos(), cl_cpos2alg(), cl_max_alg(), and describecorpus_show_statistics().

char* cl_id2all ( Attribute attribute,
int  index,
int *  freq,
int *  slen 
)

Gets the string of the item with the specified ID on the given p-attribute.

As well as returning the string, other information about the item is inserted into locations specified by other parameters.

Parameters
attributeThe P-attribute to look on.
indexThe ID of the item to look at.
freqWill be set to the frequency of the item.
slenWill be set to the string-length of the item.
Returns
The string of the item with that ID on this attribute, OR NULL if there is an error.

References ATT_POS, CDA_OK, check_arg, cl_errno, get_id_frequency, get_id_string_len, and get_string_of_id.

Referenced by lexdecode_print_item_info().

int* cl_id2cpos_oldstyle ( Attribute attribute,
int  id,
int *  freq,
int *  restrictor_list,
int  restrictor_list_size 
)

Gets all the corpus positions where the specified item is found on the given P-attribute.

The restrictor list is a set of ranges in which instances of the item MUST occur to be collected by this function. If no restrictor list is specified (i.e. restrictor_list is NULL), then ALL corpus positions where the item occurs are returned.

This restrictor list has the form of a list of ranges {start,end} of size restrictor_list_size, that is, the number of ints in this area is 2 * restrictor_list_size!!!

This function is "oldstyle" because in the "newstyle" function, there is no restrictor list. (And in fact, the newstyle function is implemented as a macro to this one with the last two arguments NULL and 0.)

See also
Parameters
attributeThe P-attribute to look on.
idThe id of the item to look for.
freqThe frequency of the specified item is written here. This will be 0 in the case of errors.
restrictor_listA list of pairs of integers specifying ranges {start,end} in the corpus
restrictor_list_sizeThe number of PAIRS of ints in the restrictor list.
Returns
Pointer to the list of corpus positions; or NULL in case of error.

References ATT_POS, BSclose(), BSopen(), BSseek(), CDA_EIDORNG, CDA_ENODATA, CDA_OK, check_arg, cl_errno, cl_free, cl_id2freq(), cl_index_compressed(), cl_malloc(), cl_max_cpos(), cl_max_id(), cl_realloc(), CompCompRF, CompCompRFX, CompRevCorpus, CompRevCorpusIdx, compute_ba(), TMblob::data, TComponent::data, ensure_component(), and read_golomb_code_bs().

int cl_id2freq ( Attribute attribute,
int  id 
)

Gets the frequency of an item on this attribute.

Parameters
attributeThe P-attribute to look on
idIdentifier of an item on this attribute.
Returns
The frequency count of the item specified by id, or an error code (if less than 0)

References ATT_POS, CDA_EIDXORNG, CDA_ENODATA, CDA_OK, check_arg, cl_errno, CompCorpusFreqs, TMblob::data, TComponent::data, and ensure_component().

Referenced by cl_id2cpos_oldstyle(), cl_idlist2freq(), compress_reversed_index(), compute_code_lengths(), creat_rev_corpus(), create_feature_maps(), decompress_check_reversed_index(), do_cqi_cl_id2freq(), and validate_revcorp().

int cl_id2sort ( Attribute attribute,
int  id 
)

Gets the position in the Attribute's sorted wordlist index of the item with the specified ID code.

This function is NOT YET IMPLEMENTED.

See also
get_id_from_sortidx
Parameters
attributeThe (positional) Attribute whose index is to be searched
idIdentifier of an item on this attribute.
Returns
The offset of that item in the sorted wordlist index.

References ATT_POS, CDA_ENODATA, CDA_ENYI, CDA_OK, check_arg, cl_errno, CompLexiconSrt, and ensure_component().

char* cl_id2str ( Attribute attribute,
int  id 
)

Gets the string that corresponds to the specified item on the given P-attribute.

Parameters
attributeThe Attribute to look the item up on
idIdentifier of an item on this attribute.
Returns
The string (pointer to actual data within the attribute, DO NOT FREE!), or NULL if there is an error.

References ATT_POS, CDA_EIDORNG, CDA_ENODATA, CDA_OK, check_arg, cl_errno, CompLexicon, CompLexiconIdx, TMblob::data, TComponent::data, ensure_component(), and TComponent::size.

Referenced by cl_cpos2str(), cl_id2strlen(), create_feature_maps(), do_cqi_cl_id2str(), Group_id2str(), i2compare(), main(), and scancorpus_add_key().

int cl_id2strlen ( Attribute attribute,
int  id 
)

Calculates the length of the string that corresponds to the specified item on the given P-attribute.

Parameters
attributeThe (positional) Attribute to look up the item on
idIdentifier of an item on this attribute.
Returns
The length of the string, or a CDA_ error code

References ATT_POS, CDA_EIDORNG, CDA_ENODATA, CDA_EOTHER, CDA_OK, check_arg, cl_errno, cl_id2str(), CompLexiconIdx, TMblob::data, TComponent::data, ensure_component(), and TComponent::size.

Referenced by create_feature_maps().

int* cl_idlist2cpos_oldstyle ( Attribute attribute,
int *  word_ids,
int  number_of_words,
int  sort,
int *  size_of_table,
int *  restrictor_list,
int  restrictor_list_size 
)

Gets a list of corpus positions matching a list of ids.

This function returns an (ordered) list of all corpus positions which match one of the ids given in the list of ids. The table is allocated with malloc, so free it when you don't need it any more.

The list itself is returned; its size is placed in size_of_table. This size is, of course, the same as the cumulative id frequency of the ids (because each corpus position matching one of the ids is added into the list).

BEWARE: when the id list is rather big or there are highly-frequent ids in the id list (for example, after a call to collect_matching_ids with the pattern ".*") this will give a copy of the corpus – for which you probably don't have enough memory!!! It is therefore a good idea to call cumulative_id_frequency before and to introduce some kind of bias.

This function is DEPRECATED in favour of cl_idlist2cpos().

This function is "oldstyle" because it has the "restrictor list" parameters, which are not available through the "newstyle" function cl_idlist2cpos() (which is currently just a macro to this).

A note on the last two parameters, which are currently unused: restrictor_list is a list of integer pairs [a,b], meaning that the returned list only contains positions which fall within at least one of these intervals. The list must be sorted by the start positions (a), and secondarily by the end positions (b). restrictor_list_size is the number of integers in this list, NOT THE NUMBER OF PAIRS. WARNING: CURRENTLY UNIMPLEMENTED. {NB – this description of restrictor_list_size DOESN'T MATCH the one for get_positions(), which this function calls...}

REMEMBER: this monster returns a list of corpus indices, not a list of ids.

See also
collect_matching_ids
get_positions
cl_idlist2cpos
Parameters
attributeThe P-attribute we are looking in
word_idsA list of item ids (i.e. id codes for items on this attribute).
number_of_wordsThe length of this list.
sortboolean: return sorted list?
size_of_tableThe size of the allocated table will be placed here.
restrictor_listSee function description.
restrictor_list_sizeSee function description.
Returns
Pointer to the list of corpus positions.

References ATT_POS, CDA_EIDORNG, CDA_ENODATA, CDA_OK, check_arg, cl_errno, cl_idlist2freq(), cl_malloc(), CompLexiconIdx, ensure_component(), get_positions, intcompare(), and TComponent::size.

Referenced by get_matched_corpus_positions().

int cl_idlist2freq ( Attribute attribute,
int *  word_ids,
int  number_of_words 
)

Calculates the total frequency of all items on a list of item IDs.

This function returns the sum of the word frequencies of words, which is an array of word_ids with length number_of_words.

The result is therefore the number of corpus positions which match one of the words.

Parameters
attributeP-attribute on which these items are found.
word_idsAn array of item IDs.
number_of_wordsLength of the word_ids array.
Returns
Sum of all the frequencies; less than 0 for an error.

References ATT_POS, CDA_ENODATA, CDA_OK, check_arg, cl_errno, and cl_id2freq().

Referenced by cl_idlist2cpos_oldstyle(), and OptimizeStringConstraint().

int cl_index_compressed ( Attribute attribute)

Check whether the reverse-corpus index (inverted file) of the given P-attribute is compressed.

See comments in body of function for what counts as "compressed".

Returns
Boolean.

References ATT_POS, check_arg, cl_errno, CompCompRF, CompCompRFX, component_state(), ComponentLoaded, ComponentUnloaded, CompRevCorpus, and CompRevCorpusIdx.

Referenced by cl_id2cpos_oldstyle(), and cl_new_stream().

int cl_max_alg ( Attribute attribute)

Gets the number of alignments on this align-attribute.

This is equal to the maximum alignment on this attribute.

Parameters
attributeAn align-attribute.
Returns
The number of alignments on this attribute.

References CDA_ENODATA, CDA_OK, cl_errno, cl_has_extended_alignment(), CompAlignData, CompXAlignData, ensure_component(), and TComponent::size.

Referenced by describecorpus_show_statistics(), do_cqi_cl_attribute_size(), and main().

int cl_max_cpos ( Attribute attribute)

Gets the maximum position on this P-attribute (ie the size of the attribute).

The result of this function is equal to the number of tokens in the attribute.

If the attribute's item sequence is compressed, this is read from the attribute's Huffman code descriptor block.

Otherwise, it is read from the size member of the Attribute's CompCorpus component.

Returns
The maximum corpus position, or an error code (if less than 0)

References ATT_POS, CDA_ENODATA, CDA_OK, check_arg, cl_errno, cl_sequence_compressed(), CompCorpus, CompHuffCodes, corpus, ensure_component(), POS_Attribute::hc, _huffman_code_descriptor::length, _Attribute::pos, and TComponent::size.

Referenced by cl_id2cpos_oldstyle(), compose_kwic_line(), compress_reversed_index(), compute_code_lengths(), creat_rev_corpus(), decode_check_huff(), decompress_check_reversed_index(), describecorpus_show_basic_info(), describecorpus_show_statistics(), do_cqi_cl_attribute_size(), get_matched_corpus_positions(), lexdecode_show(), main(), OptimizeStringConstraint(), Setop(), SortSubcorpus(), and validate_revcorp().

int cl_max_id ( Attribute attribute)

Gets the maximum id on this P-attribute (ie the range of the attribute's ID codes).

The result of this function is equal to the number of types in this attribute.

See also
get_attribute_size
Returns
The maximum Id, or an error code (if less than 0)

References ATT_POS, CDA_ENODATA, CDA_OK, check_arg, cl_errno, CompLexiconIdx, ensure_component(), and TComponent::size.

Referenced by cl_id2cpos_oldstyle(), compress_reversed_index(), compute_code_lengths(), creat_rev_corpus(), create_feature_maps(), decompress_check_reversed_index(), describecorpus_show_statistics(), do_cqi_cl_lexicon_size(), get_matched_corpus_positions(), lexdecode_show(), main(), and validate_revcorp().

int cl_max_struc ( Attribute a)

Gets the maximum for this S-attribute (ie the size of the S-attribute).

The result of this function is equal to the number of instances of this s-attribute in the corpus.

This function works as a wrapper round cl_max_struc_oldstyle that normalises it to standard return value behaviour.

Parameters
aThe s-attribute to evaluate.

Returns
The number of instances of this s-attribute in the corpus, or an error code (if less than 0)

References cl_errno, and cl_max_struc_oldstyle().

Referenced by compose_kwic_line(), describecorpus_show_statistics(), do_cqi_cl_attribute_size(), main(), matchfirstpattern(), and scancorpus_add_key().

int cl_max_struc_oldstyle ( Attribute attribute,
int *  nr_strucs 
)

Gets the number of instances of an s-attribute in the corpus.

Deprecated: use cl_max_struc instead.

See also
cl_max_struc.
Parameters
attributeThe s-attribute to count.
nr_strucsThe number of instances is put here.
Returns
boolean: true for all OK, false for problem.

References ATT_STRUC, CDA_ENODATA, CDA_OK, check_arg, cl_errno, CompStrucData, ensure_component(), and TComponent::size.

Referenced by cl_max_struc().

PositionStream cl_new_stream ( Attribute attribute,
int  id 
)
int cl_read_stream ( PositionStream  ps,
int *  buffer,
int  buffer_size 
)

Reads corpus positions from a position stream to a buffer.

Parameters
psThe position stream to read.
bufferLocation to put the resulting item positions.
buffer_sizeMaximum number of item positions to read. (Fewer will be read if fewer are available).
Returns
The number of item positions that have been read. This may be less than buffer_size (and will be 0 if there are no instances of this item left).

References _position_stream_rec_::b, _position_stream_rec_::base, _position_stream_rec_::bs, _position_stream_rec_::id_freq, _position_stream_rec_::is_compressed, _position_stream_rec_::last_pos, _position_stream_rec_::nr_items, and read_golomb_code_bs().

Referenced by compress_reversed_index(), and decompress_check_reversed_index().

int* cl_regex2id ( Attribute attribute,
char *  pattern,
int  flags,
int *  number_of_matches 
)

Gets a list of the ids of those items on a given Attribute that match a particular regular-expression pattern.

The pattern is interpreted internally with the CL regex engine, q.v.

The function returns a pointer to a sequence of ints of size number_of_matches. The list is allocated with malloc(), so do a cl_free() when you don't need it any more.

See also
cl_new_regex
Parameters
attributeThe p-attribute to look on.
patternString containing the pattern against which to match each item on the attribute. Note: this pattern is a regular expression, but it is passed as a string, not a CL_Regex object. The CL_Regex object is created internally.
flagsFlags for the regular expression system via cl_new_regex.
number_of_matchesThis is set to the number of item ids found, i.e. the size of the returned buffer.
Returns
A pointer to the list of item ids.

References ATT_POS, CDA_EBADREGEX, CDA_ENODATA, CDA_OK, check_arg, cl_calloc(), cl_debug, cl_delete_regex(), cl_errno, cl_free, cl_malloc(), cl_new_regex(), cl_regex_error, cl_regex_match(), cl_regex_optimised(), cl_regopt_count_get(), cl_regopt_count_reset(), CompLexicon, CompLexiconIdx, TMblob::data, TComponent::data, ensure_component(), _Attribute::pos, TComponent::size, and word.

Referenced by do_cqi_cl_regex2id(), get_matched_corpus_positions(), lexdecode_show(), and scancorpus_add_key().

int cl_sequence_compressed ( Attribute *  attribute)

Checks whether the item sequence of the given P-attribute is compressed.

See comments in body of function for what counts as "compressed".

Returns
Boolean.

References ATT_POS, check_arg, cl_errno, CompCorpus, CompHuffCodes, CompHuffSeq, CompHuffSync, component_state(), ComponentLoaded, ComponentUnloaded, POS_Attribute::hc, and _Attribute::pos.

Referenced by cl_max_cpos(), and load_component().

int cl_sort2id ( Attribute *  attribute,
int  sort_index_position 
)

Gets the ID code of the item at the specified position in the Attribute's sorted wordlist index.

That is, given a sort-order position, the actual ID of the corresponding item is generated.

See also
get_sortidxpos_of_id
Parameters
attribute: The (positional) Attribute whose index is to be searched.
sort_index_position: The offset in the index where the ID code is to be found.
Returns
Either the integer ID or, if the returned value is less than 0, an error code.

References ATT_POS, CDA_EIDXORNG, CDA_ENODATA, CDA_OK, check_arg, cl_errno, CompLexiconSrt, TMblob::data, TComponent::data, and ensure_component().

Referenced by lexdecode_show().

int cl_str2id ( Attribute *  attribute,
char *  id_string 
)

Gets the ID code that corresponds to the specified string on the given P-attribute.

Parameters
attribute: The (positional) Attribute to look the string up on.
id_string: The string of an item on this attribute.
Returns
Either the integer ID of the item, or an error code (if less than 0). In the latter case, the error code will also be written to cl_errno.

References ATT_POS, CDA_ENODATA, CDA_ENOSTRING, CDA_EOTHER, CDA_OK, check_arg, cl_errno, cl_strcmp(), CompLexicon, CompLexiconIdx, CompLexiconSrt, TMblob::data, TComponent::data, ensure_component(), and TComponent::size.

Referenced by create_feature_maps(), do_cqi_cl_str2id(), get_corpus_positions(), lexdecode_show(), OptimizeStringConstraint(), show_features(), and VerifyVariable().

int cl_strcmp ( char *  s1,
char *  s2 
)

CL internal string comparison (uses signed char on all platforms).

Referenced by cl_set_intersection(), cl_str2id(), cl_string_list_strcmp(), compare_cells(), and scompare().

int cl_struc2cpos ( Attribute *  attribute,
int  struc_num,
int *  struc_start,
int *  struc_end 
)

Retrieves the start-and-end corpus positions of a specified structure of the given s-attribute type.

Parameters
attribute: An s-attribute.
struc_num: The instance of that s-attribute to retrieve (i.e. the struc_num'th instance of this s-attribute in the corpus).
struc_start: Location to put the starting corpus position.
struc_end: Location to put the ending corpus position.
Returns
Boolean: true if all is OK, false (0) if there was a problem.

References ATT_STRUC, CDA_EIDXORNG, CDA_ENODATA, CDA_OK, check_arg, cl_errno, CompStrucData, TMblob::data, TComponent::data, ensure_component(), and TComponent::size.

Referenced by align_print_line(), compose_kwic_line(), decode_print_token_sequence(), do_cqi_cl_cpos2lbound(), do_cqi_cl_cpos2rbound(), do_cqi_cl_struc2cpos(), eval_constraint(), feature_match(), get_position_values(), main(), and matchfirstpattern().

char* cl_struc2str ( Attribute *  attribute,
int  struc_num 
)

Gets the value that is associated with the specified instance of the given s-attribute.

Parameters
attribute: An S-attribute.
struc_num: ID of the structure whose value is wanted (i.e., the function gets the value of the struc_num'th instance of this s-attribute).
Returns
A string; or NULL in case of error. Note that this string is a pointer to the depths of the Attribute object itself, as this function does not strdup() its result – so don't free this return value!

References ATT_STRUC, CDA_EIDXORNG, CDA_EINTERNAL, CDA_ENODATA, CDA_OK, check_arg, cl_errno, cl_struc_values(), CompStrucAVS, CompStrucAVX, TMblob::data, TComponent::data, ensure_component(), s_v_comp(), and TComponent::size.

Referenced by compute_grouping(), decode_print_surrounding_s_att_values(), decode_print_token_sequence(), do_cqi_cl_struc2str(), eval_constraint(), get_position_values(), main(), matchfirstpattern(), scancorpus_add_key(), and structure_value_at_position().

int cl_struc_values ( Attribute *  attribute)

int get_alignment ( int *  data,
int  size,
int  position 
)

Gets the id number of the alignment at the specified corpus position.

For use with non-extended alignments. Requires members of the ALIGN component as arguments.

Not an exported function!

{Query: am I correct that "position" here means a cpos? – AH} {If I'm not, other docblocks in cdaccess also have errors.}

See also
cl_cpos2alg
get_extended_alignment
Parameters
data: The data member of a CompAlignData component.
size: The size member of the same CompAlignData component.
position: The corpus position to look at.
Returns
The id of the alignment at this corpus position, or -1 for error.

Referenced by cl_cpos2alg(), and cl_cpos2alg2cpos_oldstyle().

int get_extended_alignment ( int *  data,
int  size,
int  position 
)

Gets the id number of the alignment at the specified corpus position.

For use with extended alignments. Requires members of the XALIGN component as arguments.

Not an exported function!

See also
cl_cpos2alg
get_alignment
Parameters
data: The data member of a CompXAlignData component.
size: The size member of the same CompXAlignData component.
position: The corpus position to look at.
Returns
The id of the alignment at this corpus position, or -1 for error.

References CDA_EALIGN.

Referenced by cl_cpos2alg().

int* get_previous_mark ( int *  data,
int  size,
int  position 
)

Gets a pointer to the location where a structure is stored.

The structure (instance of an s-attribute) that is found is the one in which the specified corpus position occurs.

Non-exported function.

Parameters
data: The "data.data" member of an s-attribute.
size: The "size" member of the same s-attribute.
position: The corpus position to look for.
Returns
Pointer to the integers in data where the start point of the structure at this corpus position can be found. NULL if not found.

Referenced by cl_cpos2struc2cpos(), and cl_cpos2struc_oldstyle().

static int intcompare ( const void *  i,
const void *  j 
)
static

Internal function for use with qsort().

Referenced by cl_idlist2cpos_oldstyle().

int s_v_comp ( const void *  v1,
const void *  v2 
)

A non-exported function used by cl_struc2str.

Referenced by cl_struc2str().

char* structure_value_at_position ( Attribute *  struc,
int  position 
)

Gets the value associated with the instance of the given s-attribute that occurs at the specified corpus position.

Parameters
struc: The s-attribute to search through.
position: The corpus position being queried.
Returns
The value of the instance of the s-attribute, or NULL for error.

References cl_struc2str(), and get_num_of_struc.

Referenced by get_leaf_value().

Variable Documentation

int cl_errno = CDA_OK