CWB
|
#include <ctype.h>
#include <sys/types.h>
#include <sys/utsname.h>
#include <pwd.h>
#include <grp.h>
#include "globals.h"
#include "macros.h"
#include "attributes.h"
#include "registry.tab.h"
#include "corpus.h"
Data Structures | |
struct | _charset_spec |
structure for the global list of charset names More... | |
Typedefs | |
typedef struct _charset_spec | charset_spec |
structure for the global list of charset names More... | |
Functions | |
void | cregrestart (FILE *file) |
Function created in output from parsing registry.y: void cregerror(char *message); TODO - is the above function unused in this file? More... | |
int | cregparse () |
Parse a corpus registry file. More... | |
char * | cl_standard_registry () |
Gets a string containing the path of the default registry directory. More... | |
Corpus * | find_corpus (char *registry_dir, char *registry_name) |
Gets a pointer to the Corpus object with the specified CWB-name and registry location. More... | |
FILE * | find_corpus_registry (char *registry_dir, char *registry_name, char **real_registry_dir) |
Gets a file handle for the registry file of the corpus with the specified CWB-name and registry location. More... | |
int | check_access_conditions (Corpus *corpus, int verbose) |
Checks whether the corpus can be accessed. More... | |
Corpus * | cl_new_corpus (char *registry_dir, char *registry_name) |
Creates a Corpus object to represent a given indexed corpus, located in a given directory accessible to the program. More... | |
int | cl_delete_corpus (Corpus *corpus) |
Deletes a Corpus object from memory. More... | |
void | describe_corpus (Corpus *corpus) |
Prints a description of the corpus to STDOUT. More... | |
cl_string_list | cl_corpus_list_attributes (Corpus *corpus, int attribute_type) |
Gets a list of the named attributes that this corpus possesses. More... | |
void | FreeIDList (IDList *list) |
Deletes an IDList object, and sets the argument pointer to NULL. More... | |
int | memberIDList (char *s, IDList l) |
Checks whether the specified string occurs in the given IDList. More... | |
CorpusProperty | cl_first_corpus_property (Corpus *corpus) |
Gets the first entry in this corpus's list of properties. More... | |
CorpusProperty | cl_next_corpus_property (CorpusProperty prop) |
Gets the next corpus property on the list of properties. More... | |
char * | cl_corpus_property (Corpus *corpus, char *property) |
Gets the value of the specified corpus property. More... | |
CorpusCharset | cl_corpus_charset (Corpus *corpus) |
Retrieves the special 'charset' property from a Corpus object. More... | |
char * | cl_charset_name (CorpusCharset id) |
Gets a string containing the name of the specified CorpusCharset character set object. More... | |
CorpusCharset | cl_charset_from_name (char *name) |
Gets a CorpusCharset enumeration with the id code for the given string. More... | |
char * | cl_charset_name_canonical (char *name_to_check) |
Checks whether a string represents a valid charset, and returns a pointer to the name in canonical form (i.e. lacking any non-standard case there may be in the input string). More... | |
void | add_corpus_property (Corpus *corpus, char *property, char *value) |
Adds a property to the list of corpus properties. More... | |
Variables | |
FILE * | cregin |
File pointer for loading corpus registry. More... | |
Corpus * | cregcorpus |
Pointer to a corpus object that is used when loading from the registry. More... | |
char * | cregin_path = "" |
Full path of the registry file currently being parsed (for registry parser error messages) More... | |
char * | cregin_name = "" |
The name of the registry file currently being parsed (for registry parser error messages) More... | |
char | errmsg [CL_MAX_LINE_LENGTH] |
Buffer for an error message. More... | |
Corpus * | loaded_corpora = NULL |
Head of a linked list of loaded corpus handles (for memory manager). More... | |
static char * | regdir = NULL |
The default registry directory. More... | |
charset_spec | charset_names [] |
a list of charset names as strings paired with CorpusCharset ID values; where there are multiple possible names for one ID, the pair with the CWB-preferred name comes first in the array (and is the same as the identifier used for that charset in the CorpusCharset enumeration). More... | |
typedef struct _charset_spec charset_spec |
structure for the global list of charset names
void add_corpus_property | ( | Corpus * | corpus, |
char * | property, | ||
char * | value | ||
) |
Adds a property to the list of corpus properties.
Use this function from registry.y only!
If the property is already defined, ignore and warn. If the property is 'charset', corpus charset is set as well.
corpus | Corpus object to add property to. |
property | Name of property to add. |
value | Value of property to add. |
References TCorpus::charset, cl_charset_from_name(), cl_corpus_property(), cl_malloc(), cregin_name, cregin_path, TCorpusProperty::next, TCorpus::properties, TCorpusProperty::property, and TCorpusProperty::value.
int check_access_conditions | ( | Corpus * | corpus, |
int | verbose | ||
) |
Checks whether the corpus can be accessed.
If this corpus has access restriction in the form of a list of users, then this function checks if the current user is on that list.
Then ditto for the list of groups and current group; then ditto for the list of hosts and current host.
Note that this is currently disabled for users/groups. So, if either Corpus::userAccessList or Corpus::groupAccessList is changed from its initial (NULL) setting, this function will return false.
On the other hand, the function does work with hosts. If a list of allowed hosts is set, this function will return true iff the current host is on that list.
Finally note: if compiled for Windows, this function returns true without actually checking anything, because Windows does not support the POSIX user/group functionality on which all of this relies. Under Windows, access policies have to be managed by the cqpserver rather than by the CL simply reading from the registry.
corpus | The corpus. |
verbose | A boolean. Currently ignored. |
References TCorpus::groupAccessList, TCorpus::hostAccessList, TCorpus::id, memberIDList(), TCorpus::name, _idbuf::next, passwd, _idbuf::string, and TCorpus::userAccessList.
Referenced by cl_new_corpus().
CorpusCharset cl_charset_from_name | ( | char * | name | ) |
Gets a CorpusCharset enumeration with the id code for the given string.
References _charset_spec::name, and unknown_charset.
Referenced by add_corpus_property(), cwbci_parse_options(), main(), and sencode_parse_options().
char* cl_charset_name | ( | CorpusCharset | id | ) |
Gets a string containing the name of the specified CorpusCharset character set object.
Note that the returned string cannot be modified. TODO It should probably be a const char *.
References _charset_spec::name.
Referenced by corpus_info(), describecorpus_show_basic_info(), and do_cqi_corpus_charset().
char* cl_charset_name_canonical | ( | char * | name_to_check | ) |
Checks whether a string represents a valid charset, and returns a pointer to the name in canonical form (i.e. lacking any non-standard case there may be in the input string).
Note that the returned string cannot be modified.
name_to_check | String containing the character set name to be checked |
References _charset_spec::name.
Referenced by cwbci_parse_options(), encode_parse_options(), and sencode_parse_options().
CorpusCharset cl_corpus_charset | ( | Corpus * | corpus | ) |
Retrieves the special 'charset' property from a Corpus object.
corpus | The corpus object from which to retrieve the charset |
References TCorpus::charset.
Referenced by create_feature_maps(), decode_print_xml_declaration(), main(), scancorpus_add_key(), and sencode_parse_options().
cl_string_list cl_corpus_list_attributes | ( | Corpus * | corpus, |
int | attribute_type | ||
) |
Gets a list of the named attributes that this corpus possesses.
This function creates a list of strings containing the names of all and only those Attributes in this corpus whose type matches that specified in the second parameter.
corpus | The corpus whose attributes are to be listed. |
attribute_type | The type of attributes to be listed. This must be one of the attribute type macros: ATT_POS, ATT_STRUC etc. For all attributes, specify ATT_ALL (natuerlich). |
References _Attribute::any, TCorpus::attributes, cl_new_string_list(), cl_strdup(), and cl_string_list_append().
char* cl_corpus_property | ( | Corpus * | corpus, |
char * | property | ||
) |
Gets the value of the specified corpus property.
corpus | Pointer to the Corpus object. |
property | Name of the property to retrieve. |
References cl_first_corpus_property(), cl_next_corpus_property(), TCorpusProperty::property, and TCorpusProperty::value.
Referenced by add_corpus_property(), and corpus_info().
int cl_delete_corpus | ( | Corpus * | corpus | ) |
Deletes a Corpus object from memory.
A Corpus object keeps track of how many times it has been requested via cl_new_corpus(). When cl_delete_corpus() is called, the object is only actually deleted when there is just one outstanding request. Otherwise, the variable tracking the number of requests is decremented.
corpus | The Corpus to delete. |
References TCorpus::admin, TCorpus::attributes, cl_delete_attribute(), cl_free, FreeIDList(), TCorpus::groupAccessList, TCorpus::hostAccessList, TCorpus::id, TCorpus::info_file, loaded_corpora, TCorpus::name, TCorpus::next, TCorpus::nr_of_loads, TCorpus::path, TCorpus::registry_dir, TCorpus::registry_name, and TCorpus::userAccessList.
Referenced by cl_new_corpus(), compressrdx_cleanup(), decode_cleanup(), huffcode_usage(), and main().
CorpusProperty cl_first_corpus_property | ( | Corpus * | corpus | ) |
Gets the first entry in this corpus's list of properties.
(The corpus properties iterator / property datatype is public.)
corpus | Pointer to the Corpus object. |
References TCorpus::properties.
Referenced by cl_corpus_property(), and corpus_info().
Corpus* cl_new_corpus | ( | char * | registry_dir, |
char * | registry_name | ||
) |
Creates a Corpus object to represent a given indexed corpus, located in a given directory accessible to the program.
registry_dir | Path to the CWB registry directory from which the corpus is to be loaded. This may be NULL, in which case the default registry directory is used. |
registry_name | The CWB-name of the indexed corpus to load (in the all-lowercase form) |
References check_access_conditions(), cl_delete_corpus(), cl_free, cl_id_tolower(), cl_id_validate(), cl_standard_registry(), cl_strdup(), corpus, cregcorpus, cregin, cregin_name, cregin_path, cregparse(), cregrestart(), find_corpus(), find_corpus_registry(), TCorpus::id, loaded_corpora, TCorpus::next, TCorpus::nr_of_loads, TCorpus::registry_dir, and TCorpus::registry_name.
Referenced by main(), printAlignedStrings(), and sencode_parse_options().
CorpusProperty cl_next_corpus_property | ( | CorpusProperty | prop | ) |
Gets the next corpus property on the list of properties.
(The corpus properties iterator / property datatype is public.)
prop | The current property. |
References TCorpusProperty::next.
Referenced by cl_corpus_property(), and corpus_info().
char* cl_standard_registry | ( | ) |
Gets a string containing the path of the default registry directory.
Note this is a pointer to an internal string, and therefore must not be altered or freed.
References regdir, REGISTRY_DEFAULT_PATH, and REGISTRY_ENVVAR.
Referenced by cl_new_corpus(), find_corpus(), load_corpusnames(), and main().
int cregparse | ( | ) |
Parse a corpus registry file.
The file to be parsed is specified by global variables cregin_path and cregin_name.
Function created in output from parsing registry.y
Referenced by cl_new_corpus().
void cregrestart | ( | FILE * | file | ) |
Function created in output from parsing registry.y: void cregerror(char *message); TODO - is the above function unused in this file?
Restarts the registry file parse
Function created in output from parsing registry.y
Referenced by cl_new_corpus().
void describe_corpus | ( | Corpus * | corpus | ) |
Prints a description of the corpus to STDOUT.
TODO might be nice to have this function offer an option of XML-style output. TODO might also be nice to have it return a string, or send to a parameter stream, so that the caller can decide what to do with it: direct UI formatting does not really belong in the low-level CL.
References _Attribute::any, TCorpus::attributes, describe_attribute(), TCorpus::id, TCorpus::info_file, TCorpus::name, TCorpus::path, TCorpus::registry_dir, and TCorpus::registry_name.
Referenced by main().
Corpus* find_corpus | ( | char * | registry_dir, |
char * | registry_name | ||
) |
Gets a pointer to the Corpus object with the specified CWB-name and registry location.
(Works by searching the loaded_corpora global linked list.)
registry_dir | The registry directory. |
registry_name | The CWB name of the corpus. |
References cl_standard_registry(), TCorpus::next, PATH_SEPARATOR, TCorpus::registry_dir, TCorpus::registry_name, and STREQ.
Referenced by cl_new_corpus().
FILE* find_corpus_registry | ( | char * | registry_dir, |
char * | registry_name, | ||
char ** | real_registry_dir | ||
) |
Gets a file handle for the registry file of the corpus with the specified CWB-name and registry location.
The registry file is opened for text-mode read.
registry_dir | The registry directory. |
registry_name | The CWB name of the corpus. |
real_registry_dir | This will be set to a pointer to the "real" name of the directory derived from the registry_dir parameter. |
References cl_malloc(), CL_MAX_LINE_LENGTH, PATH_SEPARATOR, and SUBDIR_SEPARATOR.
Referenced by cl_new_corpus().
void FreeIDList | ( | IDList * | list | ) |
Deletes an IDList object, and sets the argument pointer to NULL.
list | IDList to delete. |
References cl_free, _idbuf::next, and _idbuf::string.
Referenced by cl_delete_corpus().
int memberIDList | ( | char * | s, |
IDList | l | ||
) |
Checks whether the specified string occurs in the given IDList.
s | The username, groupname, or hostname to look for. |
l | The IDList to search. |
References _idbuf::next, and _idbuf::string.
Referenced by check_access_conditions().
charset_spec charset_names[] |
a list of charset names as strings paired with CorpusCharset ID values; where there are multiple possible names for one ID, the pair with the CWB-preferred name comes first in the array (and is the same as the identifier used for that charset in the CorpusCharset enumeration).
TODO should it be const charset_spec ?
Corpus* cregcorpus |
Pointer to a corpus object that is used when loading from the registry.
(External variable, defined in the output from parsing registry.y)
Referenced by cl_new_corpus().
FILE* cregin |
File pointer for loading corpus registry.
(External variable, defined in the output from parsing registry.y)
Referenced by cl_new_corpus().
char* cregin_name = "" |
The name of the registry file currently being parsed (for registry parser error messages)
Referenced by add_corpus_property(), and cl_new_corpus().
char* cregin_path = "" |
Full path of the registry file currently being parsed (for registry parser error messages)
Referenced by add_corpus_property(), and cl_new_corpus().
char errmsg[CL_MAX_LINE_LENGTH] |
Buffer for an error message.
{Used in registry parser???}
Corpus* loaded_corpora = NULL |
Head of a linked list of loaded corpus handles (for memory manager).
Referenced by cl_delete_corpus(), and cl_new_corpus().
char* regdir = NULL |
static |
The default registry directory.
It is initialised when the function that reads it is first called.
Referenced by cl_standard_registry().