CWB
Macros | Functions | Variables
corpmanag.c File Reference
#include <stddef.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <dirent.h>
#include <unistd.h>
#include <string.h>
#include <ctype.h>
#include <math.h>
#include "../cl/globals.h"
#include "../cl/macros.h"
#include "../cl/corpus.h"
#include "../cl/attributes.h"
#include "../cl/cdaccess.h"
#include "../cl/fileutils.h"
#include "corpmanag.h"
#include "cqp.h"
#include "options.h"
#include "output.h"
#include "ranges.h"

Macros

#define COLON   ':'
 
#define SLASH   '^'
 
#define subcorpload_debug   0
 
#define SUBCORPMAGIC   36193928
 Magic number for subcorpus (incl. query) file format. More...
 

Functions

static Boolean attach_subcorpus (CorpusList *cl, char *advertised_directory, char *advertised_filename)
 
CorpusList * GetSystemCorpus (char *name, char *registry)
 
void init_corpuslist (void)
 Initialises the global corpus list (sets it to NULL, no matter what its value was). More...
 
void initialize_cl (CorpusList *cl, int free_name)
 Resets to empty a CorpusList object. More...
 
void free_corpuslist (void)
 Frees the global list of currently-loaded corpora. More...
 
CorpusList * NewCL (void)
 Creates a new CorpusList object. More...
 
FieldType field_name_to_type (char *name)
 Returns a FieldType enumeration corresponding to the field name indicated by its string argument. More...
 
char * field_type_to_name (FieldType ft)
 Returns a pointer to an internal constant string that labels the FieldType argument. More...
 
int NrFieldValues (CorpusList *cl, FieldType ft)
 Counts the number of field-value items of a specified type in the given subcorpus (that is, query resultset). More...
 
int SystemCorpusSize (Corpus *corpus)
 A utility function required by ensure_corpus_size() More...
 
Boolean ensure_corpus_size (CorpusList *cl)
 This is an internal function used to ensure that a system corpus from the corpus list is accessible and that its size has been computed. More...
 
CorpusList * LoadedCorpus (char *name, char *qualifier, CorpusType type)
 Finds a loaded corpus. More...
 
CorpusList * findcorpus (char *s, CorpusType type, int try_recursive_search)
 Finds the pointer to the corpus (or subcorpus, or query result) with the given name. More...
 
void dropcorpus (CorpusList *cl)
 Remove a corpus from the global list of corpora. More...
 
CorpusList * duplicate_corpus (CorpusList *cl, char *new_name, Boolean force_overwrite)
 Duplicate a corpus via its CorpusList object. More...
 
CorpusList * make_temp_corpus (CorpusList *cl, char *new_name)
 Copy a corpus as type TEMP. More...
 
CorpusList * assign_temp_to_sub (CorpusList *tmp, char *subname)
 Convert a temporary corpus to a real subcorpus. More...
 
void drop_temp_corpora (void)
 Delete temporary corpora. More...
 
static char * changecase_string (const char *str, enum case_mode mode)
 Creates a copy of the string with the given mode (LOWER/UPPER) enforced on it. More...
 
static char * changecase_string_no_copy (char *str, enum case_mode mode)
 Like changecase_string(), but modifies the string in situ. More...
 
static char * get_fulllocalpath (CorpusList *cl, int qualify)
 
static Boolean accessible (char *dir, char *file)
 Tests whether a file is accessible. More...
 
int check_stamp (char *directory, char *fname)
 
void load_corpusnames (enum corpus_type ct)
 
void check_available_corpora (enum corpus_type ct)
 
CorpusList * ensure_syscorpus (char *registry, char *name)
 
Boolean save_subcorpus (CorpusList *cl, char *fname)
 
void save_unsaved_subcorpora ()
 
CorpusList * FirstCorpusFromList ()
 Gets the CorpusList pointer for the first corpus on the currently-loaded list. More...
 
CorpusList * NextCorpusFromList (CorpusList *cl)
 Gets the CorpusList pointer for the next corpus on the currently-loaded list. More...
 
Boolean access_corpus (CorpusList *cl)
 Assesses whether a specified corpus can be accessed. More...
 
CorpusList * search_corpus (char *name)
 Find the CorpusList object corresponding to a corpus name. More...
 
Boolean change_corpus (char *name, Boolean silent)
 Make a corpus accessible for searching as the "current" corpus. More...
 
Boolean valid_subcorpus_id (char *corpusname)
 
Boolean valid_subcorpus_name (char *corpusname)
 Checks whether corpusname is syntactically valid as a query result name. More...
 
Boolean is_qualified (char *corpusname)
 Checks whether corpusname is fully qualified (with name of mother corpus); does not imply syntactic validity. More...
 
char * split_subcorpus_name (char *corpusname, char *mother_name)
 Splits a query result corpus-name into qualifier and local name. More...
 
int touch_corpus (CorpusList *cp)
 Touches a corpus, ie, marks it as changed. More...
 
int set_current_corpus (CorpusList *cp, int force)
 Sets the current corpus (by pointer to the corpus). More...
 
int set_current_corpus_name (char *name, int force)
 Sets the current corpus (by name). More...
 
static int show_corpora_files_sort (const void *p1, const void *p2)
 Internal function for sorting list of corpus names. More...
 
void show_corpora_files1 (enum corpus_type ct)
 Function that does the work for show_corpora_files. More...
 
void show_corpora_files (enum corpus_type ct)
 A function to print out a list of corpora currently available. More...
 

Variables

CorpusList * corpuslist
 Global list of currently-loaded corpora. More...
 

Macro Definition Documentation

#define COLON   ':'
#define SLASH   '^'

Referenced by findcorpus().

#define subcorpload_debug   0

Referenced by attach_subcorpus().

#define SUBCORPMAGIC   36193928

Magic number for subcorpus (incl. query) file format.

Referenced by attach_subcorpus(), check_stamp(), and save_subcorpus().

Function Documentation

Boolean access_corpus ( CorpusList cl)

Assesses whether a specified corpus can be accessed.

That is, it makes sure that the data for corpus in "cl" is loaded and accessible.

Parameters
cl: A CorpusList specifying the corpus to check.
Returns
A boolean - true if cl can be accessed.

References attach_subcorpus(), False, cl::loaded, cl::range, cl::saved, cl::size, SUB, SYSTEM, TEMP, True, and cl::type.

Referenced by catalog_corpus(), change_corpus(), cqi_find_corpus(), cqi_lookup_attribute(), do_cqi_corpus_attributes(), do_cqi_corpus_charset(), do_cqi_corpus_full_name(), findcorpus(), prepare_AlignmentConstraints(), prepare_Query(), red_factor(), Setop(), SortSubcorpus(), and SortSubcorpusRandomize().

static Boolean accessible ( char *  dir,
char *  file 
)
static

Tests whether a file is accessible.

A file is considered accessible iff user can read it and it is not a (sub)directory.

This test is used for registry entries.

Parameters
dir: Directory in which the file is to be found.
file: The filename to check.
Returns
Boolean: true iff file is accessible.

References cl_malloc(), False, and True.

Referenced by load_corpusnames().

CorpusList* assign_temp_to_sub ( CorpusList tmp,
char *  subname 
)

Convert a temporary corpus to a real subcorpus.

assign_temp_to_sub assigns the temporary corpus in *tmp to a "real" subcorpus with name "subname". If such a subcorpus already exists, it is overwritten. The temporary corpus is deleted afterwards. The return value is the new subcorpus (which may be equal to tmp, but not necessarily).

Parameters
tmp: Temporary corpus to convert.
subname: Name to use for new subcorpus.
Returns
Pointer to new subcorpus.

References cl::abs_fn, auto_save, cl_free, cl_strdup(), cl::corpus, dropcorpus(), False, findcorpus(), initialize_cl(), cl::keywords, cl::loaded, cl::mother_name, cl::mother_size, cl::name, cl::needs_update, cl::query_corpus, cl::query_text, cl::range, cl::registry, save_subcorpus(), cl::saved, cl::size, cl::sortidx, SUB, cl::targets, TEMP, True, cl::type, and UNDEF.

Referenced by do_undump(), and in_UnnamedCorpusCommand().

static Boolean attach_subcorpus ( CorpusList cl,
char *  advertised_directory,
char *  advertised_filename 
)
static
Boolean change_corpus ( char *  name,
Boolean  silent 
)

Make a corpus accessible for searching as the "current" corpus.

change_corpus sets the current corpus to the corpus with name "name", first searching SUB corpora, then searching SYSTEM corpora.

When a corpus is "made accessible", its name is checked for validity and availability; if all is OK, set_current_corpus is called on it.

Parameters
name: A string indicating the name of a corpus.
silent: Boolean. Ignored.
Returns
Boolean. True if the corpus was set successfully, otherwise false.

References access_corpus(), False, cl::name, search_corpus(), set_current_corpus(), and True.

static char* changecase_string ( const char *  str,
enum case_mode  mode 
)
static

Creates a copy of the string with the given mode (LOWER/UPPER) enforced on it.

References cl_id_tolower(), cl_id_toupper(), cl_strdup(), and LOWER.

Referenced by get_fulllocalpath(), and GetSystemCorpus().

static char* changecase_string_no_copy ( char *  str,
enum case_mode  mode 
)
static

Like changecase_string(), but modifies the string in situ.

References cl_id_tolower(), cl_id_toupper(), and LOWER.

Referenced by load_corpusnames().

void check_available_corpora ( enum corpus_type  ct)
int check_stamp ( char *  directory,
char *  fname 
)
void drop_temp_corpora ( void  )

Delete temporary corpora.

drop_temp_corpora clears the list of corpora of all temporary stuff.

References corpuslist, dropcorpus(), initialize_cl(), cl::next, TEMP, True, and cl::type.

Referenced by do_undump(), in_UnnamedCorpusCommand(), and load_corpusnames().

void dropcorpus ( CorpusList cl)

Remove a corpus from the global list of corpora.

See also
corpuslist
Parameters
cl: The corpus to drop.

References corpuslist, current_corpus, initialize_cl(), cl::next, set_current_corpus(), and True.

Referenced by assign_temp_to_sub(), attach_subcorpus(), copy_intervals(), do_cqi_cqp_drop_subcorpus(), drop_temp_corpora(), ensure_corpus_size(), and main().

CorpusList* duplicate_corpus ( CorpusList cl,
char *  new_name,
Boolean  force_overwrite 
)

Duplicate a corpus via its CorpusList object.

duplicate_corpus creates a copy of an existing corpus and casts its type to SUB. The new corpus is given the name "new_name". If a subcorpus of that name is already present, NULL is returned if force_overwrite is False. If force_overwrite is True, the old corpus is discarded.

Parameters
cl: The corpus to duplicate.
new_name: Name for the duplicated corpus.
force_overwrite: Boolean: whether or not to force an overwrite if the subcorpus you are attempting to create already exists.
Returns
NULL if you attempted to overwrite with force_overwrite == false. Otherwise, a CorpusList pointer to the new corpus.

References cl::abs_fn, auto_save, cl_malloc(), cl_strdup(), cl::corpus, corpuslist, cqpmessage(), False, initialize_cl(), cl::keywords, cl::loaded, LoadedCorpus(), cl::mother_name, cl::mother_size, cl::name, cl::needs_update, NewCL(), cl::next, cl::query_corpus, cl::query_text, cl::range, cl::registry, save_subcorpus(), cl::saved, cl::size, cl::sortidx, SUB, SYSTEM, cl::targets, True, cl::type, and Warning.

Referenced by copy_intervals(), findcorpus(), and in_CorpusCommand().

Boolean ensure_corpus_size ( CorpusList cl)

This is an internal function used to ensure that a system corpus from the corpus list is accessible and that its size has been computed.

In case of subcorpora, this function implements delayed loading. It is necessary because of a hack that prevents CQP from determining the sizes of all known corpora at start-up (which caused annoying delays if one or more corpora are not accessible) and from reading all subcorpora in the local corpus directory (which caused a number of delays and crashes with MP templates). ensure_corpus_size() is needed by findcorpus() and ensure_syscorpus() at the very least. It may be needed in other places to keep CQP from crashing.

Parameters
cl: The corpus whose accessibility is to be checked.
Returns
Boolean: true if access is OK.

References attach_subcorpus(), cderrno, cdperror_string, CL_MAX_FILENAME_LENGTH, cl::corpus, cqpmessage(), dropcorpus(), _Range::end, False, cl::loaded, cl::local_dir, cl::mother_name, cl::mother_size, cl::name, cl::range, SUB, SYSTEM, SystemCorpusSize(), True, cl::type, user_level, and Warning.

Referenced by ensure_syscorpus(), and findcorpus().

CorpusList* ensure_syscorpus ( char *  registry,
char *  name 
)
FieldType field_name_to_type ( char *  name)

Returns a FieldType enumeration corresponding to the field name indicated by its string argument.

References KeywordField, MatchEndField, MatchField, NoField, and TargetField.

Referenced by do_cqi_cqp_fdist_1(), do_cqi_cqp_fdist_2(), and labellookup().

char* field_type_to_name ( FieldType  ft)

Returns a pointer to an internal constant string that labels the FieldType argument.

References cqpmessage(), Error, KeywordField, MatchEndField, MatchField, NoField, and TargetField.

Referenced by do_AnchorPoint(), and prepare_do_subset().

CorpusList* findcorpus ( char *  s,
CorpusType  type,
int  try_recursive_search 
)

Finds the pointer to the corpus (or subcorpus, or query result) with the given name.

When searching for s (name of corpus) strcmp() is used; no case conversion is done.

If "type" is UNDEF, it returns the first corpus with matching name. Otherwise the returned corpus has the type "type".

Parameters
s: Name of the corpus to find (as string).
type: If this is UNDEF, all corpora are checked; if it is any other type, only corpora of that type are checked.
try_recursive_search: Boolean: whether or not to try to find corpus through implicit expansion.
Returns
Pointer to the CorpusList object for the specified corpus. NULL is returned when the corpus is not yet present.

References access_corpus(), ATT_STRUC, ctxtsp::attrib, calculate_leftboundary(), calculate_rightboundary(), CL_MAX_LINE_LENGTH, cl_strcpy(), COLON, cl::corpus, cqpmessage(), ctxtsp::direction, duplicate_corpus(), _Range::end, ensure_corpus_size(), expansion, find_attribute, left, leftright, LoadedCorpus(), cl::mother_name, cl::range, RangeSetop(), right, RUniq, cl::size, ctxtsp::size, SLASH, _Range::start, structure, SYSTEM, touch_corpus(), True, cl::type, ctxtsp::type, and Warning.

Referenced by assign_temp_to_sub(), copy_intervals(), corpus_info(), cqi_find_corpus(), cqi_lookup_attribute(), do_cqi_corpus_attributes(), do_cqi_corpus_charset(), do_cqi_corpus_full_name(), do_translate(), do_undump(), make_temp_corpus(), prepare_AlignmentConstraints(), search_corpus(), set_current_corpus_name(), and valid_subcorpus_id().

CorpusList* FirstCorpusFromList ( )

Gets the CorpusList pointer for the first corpus on the currently-loaded list.

Function for iterating through the list of currently-loaded corpora.

Returns
The requested CorpusList pointer.

References corpuslist.

Referenced by do_cqi_corpus_list_corpora(), do_cqi_cqp_list_subcorpora(), and main().

void free_corpuslist ( void  )

Frees the global list of currently-loaded corpora.

This function sets the corpus list to NULL and frees all members of the list.

References corpuslist, initialize_cl(), cl::next, set_current_corpus(), and True.

static char* get_fulllocalpath ( CorpusList cl,
int  qualify 
)
static
CorpusList * GetSystemCorpus ( char *  name,
char *  registry 
)
void init_corpuslist ( void  )

Initialises the global corpus list (sets it to NULL, no matter what its value was).

References set_current_corpus().

void initialize_cl ( CorpusList cl,
int  free_name 
)

Resets to empty a CorpusList object.

This is done, largely, by freeing all its members (and setting nonfreeable members to 0 or NULL)...

Parameters
cl: The corpus list to initialise.
free_name: Boolean: the name, mother_name and mother_size members will be cleared iff free_name.

References cl::abs_fn, cl::cd, cl_free, cl::corpus, False, cl::keywords, cl::loaded, cl::mother_name, cl::mother_size, cl::name, cl::needs_update, cl::query_corpus, cl::query_text, cl::range, cl::registry, cl::saved, cl::size, cl::sortidx, cl::targets, cl::type, and UNDEF.

Referenced by assign_temp_to_sub(), attach_subcorpus(), drop_temp_corpora(), dropcorpus(), duplicate_corpus(), free_corpuslist(), and make_temp_corpus().

Boolean is_qualified ( char *  corpusname)

Checks whether corpusname is fully qualified (with name of mother corpus); does not imply syntactic validity.

References COLON.

Referenced by do_undump(), and in_CorpusCommand().

void load_corpusnames ( enum corpus_type  ct)
CorpusList* LoadedCorpus ( char *  name,
char *  qualifier,
CorpusType  type 
)

Finds a loaded corpus.

This function tries to find the corpus with name 'name' in the list of currently loaded corpora. In case of subcorpora, qualifier is the mother's name. In case of system corpora, qualifier is the registry. If qualifier is NULL, it is neglected and the first matching corpus is returned. If type is not UNDEF, only corpora of that type are returned. No side effects take place.

Parameters
name: The corpus we are looking for.
qualifier: An extra "bit" of the corpus name (see function description).
type: Which type of corpus is wanted (may be UNDEF).
Returns
Pointer to the CorpusList of the corpus that was found.

References current_corpus, cl::mother_name, cl::name, cl::next, cl::registry, STREQ, SUB, SYSTEM, TEMP, cl::type, and UNDEF.

Referenced by duplicate_corpus(), ensure_syscorpus(), findcorpus(), and load_corpusnames().

CorpusList* make_temp_corpus ( CorpusList cl,
char *  new_name 
)

Copy a corpus as type TEMP.

make_temp_corpus makes a copy of the corpus in *cl into a corpus of type "TEMP" with name "new_name". If a temporary corpus with that name already exists, it is overwritten.

Parameters
cl: The corpus to copy.
new_name: Name for the temporary copy.
Returns
NULL for error. Otherwise, a CorpusList pointer to the new corpus.

References cl::abs_fn, cl_malloc(), cl_strdup(), cl::corpus, corpuslist, False, findcorpus(), initialize_cl(), cl::keywords, cl::loaded, cl::mother_name, cl::mother_size, cl::name, cl::needs_update, NewCL(), cl::next, cl::query_corpus, cl::query_text, cl::range, cl::registry, cl::saved, cl::size, cl::sortidx, cl::targets, TEMP, True, and cl::type.

Referenced by do_setop(), do_translate(), do_undump(), in_UnnamedCorpusCommand(), prepare_do_subset(), and prepare_Query().

CorpusList* NewCL ( void  )
CorpusList* NextCorpusFromList ( CorpusList cl)

Gets the CorpusList pointer for the next corpus on the currently-loaded list.

Function for iterating through the list of currently-loaded corpora.

Parameters
cl: The current corpus on the list.
Returns
The requested CorpusList pointer.

References cl::next.

Referenced by do_cqi_corpus_list_corpora(), do_cqi_cqp_list_subcorpora(), and main().

int NrFieldValues ( CorpusList cl,
FieldType  ft 
)

Counts the number of field-value items of a specified type in the given subcorpus (that is, query resultset).

If the type is MatchField, then the N of values is simply equal to the number of query results. If it is KeywordField or TargetField, the number returned is the number of results where the field exists (which is not always all of them).

Parameters
cl: The query result to analyse.
ft: The field type to count.
Returns
The number of values of the specified field-type.

References KeywordField, cl::keywords, MatchField, NoField, cl::size, TargetField, and cl::targets.

Boolean save_subcorpus ( CorpusList cl,
char *  fname 
)
void save_unsaved_subcorpora ( )
CorpusList* search_corpus ( char *  name)

Find the CorpusList object corresponding to a corpus name.

First the SUB corpora (created by queries) are searched, then the SYSTEM corpora.

Parameters
name: String containing name of corpus to find.
Returns
Pointer to desired CorpusList.

References findcorpus(), SUB, and SYSTEM.

Referenced by change_corpus().

int set_current_corpus ( CorpusList cp,
int  force 
)

Sets the current corpus (by pointer to the corpus).

Also, executes Xkwic side effects, if necessary.

Parameters
cp: Pointer to the corpus to set as current. cp may be NULL, which is legal.
force: If true, the current corpus is set to the specified corpus, even if it is ALREADY set to that corpus.
Returns
Always 1.

References _context_description_block::attributes, CD, cl::corpus, current_corpus, DEFAULT_ATT_NAME, DestroyAttributeList(), FindInAL(), _attlist::list, _attrbuf::next, _attrbuf::status, _context_description_block::strucAttributes, and update_context_descriptor().

Referenced by after_CorpusCommand(), change_corpus(), check_available_corpora(), cqi_activate_corpus(), dropcorpus(), free_corpuslist(), init_corpuslist(), and set_current_corpus_name().

int set_current_corpus_name ( char *  name,
int  force 
)

Sets the current corpus (by name).

Also, executes Xkwic side effects, if necessary.

Parameters
name: Name of the corpus to set as current.
force: If true, the current corpus is set to the specified corpus, even if it is ALREADY set to that corpus.
Returns
True if the corpus was found and set, otherwise false if the corpus could not be found.

References findcorpus(), set_current_corpus(), and UNDEF.

Referenced by initialize_cqp().

void show_corpora_files ( enum corpus_type  ct)

A function to print out a list of corpora currently available.

"files" is a misnomer; it actually looks on the global list of currently loaded corpora, and prints their names.

Either system corpora (SYSTEM) or subcorpora (SUB) can be shown, depending on ct. If ct is UNDEF, both are shown.

For subcorpora, a bundle of other information is shown too.

Parameters
ct: Type of corpus to show (SUB, SYSTEM or UNDEF).

References show_corpora_files1(), SUB, SYSTEM, and UNDEF.

void show_corpora_files1 ( enum corpus_type  ct)
static int show_corpora_files_sort ( const void *  p1,
const void *  p2 
)
static

Internal function for sorting list of corpus names.

See also
show_corpora_files

Referenced by show_corpora_files1().

char* split_subcorpus_name ( char *  corpusname,
char *  mother_name 
)

Splits a query result corpus-name into qualifier and local name.

This function splits query result name {corpusname} into qualifier (name of mother corpus) and local name; returns pointer to local name part, or NULL if {corpusname} is not syntactically valid; if mother_name is not NULL, it must point to a buffer of suitable length (CL_MAX_LINE_LENGTH is sufficient) where the qualifier will be stored (empty string for unqualified corpus, and return value == {corpusname} in this case)

References COLON.

Referenced by do_undump(), and valid_subcorpus_name().

int SystemCorpusSize ( Corpus corpus)

A utility function required by ensure_corpus_size()

References ATT_POS, DEFAULT_ATT_NAME, find_attribute, and get_attribute_size.

Referenced by ensure_corpus_size().

int touch_corpus ( CorpusList cp)

Touches a corpus, ie, marks it as changed.

Parameters
cp: The corpus to touch. This must be of type SUB.
Returns
Boolean: true if the touch worked, otherwise false.

References cl::needs_update, cl::saved, SUB, and cl::type.

Referenced by delete_intervals(), do_cut(), evaluate_target(), findcorpus(), RangeSetop(), set_target(), SortSubcorpus(), and SortSubcorpusRandomize().

Boolean valid_subcorpus_id ( char *  corpusname)

References False, findcorpus(), SYSTEM, and True.

Boolean valid_subcorpus_name ( char *  corpusname)

Checks whether corpusname is syntactically valid as a query result name.

References False, split_subcorpus_name(), and True.

Referenced by do_undump().

Variable Documentation

CorpusList* corpuslist