CWB
Data Structures | Macros | Typedefs | Functions | Variables
cwb-encode.c File Reference
#include <ctype.h>
#include <math.h>
#include <stdarg.h>
#include <sys/types.h>
#include <time.h>
#include <dirent.h>
#include <errno.h>
#include <sys/stat.h>
#include "../cl/globals.h"
#include "../cl/macros.h"
#include "../cl/storage.h"
#include "../cl/lexhash.h"
#include "../cl/endian.h"
#include "../cl/attributes.h"
#include <sys/time.h>

Data Structures

struct  _Range
 The Range object represents a range of corpus positions - for instance, the range enclosed by an instance of an s-attribute. More...
 
struct  WAttr
 WAttr object: represents a P-attribute being encoded. More...
 

Macros

#define UMASK   0644
 User privileges of new files (octal format) More...
 
#define UNDEF_VALUE   "__UNDEF__"
 Default string used as value of P-attributes when a value is missing, i.e., if a tab-delimited field is empty. More...
 
#define FIELDSEPS   "\t\n"
 Default string containing the characters that can function as field separators. More...
 
#define MAXRANGES   1024
 max number of s-attributes; also max number of p-attributes (-> could change this to implementation as a linked list) More...
 
#define REP_CHECK_LEXHASH_SIZE   1000
 nr of buckets of lexhashes used for checking duplicate errors (undeclared element and attribute names in XML tags) More...
 
#define MAX_INPUT_LINE_LENGTH   65536
 Input buffer size. More...
 
#define DEFAULT_INFILE_EXTENSION   ".vrt"
 Normal extension for CWB input text files. More...
 
#define STRUC_RNG   "%s" SUBDIR_SEP_STRING "%s.rng"
 CL naming convention for S-attribute RNG files. More...
 
#define STRUC_AVX   "%s" SUBDIR_SEP_STRING "%s.avx"
 CL naming convention for S-attribute AVX (attribute-value index) files. More...
 
#define STRUC_AVS   "%s" SUBDIR_SEP_STRING "%s.avs"
 CL naming convention for S-attribute AVS (attribute values) files. More...
 
#define POS_CORPUS   "%s" SUBDIR_SEP_STRING "%s.corpus"
 CL naming convention for P-attribute Corpus files. More...
 
#define POS_LEX   "%s" SUBDIR_SEP_STRING "%s.lexicon"
 CL naming convention for P-attribute Lexicon files. More...
 
#define POS_LEXIDX   "%s" SUBDIR_SEP_STRING "%s.lexicon.idx"
 CL naming convention for P-attribute Lexicon-index files. More...
 

Typedefs

typedef struct _Range Range
 Range object: represents an S-attribute being encoded, and holds some information about the currently-being-processed instance of that S-attribute. More...
 

Functions

char * encode_strtok (register char *s, register const char *delim)
 A replacement for the strtok() function which doesn't skip empty fields. More...
 
void encode_print_time (FILE *stream, char *msg)
 Prints a message plus the current time to the specified file/stream. More...
 
void encode_usage (void)
 Prints a usage message and exits the program. More...
 
void encode_print_input_lineno (void)
 Prints the input line number (and input filename, if applicable) on STDERR, for error messages and warnings. More...
 
void encode_error (char *format,...)
 Prints an error message to STDERR, automatically adding a message on the location of the error in the corpus. More...
 
cl_string_list encode_scan_directory (char *dir)
 Get a list of files in a given directory. More...
 
int range_find (char *name)
 Gets the index (in the ranges array) of a specified S-attribute. More...
 
void range_print_registry_line (Range *rng, FILE *fd, int print_comment)
 Prints registry lines for a given s-attribute, and its children, if any, to the specified file handle. More...
 
Range * range_declare (char *name, char *directory, int store_values, int null_attribute)
 Creates a Range object to store a specified s-attribute (and, if appropriate, does the same for children-attributes). More...
 
void range_close (Range *rng, int end_pos)
 Closes a currently open instance of an S-attribute. More...
 
void range_open (Range *rng, int start_pos, char *annot)
 Opens an instance of the given S-attribute. More...
 
int wattr_find (char *name)
 Finds a p-attribute (in the global wattrs array). More...
 
int wattr_declare (char *name, char *directory, int nr_buckets)
 Sets up a new p-attribute, including opening corpus, lex and index file handles. More...
 
void wattr_close_all (void)
 Closes all three file handles for each of the wattr objects in cwb-encode's global array. More...
 
void encode_parse_options (int argc, char **argv)
 Parses program options and sets global variables. More...
 
void encode_add_wattr_line (char *str)
 Processes a token data line. More...
 
int encode_get_input_line (char *buffer, int bufsize)
 Reads one input line into the specified buffer (either from stdin, or from one or more input files). More...
 
void encode_generate_registry_file (char *registry_file)
 Writes a registry file for the corpus that has been encoded. More...
 
int main (int argc, char **argv)
 Main function for cwb-encode. More...
 

Variables

char * field_separators = FIELDSEPS
 string containing the characters that can function as field separators More...
 
char * undef_value = UNDEF_VALUE
 string used as value of P-attributes when a value is missing, i.e., if a tab-delimited field is empty More...
 
int debug = 0
 debug mode on or off? More...
 
int silent = 0
 hide messages More...
 
int verbose = 0
 show progress (this is not the opposite of silent!) More...
 
int xml_aware = 0
 substitute XML entities in p-attributes & ignore <? and <! lines More...
 
int skip_empty_lines = 0
 skip empty lines when encoding? More...
 
unsigned line = 0
 corpus position currently being encoded (i.e., cpos of next token) More...
 
int strip_blanks = 0
 strip leading and trailing blanks from input and token annotations More...
 
cl_string_list input_files = NULL
 list of input file(s) (-f option(s)) More...
 
int nr_input_files = 0
 number of input files (length of list after option processing) More...
 
int current_input_file = 0
 index of input file currently being processed More...
 
char * current_input_file_name = NULL
 filename of current input file, for error messages More...
 
FILE * input_fd = NULL
 file handle for current input file (or pipe) (text mode!) More...
 
unsigned long input_line = 0
 input line number (reset for each new file) for error messages More...
 
char * registry_file = NULL
 if set, auto-generate registry file named {registry_file}, listing declared attributes More...
 
char * directory = NULL
 corpus data directory (no longer defaults to current directory) More...
 
char * corpus_character_set = "latin1"
 character set label that is inserted into the registry file More...
 
CorpusCharset encoding_charset
 a charset object to be generated from corpus_character_set More...
 
int clean_strings = 0
 clean up input strings by replacing invalid bytes with '?' More...
 
Range ranges [MAXRANGES]
 A global array for keeping track of S-attributes being encoded. More...
 
int range_ptr = 0
 
WAttr wattrs [MAXRANGES]
 A global array for keeping track of P-attributes being encoded. More...
 
int wattr_ptr = 0
 
cl_lexhash undeclared_sattrs = NULL
 lookup hash for undeclared s-attributes and s-attributes declared with -S that have annotations (which will be ignored), so warnings are issued only once More...
 
char * progname = NULL
 name of the currently running program More...
 

Macro Definition Documentation

#define DEFAULT_INFILE_EXTENSION   ".vrt"

Normal extension for CWB input text files.

(must have exactly 4 characters; .gz/.bz2 may be added to this if the file is compressed.)

Referenced by encode_scan_directory(), and encode_usage().

#define FIELDSEPS   "\t\n"

Default string containing the characters that can function as field separators.

#define MAX_INPUT_LINE_LENGTH   65536

Input buffer size.

If we have XML tags with attributes, input lines can become pretty long (but there's basically just a single buffer)

Referenced by encode_get_input_line(), and main().

#define MAXRANGES   1024

max number of s-attributes; also max number of p-attributes (-> could change this to implementation as a linked list)

Referenced by encode_parse_options(), and range_declare().

#define POS_CORPUS   "%s" SUBDIR_SEP_STRING "%s.corpus"

CL naming convention for P-attribute Corpus files.

Referenced by wattr_declare().

#define POS_LEX   "%s" SUBDIR_SEP_STRING "%s.lexicon"

CL naming convention for P-attribute Lexicon files.

Referenced by wattr_declare().

#define POS_LEXIDX   "%s" SUBDIR_SEP_STRING "%s.lexicon.idx"

CL naming convention for P-attribute Lexicon-index files.

Referenced by wattr_declare().

#define REP_CHECK_LEXHASH_SIZE   1000

nr of buckets of lexhashes used for checking duplicate errors (undeclared element and attribute names in XML tags)

Referenced by main(), and range_declare().

#define STRUC_AVS   "%s" SUBDIR_SEP_STRING "%s.avs"

CL naming convention for S-attribute AVS (attribute values) files.

Referenced by range_declare().

#define STRUC_AVX   "%s" SUBDIR_SEP_STRING "%s.avx"

CL naming convention for S-attribute AVX (attribute-value index) files.

Referenced by range_declare().

#define STRUC_RNG   "%s" SUBDIR_SEP_STRING "%s.rng"

CL naming convention for S-attribute RNG files.

Referenced by range_declare().

#define UMASK   0644

User privileges of new files (octal format)

#define UNDEF_VALUE   "__UNDEF__"

Default string used as value of P-attributes when a value is missing, i.e., if a tab-delimited field is empty.

Typedef Documentation

typedef struct _Range Range

Range object: represents an S-attribute being encoded, and holds some information about the currently-being-processed instance of that S-attribute.

TODO should probably be called an SAttr or SAttEncoder or something.

Function Documentation

void encode_add_wattr_line ( char *  str)

Processes a token data line.

That is, it processes a line that is not an XML line.

Note that this is destructive - the argument character string will be changed in situ via an strtok-like mechanism.

Parameters
str: A string containing the line to process.

References cl_free, cl_lexhash_add(), cl_lexhash_id(), cl_make_set(), CL_MAX_LINE_LENGTH, cl_strdup(), cl_xml_entity_decode(), encode_error(), encode_print_input_lineno(), encode_strtok(), field_separators, _cl_lexhash_entry::id, NwriteInt(), WAttr::position, silent, strip_blanks, token, undef_value, wattr_ptr, and xml_aware.

Referenced by main().

void encode_error ( char *  format,
  ... 
)

Prints an error message to STDERR, automatically adding a message on the location of the error in the corpus.

Then exits the program.

Parameters
format: Format-specifying string of the error message.
...: Additional arguments, printf-style.

References current_input_file, encode_print_input_lineno(), and input_line.

Referenced by encode_add_wattr_line(), encode_generate_registry_file(), encode_get_input_line(), encode_parse_options(), encode_scan_directory(), main(), range_close(), range_declare(), wattr_close_all(), and wattr_declare().

void encode_generate_registry_file ( char *  registry_file)

Writes a registry file for the corpus that has been encoded.

Part of cwb-encode; not a library function.

Parameters
registry_file: String containing the path of the file to write.

References cl_free, cl_id_tolower(), cl_id_toupper(), cl_id_validate(), cl_malloc(), cl_path_registry_quote(), cl_strdup(), corpus_character_set, debug, directory, encode_error(), INFOFILE_DEFAULT_NAME, range_print_registry_line(), range_ptr, SUBDIR_SEPARATOR, and wattr_ptr.

Referenced by main().

int encode_get_input_line ( char *  buffer,
int  bufsize 
)

Reads one input line into the specified buffer (either from stdin, or from one or more input files).

The input files are not passed to the function, but are taken from the program global variables.

This function returns False when the last input file has been completely read, and automatically closes files.

If the line that is read is not valid according to the character set specified for the corpus, then an error will be printed and the program shut down.

Parameters
buffer: Where to load the line to. Assumed to be MAX_INPUT_LINE_LENGTH long.
bufsize: Not currently used, but should be MAX_INPUT_LINE_LENGTH in case of future use!
Returns
boolean: true for all OK, false for a problem.

References cl_close_stream(), cl_error(), cl_open_stream(), cl_strcpy(), CL_STREAM_MAGIC, CL_STREAM_READ, cl_string_canonical(), cl_string_list_get(), cl_string_validate_encoding(), cl_string_zap_controls(), clean_strings, corpus_character_set, current_input_file, current_input_file_name, encode_error(), encoding_charset, input_fd, input_line, MAX_INPUT_LINE_LENGTH, nr_input_files, REQUIRE_NFC, and utf8.

Referenced by main().

void encode_parse_options ( int  argc,
char **  argv 
)
void encode_print_input_lineno ( void  )

Prints the input line number (and input filename, if applicable) on STDERR, for error messages and warnings.

References current_input_file_name, input_line, and nr_input_files.

Referenced by encode_add_wattr_line(), encode_error(), main(), range_close(), and range_open().

void encode_print_time ( FILE *  stream,
char *  msg 
)

Prints a message plus the current time to the specified file/stream.

Parameters
stream: Stream to print to.
msg: Message to incorporate into the string that is printed.

Referenced by main().

cl_string_list encode_scan_directory ( char *  dir)

Get a list of files in a given directory.

This function only lists files with .vrt or .vrt.gz extensions, and only files identified by POSIX stat() as "regular".

(Note that .vrt is dependent on DEFAULT_INFILE_EXTENSION.)

See also
DEFAULT_INFILE_EXTENSION
Parameters
dir: Path of directory to look in.
Returns
List of paths to files (including the directory name). Returned as a cl_string_list object.

References cl_free, cl_malloc(), cl_new_string_list(), cl_string_list_append(), cl_string_list_qsort(), DEFAULT_INFILE_EXTENSION, encode_error(), and SUBDIR_SEPARATOR.

Referenced by encode_parse_options().

char* encode_strtok ( register char *  s,
register const char *  delim 
)

A replacement for the strtok() function which doesn't skip empty fields.

Parameters
s: The string to split.
delim: Delimiters to use in splitting.
Returns
The next token from the string.

References last.

Referenced by encode_add_wattr_line().

void encode_usage ( void  )

Prints a usage message and exits the program.

References DEFAULT_INFILE_EXTENSION, progname, undef_value, and VERSION.

Referenced by encode_parse_options().

int main ( int  argc,
char **  argv 
)
void range_close ( Range *  rng,
int  end_pos 
)
Range* range_declare ( char *  name,
char *  directory,
int  store_values,
int  null_attribute 
)

Creates a Range object to store a specified s-attribute (and, if appropriate, does the same for children-attributes).

The new Range object is placed in a global variable, but a pointer is also returned. So you can ignore the return value or not, as you prefer.

This is the function where the command-line formalism for defining s-attributes is defined.

See also
ranges
Parameters
name: The string from the user specifying the name of this attribute, recursion and any "attributes" of this XML element - e.g. "text:0+id"
directory: The directory where the CWB data files will go.
store_values: boolean: indicates whether this s-attribute was specified with -V (true) or -S (false) when the program was invoked.
null_attribute: boolean: this is a null attribute, i.e. an XML element to be ignored.
Returns
Pointer to the new Range object (which is a member of the global ranges array).

References _Range::annot, _Range::automatic, _Range::avs, _Range::avx, buf, cl_calloc(), cl_free, cl_lexhash_add(), cl_lexhash_id(), CL_MAX_LINE_LENGTH, cl_new_lexhash(), cl_new_string_list(), cl_strcpy(), cl_strdup(), cl_string_list_append(), _cl_lexhash_entry::data, debug, _Range::dir, _Range::el_attributes, _Range::el_atts_list, _Range::el_undeclared_attributes, _Range::element_drop_count, encode_error(), _Range::fd, _Range::feature_set, _Range::has_children, _Range::in_registry, _Range::is_open, _Range::lh, _Range::max_recursion, MAXRANGES, _Range::name, _Range::null_attribute, _Range::num, _Range::offset, _cl_lexhash_entry::_cl_lexhash_entry_data::pointer, range_ptr, _Range::recursion_children, _Range::recursion_level, REP_CHECK_LEXHASH_SIZE, _Range::start_pos, _Range::store_values, STRUC_AVS, STRUC_AVX, and STRUC_RNG.

Referenced by encode_parse_options().

int range_find ( char *  name)

Gets the index (in the ranges array) of a specified S-attribute.

See also
ranges
Parameters
name: The S-attribute to search for.
Returns
Index (as integer). -1 if the S-attribute is not found.

References range_ptr.

Referenced by encode_parse_options(), and main().

void range_open ( Range *  rng,
int  start_pos,
char *  annot 
)
void range_print_registry_line ( Range *  rng,
FILE *  fd,
int  print_comment 
)

Prints registry lines for a given s-attribute, and its children, if any, to the specified file handle.

Parameters
rng: The s-attribute in question.
fd: Stream for the registry file to write the line to.
print_comment: Boolean: if true, a comment on the original XML tags is printed.

References cl_lexhash_find(), cl_string_list_get(), cl_string_list_size(), _cl_lexhash_entry::data, _Range::el_attributes, _Range::el_atts_list, _Range::has_children, _Range::in_registry, _Range::max_recursion, _Range::name, _Range::null_attribute, _cl_lexhash_entry::_cl_lexhash_entry_data::pointer, _Range::recursion_children, and _Range::store_values.

Referenced by encode_generate_registry_file().

void wattr_close_all ( void  )

Closes all three file handles for each of the wattr objects in cwb-encode's global array.

References encode_error(), and wattr_ptr.

Referenced by main().

int wattr_declare ( char *  name,
char *  directory,
int  nr_buckets 
)

Sets up a new p-attribute, including opening corpus, lex and index file handles.

Note: corpus_fd is a binary file, lex_fd is a text file(*), and lexidx_fd is a binary file.

(*) But lexicon items are delimited by '\0' not by '\n'. Therefore '\n' is never written, so the text/binary distinction doesn't matter much.

Parameters
name: Identifier string of the p-attribute
directory: Directory in which CWB data files are to be created.
nr_buckets: Number of buckets in the lexhash of the new p-attribute (value passed to cl_new_lexhash() )
Returns
Always 1.

References CL_MAX_LINE_LENGTH, cl_new_lexhash(), cl_strdup(), DEFAULT_ATT_NAME, encode_error(), WAttr::feature_set, WAttr::lh, WAttr::name, POS_CORPUS, POS_LEX, POS_LEXIDX, WAttr::position, and wattr_ptr.

Referenced by encode_parse_options().

int wattr_find ( char *  name)

Finds a p-attribute (in the global wattrs array).

Returns the index (in wattrs) of the P-attribute with the given name.

See also
wattrs
Parameters
name: The P-attribute to search for.
Returns
Index (as integer), or -1 if not found.

References wattr_ptr.

Referenced by encode_parse_options().

Variable Documentation

int clean_strings = 0

clean up input strings by replacing invalid bytes with '?'

Referenced by encode_get_input_line(), and encode_parse_options().

char* corpus_character_set = "latin1"

character set label that is inserted into the registry file

Referenced by encode_generate_registry_file(), encode_get_input_line(), encode_parse_options(), and main().

int current_input_file = 0

index of input file currently being processed

Referenced by encode_error(), and encode_get_input_line().

char* current_input_file_name = NULL

filename of current input file, for error messages

Referenced by encode_get_input_line(), and encode_print_input_lineno().

int debug = 0
char* directory = NULL

corpus data directory (no longer defaults to current directory)

Referenced by encode_generate_registry_file(), encode_parse_options(), and sencode_parse_options().

CorpusCharset encoding_charset

a charset object to be generated from corpus_character_set

Referenced by encode_get_input_line(), and main().

char* field_separators = FIELDSEPS

string containing the characters that can function as field separators

Referenced by encode_add_wattr_line().

FILE* input_fd = NULL

file handle for current input file (or pipe) (text mode!)

Referenced by encode_get_input_line().

cl_string_list input_files = NULL

list of input file(s) (-f option(s))

unsigned long input_line = 0

input line number (reset for each new file) for error messages

Referenced by encode_error(), encode_get_input_line(), encode_print_input_lineno(), load_macro_file(), and main().

unsigned line = 0
int nr_input_files = 0

number of input files (length of list after option processing)

Referenced by encode_get_input_line(), encode_print_input_lineno(), and main().

char* progname = NULL

name of the currently running program

Referenced by encode_parse_options(), encode_usage(), and main().

int range_ptr = 0
Range ranges[MAXRANGES]

A global array for keeping track of S-attributes being encoded.

char* registry_file = NULL

if set, auto-generate registry file named {registry_file}, listing declared attributes

Referenced by encode_parse_options(), and main().

int silent = 0
int skip_empty_lines = 0

skip empty lines when encoding?

Referenced by encode_parse_options(), and main().

int strip_blanks = 0

strip leading and trailing blanks from input and token annotations

Referenced by encode_add_wattr_line(), encode_parse_options(), main(), and range_open().

cl_lexhash undeclared_sattrs = NULL

lookup hash for undeclared s-attributes and s-attributes declared with -S that have annotations (which will be ignored), so warnings are issued only once

char* undef_value = UNDEF_VALUE

string used as value of P-attributes when a value is missing, i.e., if a tab-delimited field is empty

Referenced by encode_add_wattr_line(), encode_parse_options(), and encode_usage().

int verbose = 0

show progress (this is not the opposite of silent!)

Referenced by encode_parse_options(), and main().

int wattr_ptr = 0
WAttr wattrs[MAXRANGES]

A global array for keeping track of P-attributes being encoded.

int xml_aware = 0

substitute XML entities in p-attributes & ignore <? and <! lines

Referenced by encode_add_wattr_line(), encode_parse_options(), and main().