CWB
|
#include "../cl/globals.h"
#include "../cl/corpus.h"
#include "../cl/cl.h"
#include "../cl/special-chars.h"
Data Structures | |
struct | _Hash |
A specialised hashtable for computing frequency distributions over tuples of lexicon IDs. More... | |
Macros | |
#define | MAX_N 32 |
maximum value of N (makes life a little easier) More... | |
Functions | |
void | scancorpus_usage (void) |
Prints a usage message and exits the program. More... | |
int | scancorpus_parse_options (int argc, char *argv[]) |
Parses the command-line options of the program. More... | |
int | scancorpus_word_is_regular (char *s) |
Check regularity of a token. More... | |
void | scancorpus_add_key (char *key) |
Adds a key to global variable Hash. More... | |
int | get_next_range (int *start, int *end) |
Reads the next range of corpus positions. More... | |
int | main (int argc, char *argv[]) |
Main function for cwb-scan-corpus. More... | |
Variables | |
struct _Hash | Hash |
Corpus * | C |
corpus we're working on More... | |
char * | reg_dir = NULL |
registry directory (NULL -> use default) More... | |
char * | corpname = NULL |
corpus name (command-line) More... | |
int | check_words = 0 |
if set, accept only 'regular' words in frequency counts More... | |
CL_Regex | regular_rx = NULL |
regex object for use when check_words is true. More... | |
char * | progname = NULL |
name of this program (from shell command) More... | |
char * | output_file = NULL |
output file name (-o option) More... | |
int | frequency_threshold = 0 |
frequency threshold for result table (-f option) More... | |
char * | frequency_att = NULL |
p-attribute with frequency entries for corpus rows (when abusing corpus as frequency database) More... | |
int | global_start = 0 |
start scanning at this cpos (defaults to start of corpus) More... | |
int | global_end = -1 |
end scanning at this cpos; will be set up in main() unless changed with the -e switch. More... | |
char * | ranges_file = NULL |
file with ranges to scan (pairs of corpus positions) More... | |
FILE * | ranges_fh = NULL |
corresponding filehandle More... | |
int | quiet = 0 |
if set, don't show progress information on stderr More... | |
int | n_buckets = 0 |
if set, use fixed number of buckets; otherwise, revert to cl_ngram_hash defaults More... | |
int | debug_level = 0 |
CL debug level. More... | |
#define MAX_N 32 |
maximum value of N (makes life a little easier)
Referenced by main(), and scancorpus_usage().
int get_next_range(int *start, int *end)
Reads the next range of corpus positions.
The ranges of corpus positions are taken either from global settings (-s, -e) or from a specified file (-R).
start | Where to put the start of the next range. |
end | Where to put the end of the next range. |
References CL_MAX_LINE_LENGTH, global_end, global_start, and ranges_fh.
Referenced by main().
int main(int argc, char *argv[])
Main function for cwb-scan-corpus.
argc | Number of command-line arguments. |
argv | Command-line arguments. |
References _Hash::att, ATT_POS, TCorpus::charset, check_words, cl_close_stream(), cl_cpos2id(), cl_delete_regex(), cl_error(), cl_id2str(), cl_malloc(), cl_max_cpos(), cl_max_id(), CL_MAX_LINE_LENGTH, cl_max_struc(), cl_new_attribute, cl_new_corpus(), cl_new_ngram_hash(), cl_new_regex(), cl_ngram_hash_add(), cl_ngram_hash_auto_grow(), cl_ngram_hash_iterator_next(), cl_ngram_hash_iterator_reset(), cl_ngram_hash_print_stats(), cl_ngram_hash_size(), cl_open_stream(), cl_regex_match(), cl_set_debug_level(), cl_standard_registry(), CL_STREAM_MAGIC, CL_STREAM_READ, CL_STREAM_WRITE, cl_struc2cpos(), cl_struc2str(), _Hash::constraint_ok, corpname, _Hash::current_struc, debug_level, _Hash::end_cpos, _cl_ngram_hash_entry::freq, _Hash::frequency, frequency_att, frequency_threshold, _Hash::frequency_values, get_next_range(), global_end, global_start, Hash, _Hash::id_list, _Hash::id_list_size, _Hash::is_constraint, _Hash::is_negated, _Hash::is_structural, _Hash::K, MAX_N, _Hash::max_offset, _Hash::N, n_buckets, _cl_ngram_hash_entry::ngram, _Hash::offset, output_file, progname, quiet, ranges_fh, ranges_file, reg_dir, _Hash::regex, scancorpus_add_key(), scancorpus_parse_options(), scancorpus_usage(), scancorpus_word_is_regular(), _Hash::source_base, _Hash::start_cpos, _Hash::table, utf8, _Hash::virtual_id, and word.
void scancorpus_add_key(char *key)
Adds a key to global variable Hash.
key | String specifying the key (passed by main() from a command-line argument) |
References _Hash::att, ATT_POS, ATT_STRUC, buf, check_words, cl_corpus_charset(), cl_id2str(), CL_MAX_LINE_LENGTH, cl_max_struc(), cl_new_attribute, cl_new_regex(), cl_regex2id(), cl_struc2str(), cl_struc_values(), _Hash::constraint_ok, corpname, _Hash::current_struc, _Hash::end_cpos, Hash, _Hash::id_list, _Hash::id_list_size, IGNORE_CASE, IGNORE_DIAC, _Hash::is_constraint, _Hash::is_negated, _Hash::is_structural, _Hash::K, _Hash::max_offset, _Hash::N, _Hash::offset, _Hash::regex, scancorpus_word_is_regular(), _Hash::source_base, and _Hash::start_cpos.
Referenced by main().
int scancorpus_parse_options(int argc, char *argv[])
Parses the command-line options of the program.
References check_words, debug_level, frequency_att, frequency_threshold, global_end, global_start, n_buckets, output_file, quiet, ranges_file, reg_dir, and scancorpus_usage().
Referenced by main().
void scancorpus_usage(void)
Prints a usage message and exits the program.
References MAX_N, and VERSION.
Referenced by main(), and scancorpus_parse_options().
int scancorpus_word_is_regular(char *s)
Check regularity of a token.
A token is "regular" if it contains only letters, numbers and dashes (with no dash at the start or end).
"Regularity" is used as a filter on the corpus iff the -C option is specified.
s | String containing the token to check. |
References TCorpus::charset, cl_iso_char_is_alphanumeric(), cl_regex_match(), and utf8.
Referenced by main(), and scancorpus_add_key().
Corpus* C |
corpus we're working on
Referenced by regex2dfa(), and WriteStates().
int check_words = 0 |
if set, accept only 'regular' words in frequency counts
Referenced by main(), scancorpus_add_key(), and scancorpus_parse_options().
char* corpname = NULL |
corpus name (command-line)
Referenced by main(), and scancorpus_add_key().
int debug_level = 0 |
CL debug level.
Referenced by main(), and scancorpus_parse_options().
char* frequency_att = NULL |
p-attribute with frequency entries for corpus rows (when abusing corpus as frequency database)
Referenced by main(), and scancorpus_parse_options().
int frequency_threshold = 0 |
frequency threshold for result table (-f option)
Referenced by main(), and scancorpus_parse_options().
int global_end = -1 |
end scanning at this cpos; will be set up in main() unless changed with the -e switch.
Referenced by get_next_range(), main(), and scancorpus_parse_options().
int global_start = 0 |
start scanning at this cpos (defaults to start of corpus)
Referenced by get_next_range(), main(), and scancorpus_parse_options().
struct _Hash Hash |
Referenced by main(), and scancorpus_add_key().
int n_buckets = 0 |
if set, use fixed number of buckets; otherwise, revert to cl_ngram_hash defaults
Referenced by main(), and scancorpus_parse_options().
char* output_file = NULL |
output file name (-o option)
Referenced by main(), and scancorpus_parse_options().
char* progname = NULL |
name of this program (from shell command)
Referenced by main().
int quiet = 0 |
if set, don't show progress information on stderr
Referenced by main(), and scancorpus_parse_options().
FILE* ranges_fh = NULL |
filehandle corresponding to ranges_file (the file with ranges to scan)
Referenced by get_next_range(), and main().
char* ranges_file = NULL |
file with ranges to scan (pairs of corpus positions)
Referenced by main(), and scancorpus_parse_options().
char* reg_dir = NULL |
registry directory (NULL -> use default)
Referenced by main(), and scancorpus_parse_options().
CL_Regex regular_rx = NULL |
regex object for use when check_words is true.