CWB
|
#include "../cl/globals.h"
#include "../cl/cl.h"
#include "../cl/corpus.h"
#include "../cl/attributes.h"
#include "../cl/storage.h"
#include "../cl/bitio.h"
#include "../cl/macros.h"
Functions | |
void | huffcode_usage (char *msg, int error_code) |
Prints a usage message and exits the program. More... | |
void | bprintf (unsigned int i, int width, FILE *stream) |
Prints, to the specified stream, a string containing a binary representation of an integer. More... | |
void | dump_heap (int *heap, int heap_size, int node, int indent) |
Dumps the specified heap of memory to the program's STDOUT. More... | |
void | print_heap (int *heap, int heap_size, char *title) |
Prints a description of the specified heap of memory to the program's STDOUT. More... | |
static int | sift (int *heap, int heap_size, int node) |
Sifts the heap into order. More... | |
int | WriteHCD (char *filename, HCD *hc) |
Writes a Huffman code descriptor to file. More... | |
int | ReadHCD (char *filename, HCD *hc) |
Reads a Huffman Code Descriptor from file. More... | |
int | compute_code_lengths (Attribute *attr, HCD *hc, char *fname) |
Compresses the token stream of a p-attribute. More... | |
void | decode_check_huff (Attribute *attr, char *fname) |
Checks a huffcoded attribute for errors by decompressing it. More... | |
int | main (int argc, char **argv) |
Main function for cwb-huffcode. More... | |
Variables | |
int | do_protocol = 0 |
Level of progress-info message output (including the compression protocol): 0 = none. More... | |
FILE * | protocol |
File handle for this program's progress-info output: note, it is always stdout. More... | |
char * | progname |
Corpus * | corpus |
char * | corpus_id = NULL |
int | debug = 0 |
void bprintf | ( | unsigned int | i, |
int | width, | ||
FILE * | stream | ||
) |
Prints, to the specified stream, a string containing a binary representation of an integer.
i | Integer to print |
width | Number of bits in the integer |
stream | Where to print to. |
Referenced by compute_code_lengths().
int compute_code_lengths | ( | Attribute * | attr, |
HCD * | hc, | ||
char * | fname | ||
) |
Compresses the token stream of a p-attribute.
Three files are created: the compressed token stream, the descriptor block, and a sync file.
attr | The attribute to compress. |
hc | Location for the resulting Huffman code descriptor block. |
fname | Base filename for the resulting files. |
References _Attribute::any, BFclose(), BFflush(), BFopen(), BFposition(), BFwriteWord(), bprintf(), CDA_OK, cderrno, cdperror, cl_calloc(), cl_cpos2id(), cl_free, cl_id2freq(), cl_malloc(), cl_max_cpos(), cl_max_id(), CL_MAX_LINE_LENGTH, CompCorpus, CompCorpusFreqs, CompHuffCodes, CompHuffSeq, CompHuffSync, CompLexicon, CompLexiconIdx, component_full_name(), corpus_id, do_protocol, ensure_component(), get_id_frequency, get_string_of_id, _huffman_code_descriptor::lcount, _huffman_code_descriptor::length, _huffman_code_descriptor::max_codelen, MAXCODELEN, _huffman_code_descriptor::min_code, _huffman_code_descriptor::min_codelen, NwriteInt(), print_heap(), protocol, TCorpus::registry_dir, TCorpus::registry_name, sift(), _huffman_code_descriptor::size, _huffman_code_descriptor::symbols, _huffman_code_descriptor::symindex, SYNCHRONIZATION, and WriteHCD().
Referenced by main().
void decode_check_huff | ( | Attribute * | attr, |
char * | fname | ||
) |
Checks a huffcoded attribute for errors by decompressing it.
This function assumes that compute_code_lengths() has been called beforehand and made sure that the uncompressed token sequence is used by CL access functions.
attr | The attribute to check. |
fname | Base filename to use for the three compressed-attribute files. Can be NULL, in which case the filenames in the attribute are used. |
References _Attribute::any, BFclose(), BFflush(), BFopen(), BFposition(), BFread(), CDA_OK, cderrno, cl_cpos2id(), cl_max_cpos(), CL_MAX_LINE_LENGTH, CompCorpus, CompHuffCodes, CompHuffSeq, CompHuffSync, component_full_name(), corpus_id, _huffman_code_descriptor::length, _huffman_code_descriptor::min_code, NreadInt(), ReadHCD(), _huffman_code_descriptor::symbols, _huffman_code_descriptor::symindex, and SYNCHRONIZATION.
Referenced by main().
void dump_heap | ( | int * | heap, |
int | heap_size, | ||
int | node, | ||
int | indent | ||
) |
Dumps the specified heap of memory to the program's STDOUT.
heap | Location of the heap to dump. |
heap_size | Number of nodes in the heap. |
node | Node at which to begin dumping. |
indent | How many tabs to indent the start of each line. |
References protocol.
Referenced by print_heap().
void huffcode_usage | ( | char * | msg, |
int | error_code | ||
) |
Prints a usage message and exits the program.
msg | A message about the error. |
error_code | Value to be returned by the program when it exits. |
References cl_delete_corpus(), progname, and VERSION.
Referenced by main().
int main | ( | int | argc, |
char ** | argv | ||
) |
Main function for cwb-huffcode.
argc | Number of command-line arguments. |
argv | Command-line arguments. |
References _Attribute::any, ATT_POS, TCorpus::attributes, central_corpus_directory, cl_delete_corpus(), cl_new_attribute, cl_new_corpus(), compute_code_lengths(), corpus_id, debug, decode_check_huff(), DEFAULT_ATT_NAME, do_protocol, huffcode_usage(), progname, protocol, and registry_directory.
void print_heap | ( | int * | heap, |
int | heap_size, | ||
char * | title | ||
) |
Prints a description of the specified heap of memory to the program's STDOUT.
heap | Location of the heap to print. |
heap_size | Number of nodes in the heap. |
title | Title of the heap to print. |
References dump_heap(), node, and protocol.
Referenced by compute_code_lengths().
int ReadHCD | ( | char * | filename, |
HCD * | hc | ||
) |
Reads a Huffman Code Descriptor from file.
filename | Path to file where descriptor is saved. |
hc | Pointer to location where the descriptor block will be loaded to. |
References cl_malloc(), _huffman_code_descriptor::lcount, _huffman_code_descriptor::length, _huffman_code_descriptor::max_codelen, MAXCODELEN, _huffman_code_descriptor::min_code, _huffman_code_descriptor::min_codelen, NreadInt(), NreadInts(), _huffman_code_descriptor::size, _huffman_code_descriptor::symbols, and _huffman_code_descriptor::symindex.
Referenced by decode_check_huff().
static int sift | ( | int * | heap, |
int | heap_size, | ||
int | node | ||
) |
Sifts the heap into order.
heap | Location of the heap to sift. |
heap_size | Number of nodes in the heap. |
node | Node at which to begin sifting. |
Referenced by compute_code_lengths().
int WriteHCD | ( | char * | filename, |
HCD * | hc | ||
) |
Writes a Huffman code descriptor to file.
filename | Path to file where descriptor is to be saved. |
hc | Pointer to the descriptor block to save. |
References _huffman_code_descriptor::lcount, _huffman_code_descriptor::length, _huffman_code_descriptor::max_codelen, MAXCODELEN, _huffman_code_descriptor::min_code, _huffman_code_descriptor::min_codelen, NwriteInt(), NwriteInts(), _huffman_code_descriptor::size, _huffman_code_descriptor::symbols, and _huffman_code_descriptor::symindex.
Referenced by compute_code_lengths().
Corpus* corpus |
char* corpus_id = NULL |
Referenced by compute_code_lengths(), decode_check_huff(), and main().
int debug = 0 |
Referenced by main().
int do_protocol = 0 |
Level of progress-info message output (including the compression protocol): 0 = none.
Referenced by compute_code_lengths(), and main().
char* progname |
Referenced by huffcode_usage(), and main().
FILE* protocol |
File handle for this program's progress-info output: note, it is always stdout.
Referenced by compute_code_lengths(), dump_heap(), main(), and print_heap().