CWB
|
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <math.h>
#include "../cl/globals.h"
#include "../cl/attributes.h"
#include "feature_maps.h"
Macros | |
#define | DEFAULT_CONFIG_LINES 4 |
number of config lines in the default config More... | |
Functions | |
void | align_usage (void) |
string containing location of the registry directory. More... | |
int | align_parse_args (int ac, char *av[], int min_args) |
Parses the program's commandline arguments. More... | |
void | align_print_line (FILE *fd, int f1, int l1, int f2, int l2, int quality) |
Prints an alignment line. More... | |
int | align_do_alignment (FMS fms, int if1, int il1, int if2, int il2, FILE *outfile) |
Actually does the alignment. More... | |
int | main (int argc, char *argv[]) |
Main function for cwb-align. More... | |
Variables | |
char * | progname |
Name of the program (from the shell) More... | |
char * | default_config [DEFAULT_CONFIG_LINES] |
Set of strings containing default configuration options. More... | |
char ** | config = default_config |
Pointer to configuration strings. More... | |
int | config_lines = DEFAULT_CONFIG_LINES |
Number of lines in the configuration strings array. More... | |
char * | corpus1_name |
name of the source corpus More... | |
char * | corpus2_name |
name of the target corpus More... | |
char * | s_name |
name of the S-attribute containing sentence boundaries More... | |
Corpus * | corpus1 |
corpus handle: source corpus More... | |
Corpus * | corpus2 |
corpus handle: target corpus More... | |
Attribute * | word1 |
word attribute handle: source More... | |
Attribute * | s1 |
sentence attribute handle: source More... | |
Attribute * | word2 |
word attribute handle: target More... | |
Attribute * | s2 |
sentence attribute handle: target More... | |
Attribute * | prealign1 = NULL |
pre-alignment attribute (source) if given More... | |
Attribute * | prealign2 = NULL |
pre-alignment attribute (target) More... | |
int | size1 |
size of source corpus in sentences More... | |
int | size2 |
size of target corpus in sentences More... | |
int | ws1 |
size of source corpus in word tokens (i.e. More... | |
int | ws2 |
size of target corpus in word tokens (i.e. More... | |
int | pre1 = 0 |
number of pre-alignment regions (source corpus) More... | |
int | pre2 = 0 |
number of pre-alignment regions (target corpus) More... | |
char | word_name [CL_MAX_FILENAME_LENGTH] = DEFAULT_ATT_NAME |
name of the word attribute (default: word) More... | |
char | outfile_name [CL_MAX_FILENAME_LENGTH] = "out.align" |
name of the output file More... | |
double | split_factor = 1.2 |
2:2 alignment split factor More... | |
int | beam_width = 50 |
best path search beam width More... | |
char | prealign_name [CL_MAX_FILENAME_LENGTH] = "" |
pre-alignment given by structural attribute More... | |
int | prealign_has_values = 0 |
boolean: if 1, regions with same ID values are pre-aligned More... | |
int | verbose = 0 |
controls printing of some extra progress info More... | |
int | quiet = 0 |
boolean: if 1, turns off progress messages about the alignment. More... | |
char * | registry_directory = NULL |
#define DEFAULT_CONFIG_LINES 4 |
number of config lines in the default config
Referenced by align_usage().
int align_do_alignment | ( | FMS | fms, |
int | if1, | ||
int | il1, | ||
int | if2, | ||
int | il2, | ||
FILE * | outfile | ||
) |
Actually does the alignment.
This function runs a best_path alignment on sentence regions [f1,l1]x[f2,l2] and writes the result to {outfile} (in .align format).
Usage:
steps += align_do_alignment(FMS, f1, l1, f2, l2, outfile);
fms | The feature map to use in best_path alignment. |
if1 | Number of s-attribute instance that is the start point (first) in source corpus. |
il1 | Number of s-attribute instance that is the end point (last) in source corpus. |
if2 | Number of s-attribute instance that is the start point (first) in target corpus. |
il2 | Number of s-attribute instance that is the end point (last) in target corpus. |
outfile | File handle to print the alignment lines to. |
References align_print_line(), beam_width, best_path(), feature_match(), split_factor, and verbose.
Referenced by main().
int align_parse_args | ( | int | ac, |
char * | av[], | ||
int | min_args | ||
) |
Parses the program's commandline arguments.
Usage: optindex = align_parse_args(argc, argv, required_arguments);
ac | The program's argc |
av | The program's argv |
min_args | Minimum number of arguments to be parsed. |
References align_usage(), beam_width, outfile_name, prealign_has_values, prealign_name, progname, quiet, registry_directory, split_factor, verbose, and word_name.
Referenced by main().
void align_print_line | ( | FILE * | fd, |
int | f1, | ||
int | l1, | ||
int | f2, | ||
int | l2, | ||
int | quality | ||
) |
Prints an alignment line.
This function writes the given information to the specified file handle as a .align format line.
A .align line looks like this: {f1} {l1} {f2} {l2} {type} [{quality}], e.g. "140 169 137 180 1:2" means that corpus (position) ranges [140,169] and [137,180] form a 1:2 alignment pair.
Usage: align_print_line(fd, f1, l1, f2, l2, quality);
fd | File handle to print to. |
f1 | First s-attribute instance in source corpus. |
l1 | Last s-attribute instance in source corpus. |
f2 | First s-attribute instance in target corpus. |
l2 | Last s-attribute instance in target corpus. |
quality | Quality of the alignment. |
References cl_struc2cpos().
Referenced by align_do_alignment().
void align_usage | ( | void | ) |
string containing location of the registry directory.
Prints a message describing how to use the program to STDERR and then exits.
References default_config, DEFAULT_CONFIG_LINES, progname, and VERSION.
Referenced by align_parse_args().
int main | ( | int | argc, |
char * | argv[] | ||
) |
Main function for cwb-align.
argc | Number of command-line arguments. |
argv | Command-line arguments. |
References align_do_alignment(), align_parse_args(), ATT_POS, ATT_STRUC, cl_close_stream(), cl_corpus_charset(), cl_cpos2struc(), cl_delete_lexhash(), cl_error(), cl_lexhash_add(), cl_lexhash_find(), cl_max_cpos(), cl_max_struc(), cl_new_attribute, cl_new_corpus(), cl_new_lexhash(), cl_open_stream(), CL_STREAM_MAGIC, CL_STREAM_WRITE, cl_struc2cpos(), cl_struc2str(), cl_struc_values(), config, config_lines, corpus1_name, corpus2_name, create_feature_maps(), _cl_lexhash_entry::data, _cl_lexhash_entry::_cl_lexhash_entry_data::integer, outfile_name, pre1, pre2, prealign_has_values, prealign_name, progname, quiet, registry_directory, s_name, size1, size2, word_name, ws1, and ws2.
int beam_width = 50 |
best path search beam width
Referenced by align_do_alignment(), align_parse_args(), BAR_write(), and best_path().
char** config = default_config |
Pointer to configuration strings.
Set initially to default_config; should be reset to the {config} part of argv[], if configuration is specified on the command line.
Referenced by main().
int config_lines = DEFAULT_CONFIG_LINES |
Number of lines in the configuration strings array.
Referenced by create_feature_maps(), and main().
Corpus* corpus1 |
corpus handle: source corpus
char* corpus1_name |
name of the source corpus
Referenced by main().
Corpus* corpus2 |
corpus handle: target corpus
char* corpus2_name |
name of the target corpus
Referenced by main().
char* default_config[DEFAULT_CONFIG_LINES] |
Set of strings containing default configuration options.
Notes on interpreting the lines (in order):
Referenced by align_usage().
char outfile_name[CL_MAX_FILENAME_LENGTH] = "out.align" |
name of the output file
Referenced by align_parse_args(), and main().
int pre1 = 0 |
number of pre-alignment regions (source corpus)
Referenced by main().
int pre2 = 0 |
number of pre-alignment regions (target corpus)
Referenced by main().
Attribute* prealign1 = NULL |
pre-alignment attribute (source) if given
Attribute* prealign2 = NULL |
pre-alignment attribute (target)
int prealign_has_values = 0 |
boolean: if 1, regions with same ID values are pre-aligned
Referenced by align_parse_args(), and main().
char prealign_name[CL_MAX_FILENAME_LENGTH] = "" |
pre-alignment given by structural attribute
Referenced by align_parse_args(), and main().
char* progname |
Name of the program (from the shell)
Referenced by align_parse_args(), align_usage(), and main().
int quiet = 0 |
boolean: if 1, turns off progress messages about the alignment.
Referenced by align_parse_args(), cqp_parse_file(), and main().
char* registry_directory = NULL |
string containing location of the registry directory.
Referenced by align_parse_args(), and main().
Attribute* s1 |
sentence attribute handle: source
Attribute* s2 |
sentence attribute handle: target
char* s_name |
name of the S-attribute containing sentence boundaries
Referenced by main().
int size1 |
size of source corpus in sentences
Referenced by main().
int size2 |
size of target corpus in sentences
Referenced by main().
double split_factor = 1.2 |
2:2 alignment split factor
Referenced by align_do_alignment(), and align_parse_args().
int verbose = 0 |
controls printing of some extra progress info
Referenced by align_do_alignment(), and align_parse_args().
Attribute* word1 |
word attribute handle: source
Referenced by create_feature_maps().
Attribute* word2 |
word attribute handle: target
Referenced by create_feature_maps().
char word_name[CL_MAX_FILENAME_LENGTH] = DEFAULT_ATT_NAME |
name of the word attribute (default: word)
Referenced by align_parse_args(), and main().
int ws1 |
int ws2 |