CWB
Macros | Functions | Variables
ranges.c File Reference
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <limits.h>
#include <string.h>
#include <ctype.h>
#include <unistd.h>
#include "../cl/globals.h"
#include "../cl/corpus.h"
#include "../cl/attributes.h"
#include "../cl/cdaccess.h"
#include "../cl/macros.h"
#include "corpmanag.h"
#include "eval.h"
#include "ranges.h"
#include "output.h"
#include "matchlist.h"
#include "options.h"

Macros

#define SORT_DEBUG   0
 
#define USE_SORT_CACHE
 Defined if a sort cache is to be used in sorting concordance lines. More...
 
#define spaceship(A, B)   ((A) > (B)) ? 1 : ((A) < (B)) ? -1 : 0
 

Functions

int delete_interval (CorpusList *cp, int nr)
 Delete a single concordance hit from a query-generated subcorpus. More...
 
int delete_intervals (CorpusList *cp, Bitfield intervals, int mode)
 Delete a whole bunch of concordance hits from a query-generated subcorpus. More...
 
int copy_intervals (CorpusList *cp, Bitfield intervals, int mode, char *subcorpname)
 Copy concordance hits from a query-generated subcorpus to a (new or existing) subcorpus. More...
 
int calculate_ranges (CorpusList *cl, int cpos, Context spc, int *left, int *right)
 
int calculate_rightboundary (CorpusList *cl, int cpos, Context spc)
 
int calculate_leftboundary (CorpusList *cl, int cpos, Context spc)
 
void rs_cp_range (Range *rng, int *target, int *keyword, int ins, CorpusList *corpus, int j)
 this is a rather specialised utility function for the UNION part of RangeSetop() (copies range + keyword/target (if defined) in corpus into temporary lists) More...
 
int _RS_compare_ranges (const void *pa, const void *pb)
 qsort() helper function for RangeSort() below More...
 
void RangeSort (CorpusList *c, int mk_sortidx)
 Make sure that ranges are sorted in 'natural' order (i.e. More...
 
int RangeSetop (CorpusList *corpus1, RangeSetOp operation, CorpusList *corpus2, Bitfield restrictor)
 Carries out one of a set of operations on corpus1. More...
 
int SortExternally (void)
 Use an external program to sort a query. More...
 
static int i2compare (const void *vidx1, const void *vidx2)
 String comparison function used in query result sorting. More...
 
static int group2compare (const void *vidx1, const void *vidx2)
 Compares two groups of equivalent matches by group sizes (descending), breaking ties through i2compare. More...
 
static int random_compare (const void *vidx1, const void *vidx2)
 Sorts hits in random order by comparing random numbers in the vector random_sort_keys[], breaking ties by start and end positions of matches (from *srt_cl) in order to ensure stable sorting. More...
 
int SortSubcorpusRandomize (CorpusList *cl, int seed)
 Sorts a query result in random order. More...
 
int SortSubcorpus (CorpusList *cl, SortClause sc, int count_mode, struct Redir *redir)
 Sort the (query) subcorpus specified by cl, or count frequencies of matching strings. More...
 
void FreeSortClause (SortClause sc)
 Frees a SortClause object. More...
 

Variables

Range_RS_range = NULL
 Variable used by _RS_compare_ranges; global so data can be passed in without going through that function's parameter list! More...
 
static CorpusListsrt_cl
 The CorpusList object representing a query to be sorted. More...
 
static Attributesrt_attribute
 The )p-)Attribute on which a query is to be sorted. More...
 
static int * srt_start
 When sorting a query, this contains start positions of intervals to be sorted. More...
 
static int * srt_end
 When sorting a query, this contains end positions of intervals to be sorted. More...
 
static FieldType srt_anchor1
 In a query sort, indicates the field type of the start of sort region. More...
 
static int srt_offset1
 In a query sort, indicates the offset of the start of sort region. More...
 
static FieldType srt_anchor2
 In a query sort, indicates the field type of the end of sort region. More...
 
static int srt_offset2
 In a query sort, indicates the offset of the end of sort region. More...
 
static int srt_flags
 Whether to use the c and/or d flags when sorting a query. More...
 
static int srt_ascending
 boolean: sort query into ascending order or not More...
 
static int srt_reverse
 boolean: sort query on reversed-character-sequence strings (and reversed sequences OF strings) or not More...
 
static int text_size
 When sorting a query - this represents the size of the corpus the query belongs to. More...
 
static int break_ties
 whether to break ties (by comparison without cd flags, and by line number in the last instance) More...
 
static unsigned int * random_sort_keys
 random keys for randomized sort order (ties are broken by cpos of matches) More...
 
static int * group_first
 first match for each group of identical (or equivalent) sort strings More...
 
static int * group_size
 number of matches for each group of identical (or equivalent) sort strings More...
 
static int * current_sortidx
 alias to newly created sortidx, so it can be accessed by the callback function More...
 
static int * sort_id_cache = NULL
 Pointer to the sort cache. More...
 

Macro Definition Documentation

#define SORT_DEBUG   0

Referenced by i2compare(), and SortExternally().

#define spaceship (   A,
 
)    ((A) > (B)) ? 1 : ((A) < (B)) ? -1 : 0

Referenced by random_compare().

#define USE_SORT_CACHE

Defined if a sort cache is to be used in sorting concordance lines.

The sort cache (caching the lexicon IDs of the first two tokens to be compared in each line) is indispensable for the internal sorting algorithm since random accesses to a compressed corpus are painfully slow; when sorting variable length matches such as German NPs on word forms, the current implementation has a hit rate of around 99%.

Referenced by SortSubcorpus().

Function Documentation

int _RS_compare_ranges ( const void *  pa,
const void *  pb 
)

qsort() helper function for RangeSort() below

References _Range::end, and _Range::start.

Referenced by RangeSort().

int calculate_leftboundary ( CorpusList cl,
int  cpos,
Context  spc 
)

References calculate_ranges(), left, and right.

Referenced by expand_dataspace(), and findcorpus().

int calculate_ranges ( CorpusList cl,
int  cpos,
Context  spc,
int *  left,
int *  right 
)
int calculate_rightboundary ( CorpusList cl,
int  cpos,
Context  spc 
)

References calculate_ranges(), left, and right.

Referenced by expand_dataspace(), findcorpus(), and simulate().

int copy_intervals ( CorpusList cp,
Bitfield  intervals,
int  mode,
char *  subcorpname 
)

Copy concordance hits from a query-generated subcorpus to a (new or existing) subcorpus.

This function is not currently in use.

Parameters
cpThe CorpusList indicating the query to copy from.
intervalsA Bitfield containing a bit for each query hit, which is true if the hit is "selected", false if not.
modeALL_LINES, SELECTED_LINES or UNSELECTED_LINES (indicating which lines to copy).
subcorpnameName for the subcorpus to which the lines are to be copied.
Returns
Boolean: true for success, false for failure.

References auto_save, cqpmessage(), delete_intervals(), dropcorpus(), duplicate_corpus(), BFBuf::elements, Error, False, findcorpus(), cl::mother_name, cl::name, RangeSetop(), RUnion, save_subcorpus(), cl::saved, SELECTED_LINES, cl::size, SUB, SYSTEM, toggle_bit(), cl::type, UNDEF, and UNSELECTED_LINES.

int delete_interval ( CorpusList cp,
int  nr 
)

Delete a single concordance hit from a query-generated subcorpus.

This function is not currently in use.

Parameters
cpThe CorpusList indicating the query to delete from.
nrThe index of the interval to delete (by setting its start and end values to -1).
Returns
Boolean: true for success, false for failure.

References cl_free, _Range::end, cl::range, RangeSetop(), RReduce, cl::size, cl::sortidx, _Range::start, SUB, and cl::type.

int delete_intervals ( CorpusList cp,
Bitfield  intervals,
int  mode 
)

Delete a whole bunch of concordance hits from a query-generated subcorpus.

Parameters
cpThe CorpusList indicating the query to delete from.
intervalsA Bitfield containing a bit for each query hit, which is true if the hit is "selected", false if not.
modeALL_LINES, SELECTED_LINES or UNSELECTED_LINES (indicating which lines to delete).
Returns
Boolean: true for success, false for failure.

References ALL_LINES, auto_save, cl_free, BFBuf::elements, _Range::end, get_bit(), cl::keywords, cl::range, RangeSetop(), RReduce, save_subcorpus(), SELECTED_LINES, cl::size, cl::sortidx, _Range::start, SUB, cl::targets, TEMP, touch_corpus(), cl::type, and UNSELECTED_LINES.

Referenced by copy_intervals(), do_delete_lines(), do_delete_lines_num(), do_reduce(), and do_StandardQuery().

void FreeSortClause ( SortClause  sc)

Frees a SortClause object.

References _sort_clause::attribute_name, and cl_free.

static int group2compare ( const void *  vidx1,
const void *  vidx2 
)
static

Compares two groups of equivalent matches by group sizes (descending), breaking ties through i2compare.

References current_sortidx, EvaluationIsRunning, group_first, group_size, i2compare(), s1, and s2.

Referenced by SortSubcorpus().

static int i2compare ( const void *  vidx1,
const void *  vidx2 
)
static

String comparison function used in query result sorting.

This is NOT unicode compatible - it uses a maptable, which is baaaaad, mmkay.

This function can be deleted once it has been checked that all calls to it have been replaced with cl_string_qsort_compare

static int srt_strcmp(unsigned char *s1, unsigned char *s2, unsigned char *maptable, int reverse) { int l1, l2, minl, i, c1, c2, step; unsigned char *p1, *p2;

step = (reverse) ? -1 : +1; l1 = strlen((char *) s1); l2 = strlen((char *) s2);

p1 = (reverse) ? s1 + l1 - 1 : s1; p2 = (reverse) ? s2 + l2 - 1 : s2;

minl = MIN(l1, l2);

for (i = 1; i <= minl; i++) { if (maptable) { c1 = maptable[*p1]; c2 = maptable[*p2]; } else { c1 = *p1; c2 = *p2; } p1 += step; p2 += step;

if (c1 < c2) return -1; else if (c1 > c2) return 1; }

if (l1 < l2) return -1; else if (l1 > l2) return 1; else return 0; } Compare two matches according to current sort settings in static variables (qsort callback used in query result sorting).

This is the primary query-hit-comparison function. It wraps cl_string_qsort_compare for string comparison, but does much more as well, because we are not just comparing individual strings but rather, potentially, whole bundles of strings from various different positions.

Parameters
vidx1Pointer to the integer index of the first of the intervals to be compared (ie an index into an array of start/end positions).
vidx2Pointer to the integer index of the second of the intervals to be compared.
Returns
Usual returns for qsort callbacks.

References break_ties, TCorpus::charset, cl_cpos2id(), cl_id2str(), cl_string_qsort_compare(), cl::corpus, EvaluationIsRunning, MIN, s1, s2, SORT_DEBUG, sort_id_cache, srt_ascending, srt_end, srt_flags, srt_reverse, and srt_start.

Referenced by group2compare(), and SortSubcorpus().

static int random_compare ( const void *  vidx1,
const void *  vidx2 
)
static

Sorts hits in random order by comparing random numbers in the vector random_sort_keys[], breaking ties by start and end positions of matches (from *srt_cl) in order to ensure stable sorting.

This is another qsort callback function.

References _Range::end, random_sort_keys, cl::range, spaceship, and _Range::start.

Referenced by SortSubcorpusRandomize().

int RangeSetop ( CorpusList corpus1,
RangeSetOp  operation,
CorpusList corpus2,
Bitfield  restrictor 
)

Carries out one of a set of operations on corpus1.

The operations that can be carried out are as follows:

RUnion - copy intervals from corpus2 to corpus1 (no duplicates); RIntersection - remove from corpus1 any intervals that are not also in corpus2; RDiff RMaximalMatches - remove spurious matches according to "longest" strategy; RMinimalMatches - remove spurious matches according to "shortest" strategy; RLeftMaximalMatches - remove spurious matches according to "standard" strategy; RNonOverlapping RUniq - remove duplicate intervals from corpus1; RReduce - remove intervals marked for deletion (by having the start memebr set to -1).

TODO to avopid confusion with the object, a better name for this function would be do_RangeSetOp

Parameters
corpus1The corpus to be changed.
operationSpecifies which operation is to be carried out.
corpus2The corpus that is the second argument for this operation. Can be NULL if no corpus2 is required for operation.
restrictorSpecifies which intervals in corpus2 are to be taken notice of versus ignored. Can be NULL.
Returns
Boolean, true for all OK, otherwise false.

References cl_free, cl_malloc(), cl_realloc(), _Range::end, get_bit(), cl::keywords, cl::range, RangeSetop(), RDiff, RIntersection, RLeftMaximalMatches, RMaximalMatches, RMinimalMatches, RNonOverlapping, RReduce, rs_cp_range(), RUnion, RUniq, cl::size, cl::sortidx, _Range::start, cl::targets, and touch_corpus().

Referenced by copy_intervals(), delete_interval(), delete_intervals(), do_cut(), do_setop(), do_StandardQuery(), do_translate(), evaluate_subset(), expand_dataspace(), findcorpus(), prepare_Query(), RangeSetop(), and set_corpus_matchlists().

void RangeSort ( CorpusList c,
int  mk_sortidx 
)

Make sure that ranges are sorted in 'natural' order (i.e.

by start and end cpos).

This function has to be called when matching ranges are modified and may be needed when loading a query result (with "undump") that is not sorted in ascending order; with optional "mk_sortidx" flag, a sortidx corresponding to the original ordering is created.

Parameters
cThe corpus (ie subcorpus/query) whose intervals ('ranges') are to be sorted.
mk_sortidxBoolean flag: if true a sortidx is created.

References _RS_compare_ranges(), cl_free, cl_malloc(), cqpmessage(), Error, cl::keywords, cl::name, cl::range, cl::size, cl::sortidx, SUB, cl::targets, TEMP, cl::type, and Warning.

Referenced by do_translate(), do_undump(), evaluate_target(), and set_target().

void rs_cp_range ( Range rng,
int *  target,
int *  keyword,
int  ins,
CorpusList corpus,
int  j 
)

this is a rather specialised utility function for the UNION part of RangeSetop() (copies range + keyword/target (if defined) in corpus into temporary lists)

References _Range::end, cl::keywords, cl::range, _Range::start, and cl::targets.

Referenced by RangeSetop().

int SortExternally ( void  )
int SortSubcorpus ( CorpusList cl,
SortClause  sc,
int  count_mode,
struct Redir redir 
)

Sort the (query) subcorpus specified by cl, or count frequencies of matching strings.

(Note that frequency counting and query result sorting are done via the same sorting algorithm.)

If the sort was not performed successfully, the sort index is reset to the default sort order, and the function returns false.

Parameters
clSubcorpus designating the query to sort.
scA sort clause. sc = NULL resets the sort index to the default sort order (i.e. sorted by corpus position).
count_modeBoolean: run the function in count frequency mode?
redirRedir object for where the output of string-counting is to be displayed.
Returns
Boolean: true for successful sort, false for unsuccessful.

References access_corpus(), _sort_clause::anchor1, _sort_clause::anchor2, ATT_POS, _sort_clause::attribute_name, break_ties, TCorpus::charset, cl_broken_pipe, cl_cpos2id(), cl_cpos2str(), cl_free, cl_malloc(), cl_max_cpos(), cl_string_canonical(), cl_string_reverse(), close_stream(), cl::corpus, cqp, cqpmessage(), current_sortidx, DEFAULT_ATT_NAME, _Range::end, Error, EvaluationIsRunning, find_attribute, _sort_clause::flags, group2compare(), group_first, group_size, i2compare(), Info, insecure, install_signal_handler(), KeywordField, cl::keywords, MatchEndField, MatchField, cl::name, NoField, _sort_clause::offset1, _sort_clause::offset2, open_stream(), pretty_print, cl::range, cl::size, _sort_clause::sort_ascending, sort_id_cache, _sort_clause::sort_reverse, SortExternally(), cl::sortidx, srt_anchor1, srt_anchor2, srt_ascending, srt_end, srt_flags, srt_offset1, srt_offset2, srt_reverse, srt_start, _Range::start, Redir::stream, TargetField, cl::targets, text_size, token, touch_corpus(), USE_SORT_CACHE, UseExternalSorting, utf8, Warning, and which_app.

int SortSubcorpusRandomize ( CorpusList cl,
int  seed 
)

Sorts a query result in random order.

If seed > 0, a reproducible and stable ordering is generated based on the start and end corpus positions of matches (i.e. two given matches will always be sorted in the same order).

Parameters
clCorpus-list object representing the query to sort.
seedSeed for the randomiser; should ideally be a prime number (2^31 is a particularly bad choice); if it is 0, then the internal RNG's standard random order is used.

References access_corpus(), cl_free, cl_malloc(), cl_random(), cl_set_rng_state(), cqp, cqpmessage(), _Range::end, Error, EvaluationIsRunning, Info, install_signal_handler(), cl::name, random_compare(), random_sort_keys, cl::range, cl::size, cl::sortidx, _Range::start, touch_corpus(), Warning, and which_app.

Variable Documentation

Range* _RS_range = NULL

Variable used by _RS_compare_ranges; global so data can be passed in without going through that function's parameter list!

int break_ties
static

whether to break ties (by comparison without cd flags, and by line number in the last instance)

Referenced by i2compare(), and SortSubcorpus().

int* current_sortidx
static

alias to newly created sortidx, so it can be accessed by the callback function

Referenced by group2compare(), and SortSubcorpus().

int* group_first
static

first match for each group of identical (or equivalent) sort strings

Referenced by group2compare(), and SortSubcorpus().

int* group_size
static

number of matches for each group of identical (or equivalent) sort strings

Referenced by group2compare(), and SortSubcorpus().

unsigned int* random_sort_keys
static

random keys for randomized sort order (ties are broken by cpos of matches)

Referenced by random_compare(), and SortSubcorpusRandomize().

int* sort_id_cache = NULL
static

Pointer to the sort cache.

See also
USE_SORT_CACHE

Referenced by i2compare(), and SortSubcorpus().

FieldType srt_anchor1
static

In a query sort, indicates the field type of the start of sort region.

Referenced by SortExternally(), and SortSubcorpus().

FieldType srt_anchor2
static

In a query sort, indicates the field type of the end of sort region.

Referenced by SortExternally(), and SortSubcorpus().

int srt_ascending
static

boolean: sort query into ascending order or not

Referenced by i2compare(), SortExternally(), and SortSubcorpus().

Attribute* srt_attribute
static

The )p-)Attribute on which a query is to be sorted.

CorpusList* srt_cl
static

The CorpusList object representing a query to be sorted.

int* srt_end
static

When sorting a query, this contains end positions of intervals to be sorted.

Referenced by i2compare(), and SortSubcorpus().

int srt_flags
static

Whether to use the c and/or d flags when sorting a query.

Referenced by i2compare(), SortExternally(), and SortSubcorpus().

int srt_offset1
static

In a query sort, indicates the offset of the start of sort region.

Referenced by SortExternally(), and SortSubcorpus().

int srt_offset2
static

In a query sort, indicates the offset of the end of sort region.

Referenced by SortExternally(), and SortSubcorpus().

int srt_reverse
static

boolean: sort query on reversed-character-sequence strings (and reversed sequences OF strings) or not

Referenced by i2compare(), SortExternally(), and SortSubcorpus().

int* srt_start
static

When sorting a query, this contains start positions of intervals to be sorted.

Referenced by i2compare(), and SortSubcorpus().

int text_size
static

When sorting a query - this represents the size of the corpus the query belongs to.

Referenced by compose_kwic_line(), SortExternally(), and SortSubcorpus().