Wrappers for the CWB tools (cwb-makeall, cwb-huffcode, cwb-compress-rdx). Unlike the 'original' command line tools, these wrappers will always perform a specific indexing/compression step on one positional attribute, and produce all components.

cwb_makeall(corpus, p_attribute, registry = Sys.getenv("CORPUS_REGISTRY"))

cwb_huffcode(corpus, p_attribute, registry = Sys.getenv("CORPUS_REGISTRY"))

cwb_compress_rdx(corpus, p_attribute,
  registry = Sys.getenv("CORPUS_REGISTRY"))

Arguments

corpus

name of a CWB corpus (upper case)

p_attribute

name of the p-attribute

registry

path to the registry directory, defaults to the value of the environment variable CORPUS_REGISTRY

Examples

# The package includes an 'unfinished' corpus of debates in the UN General # Assembly ("UNGA"), i.e. it does not yet include the reverse index, and it is # not compressed. # # The first step in the following example is to copy the raw # corpus to a temporary place. registry <- if (!check_pkg_registry_files()) use_tmp_registry() else get_pkg_registry() home_dir <- system.file(package = "RcppCWB", "extdata", "cwb", "indexed_corpora", "unga") tmpdir <- tempdir() win <- if (Sys.info()[["sysname"]] == "Windows") TRUE else FALSE if (win) tmpdir <- normalizePath(tmpdir) tmp_regdir <- file.path(tmpdir, "registry", fsep = if (win) "\\" else "/") tmp_data_dir <- file.path(tmpdir, "indexed_corpora", fsep = if (win) "\\" else "/") tmp_unga_dir <- file.path(tmp_data_dir, "unga", fsep = if (win) "\\" else "/") if (!file.exists(tmp_regdir)) dir.create(tmp_regdir) if (!file.exists(tmp_data_dir)) dir.create(tmp_data_dir) if (!file.exists(tmp_unga_dir)){ dir.create(tmp_unga_dir) } else { file.remove(list.files(tmp_unga_dir, full.names = TRUE)) } regfile <- readLines(file.path(registry, "unga")) homedir_line <- grep("^HOME", regfile) regfile[homedir_line] <- sprintf('HOME "%s"', tmp_unga_dir) writeLines(text = regfile, con = file.path(tmp_regdir, "unga")) for (x in list.files(home_dir, full.names = TRUE)){ file.copy(from = x, to = tmp_unga_dir) } # perform cwb_makeall (equivalent to cwb-makeall command line utility) cwb_makeall(corpus = "UNGA", p_attribute = "word", registry = tmp_regdir)
#> === Makeall: processing corpus UNGA === #> Registry directory: /var/folders/r6/1k6mxnbj5077980k11xvr0q40000gn/T//Rtmpfnlipg/registry #> ATTRIBUTE word #> + creating LEXSRT ... OK #> - lexicon OK #> + creating FREQS ... OK #> - frequencies OK #> - token stream OK #> + creating REVCIDX ... OK #> + creating REVCORP ... OK #> ? validating REVCORP ... OK #> - index OK #> ========================================
#> [1] 0
# see whether it works ids_sentence_1 <- cl_cpos2id( corpus = "UNGA", p_attribute = "word", registry = tmp_regdir, cpos = 0:83 ) tokens_sentence_1 <- cl_id2str( corpus = "UNGA", p_attribute = "word", registry = tmp_regdir, id = ids_sentence_1 ) sentence <- gsub("\\s+([\\.,])", "\\1", paste(tokens_sentence_1, collapse = " ")) cwb_huffcode(corpus = "UNGA", p_attribute = "word", registry = tmp_regdir)
#> COMPRESSING TOKEN STREAM of (null).word #> Allocated heap with 15902 cells for 7951 items #> #> Minimal code length: 4 #> Maximal code length: 17 #> Compressed code len: 1163628 bits, 145453 (+1) bytes #> #> #> CL #codes MinCode SymIdx #> ---------------------------------------- #> 4 2 14 0 #> 5 5 23 2 #> 6 1 45 7 #> 7 8 82 8 #> 8 20 144 16 #> 9 38 250 36 #> 10 94 406 74 #> 11 199 613 168 #> 12 351 875 367 #> 13 625 1125 718 #> 14 873 1377 1343 #> 15 1373 1381 2216 #> 16 1162 1600 3589 #> 17 3200 0 4751 #> ---------------------------------------- #> 7951 #> - writing code descriptor block to /var/folders/r6/1k6mxnbj5077980k11xvr0q40000gn/T//Rtmpfnlipg/indexed_corpora/unga/word.hcd #> - writing compressed item sequence to /var/folders/r6/1k6mxnbj5077980k11xvr0q40000gn/T//Rtmpfnlipg/indexed_corpora/unga/word.huf #> - writing sync (every 128 tokens) to /var/folders/r6/1k6mxnbj5077980k11xvr0q40000gn/T//Rtmpfnlipg/indexed_corpora/unga/word.huf.syn #> VALIDATING UNGA.word #> - reading code descriptor block from /var/folders/r6/1k6mxnbj5077980k11xvr0q40000gn/T//Rtmpfnlipg/indexed_corpora/unga/word.hcd #> - reading compressed item sequence from /var/folders/r6/1k6mxnbj5077980k11xvr0q40000gn/T//Rtmpfnlipg/indexed_corpora/unga/word.huf #> - reading sync (mod 128) from /var/folders/r6/1k6mxnbj5077980k11xvr0q40000gn/T//Rtmpfnlipg/indexed_corpora/unga/word.huf.syn #> !! You can delete the file </var/folders/r6/1k6mxnbj5077980k11xvr0q40000gn/T//Rtmpfnlipg/indexed_corpora/unga/word.corpus> now.
#> [1] 0
cwb_compress_rdx(corpus = "UNGA", p_attribute = "word", registry = tmp_regdir)
#> COMPRESSING INDEX of UNGA.word #> - writing compressed index to /var/folders/r6/1k6mxnbj5077980k11xvr0q40000gn/T//Rtmpfnlipg/indexed_corpora/unga/word.crc #> - writing compressed index offsets to /var/folders/r6/1k6mxnbj5077980k11xvr0q40000gn/T//Rtmpfnlipg/indexed_corpora/unga/word.crx #> VALIDATING UNGA.word #> - reading compressed index from /var/folders/r6/1k6mxnbj5077980k11xvr0q40000gn/T//Rtmpfnlipg/indexed_corpora/unga/word.crc #> - reading compressed index offsets from /var/folders/r6/1k6mxnbj5077980k11xvr0q40000gn/T//Rtmpfnlipg/indexed_corpora/unga/word.crx #> !! You can delete the file </var/folders/r6/1k6mxnbj5077980k11xvr0q40000gn/T//Rtmpfnlipg/indexed_corpora/unga/word.corpus.rev> now. #> !! You can delete the file </var/folders/r6/1k6mxnbj5077980k11xvr0q40000gn/T//Rtmpfnlipg/indexed_corpora/unga/word.corpus.rdx> now.
#> [1] 0