This function recompiles a serialized tokens object when the vocabulary has been changed in a way that makes some of its types identical (for example, lowercasing when a lowercased version of the type already exists in the type table) or that introduces gaps in the integer map of the types. It also re-indexes the types attribute to account for types that may have become duplicates through a procedure such as stemming or lowercasing, or for new tokens added through compounding.
tokens_recompile(x, method = c("C++", "R"), gap = TRUE, dup = TRUE)
x | the tokens object to be recompiled |
---|---|
method | the implementation to use: `"C++"` (the default) or `"R"` |
gap | if `TRUE`, remove gaps in the integer map of the types |
dup | if `TRUE`, merge types that have become duplicates into a single type |
# lowercasing toks1 <- tokens(c(one = "a b c d A B C D", two = "A B C d")) attr(toks1, "types") <- char_tolower(attr(toks1, "types")) unclass(toks1)#> $one #> [1] 1 2 3 4 5 6 7 8 #> #> $two #> [1] 5 6 7 4 #> #> attr(,"types") #> [1] "a" "b" "c" "d" "a" "b" "c" "d" #> attr(,"padding") #> [1] FALSE #> attr(,"docvars") #> docname_ docid_ segid_ #> 1 one one 1 #> 2 two two 1 #> attr(,"meta") #> attr(,"meta")$system #> attr(,"meta")$system$`package-version` #> [1] ‘2.1.1’ #> #> attr(,"meta")$system$`r-version` #> [1] ‘4.0.2’ #> #> attr(,"meta")$system$system #> sysname machine user #> "Darwin" "x86_64" "kbenoit" #> #> attr(,"meta")$system$directory #> [1] "/Users/kbenoit/Dropbox (Personal)/GitHub/quanteda/quanteda/docs/reference" #> #> attr(,"meta")$system$created #> [1] "2020-07-27" #> #> #> attr(,"meta")$object #> attr(,"meta")$object$unit #> [1] "documents" #> #> attr(,"meta")$object$what #> [1] "word" #> #> attr(,"meta")$object$ngram #> [1] 1 #> #> attr(,"meta")$object$skip #> [1] 0 #> #> attr(,"meta")$object$concatenator #> [1] "_" #> #> attr(,"meta")$object$summary #> attr(,"meta")$object$summary$hash #> character(0) #> #> attr(,"meta")$object$summary$data #> NULL #> #> #> #> attr(,"meta")$user #> list() #>#> $one #> [1] 1 2 3 4 1 2 3 4 #> #> $two #> [1] 1 2 3 4 #> #> attr(,"types") #> [1] "a" "b" "c" "d" #> attr(,"padding") #> [1] FALSE #> attr(,"docvars") #> docname_ docid_ segid_ #> 1 one one 1 #> 2 two two 1 #> attr(,"meta") #> attr(,"meta")$system #> attr(,"meta")$system$`package-version` #> [1] ‘2.1.1’ #> #> attr(,"meta")$system$`r-version` #> [1] ‘4.0.2’ #> #> attr(,"meta")$system$system #> sysname machine user #> "Darwin" "x86_64" "kbenoit" #> #> attr(,"meta")$system$directory #> [1] "/Users/kbenoit/Dropbox (Personal)/GitHub/quanteda/quanteda/docs/reference" #> #> attr(,"meta")$system$created #> [1] "2020-07-27" #> #> #> attr(,"meta")$object #> attr(,"meta")$object$unit #> [1] "documents" #> #> attr(,"meta")$object$what #> [1] "word" #> #> 
attr(,"meta")$object$ngram #> [1] 1 #> #> attr(,"meta")$object$skip #> [1] 0 #> #> attr(,"meta")$object$concatenator #> [1] "_" #> #> attr(,"meta")$object$summary #> attr(,"meta")$object$summary$hash #> character(0) #> #> attr(,"meta")$object$summary$data #> NULL #> #> #> #> attr(,"meta")$user #> list() #>#> $text1 #> [1] 1 2 3 4 5 6 #> #> attr(,"types") #> [1] "Stemming" "stemmed" "many" "word" "stems" "." #> attr(,"padding") #> [1] FALSE #> attr(,"docvars") #> docname_ docid_ segid_ #> 1 text1 text1 1 #> attr(,"meta") #> attr(,"meta")$system #> attr(,"meta")$system$`package-version` #> [1] ‘2.1.1’ #> #> attr(,"meta")$system$`r-version` #> [1] ‘4.0.2’ #> #> attr(,"meta")$system$system #> sysname machine user #> "Darwin" "x86_64" "kbenoit" #> #> attr(,"meta")$system$directory #> [1] "/Users/kbenoit/Dropbox (Personal)/GitHub/quanteda/quanteda/docs/reference" #> #> attr(,"meta")$system$created #> [1] "2020-07-27" #> #> #> attr(,"meta")$object #> attr(,"meta")$object$unit #> [1] "documents" #> #> attr(,"meta")$object$what #> [1] "word" #> #> attr(,"meta")$object$ngram #> [1] 1 #> #> attr(,"meta")$object$skip #> [1] 0 #> #> attr(,"meta")$object$concatenator #> [1] "_" #> #> attr(,"meta")$object$summary #> attr(,"meta")$object$summary$hash #> character(0) #> #> attr(,"meta")$object$summary$data #> NULL #> #> #> #> attr(,"meta")$user #> list() #>#> $text1 #> [1] 1 2 3 4 2 5 #> #> attr(,"types") #> [1] "Stem" "stem" "mani" "word" "." 
#> attr(,"padding") #> [1] FALSE #> attr(,"docvars") #> docname_ docid_ segid_ #> 1 text1 text1 1 #> attr(,"meta") #> attr(,"meta")$system #> attr(,"meta")$system$`package-version` #> [1] ‘2.1.1’ #> #> attr(,"meta")$system$`r-version` #> [1] ‘4.0.2’ #> #> attr(,"meta")$system$system #> sysname machine user #> "Darwin" "x86_64" "kbenoit" #> #> attr(,"meta")$system$directory #> [1] "/Users/kbenoit/Dropbox (Personal)/GitHub/quanteda/quanteda/docs/reference" #> #> attr(,"meta")$system$created #> [1] "2020-07-27" #> #> #> attr(,"meta")$object #> attr(,"meta")$object$unit #> [1] "documents" #> #> attr(,"meta")$object$what #> [1] "word" #> #> attr(,"meta")$object$ngram #> [1] 1 #> #> attr(,"meta")$object$skip #> [1] 0 #> #> attr(,"meta")$object$concatenator #> [1] "_" #> #> attr(,"meta")$object$summary #> attr(,"meta")$object$summary$hash #> character(0) #> #> attr(,"meta")$object$summary$data #> NULL #> #> #> #> attr(,"meta")$user #> list() #>#> $text1 #> [1] 1 2 3 4 5 #> #> attr(,"types") #> [1] "One" "two" "three" "four" "." 
#> attr(,"padding") #> [1] FALSE #> attr(,"docvars") #> docname_ docid_ segid_ #> 1 text1 text1 1 #> attr(,"meta") #> attr(,"meta")$system #> attr(,"meta")$system$`package-version` #> [1] ‘2.1.1’ #> #> attr(,"meta")$system$`r-version` #> [1] ‘4.0.2’ #> #> attr(,"meta")$system$system #> sysname machine user #> "Darwin" "x86_64" "kbenoit" #> #> attr(,"meta")$system$directory #> [1] "/Users/kbenoit/Dropbox (Personal)/GitHub/quanteda/quanteda/docs/reference" #> #> attr(,"meta")$system$created #> [1] "2020-07-27" #> #> #> attr(,"meta")$object #> attr(,"meta")$object$unit #> [1] "documents" #> #> attr(,"meta")$object$what #> [1] "word" #> #> attr(,"meta")$object$ngram #> [1] 1 #> #> attr(,"meta")$object$skip #> [1] 0 #> #> attr(,"meta")$object$concatenator #> [1] "_" #> #> attr(,"meta")$object$summary #> attr(,"meta")$object$summary$hash #> character(0) #> #> attr(,"meta")$object$summary$data #> NULL #> #> #> #> attr(,"meta")$user #> list() #>#> $text1 #> [1] 1 2 3 4 5 #> #> attr(,"types") #> [1] "One" "two" "three" "four" "." 
#> attr(,"padding") #> [1] FALSE #> attr(,"docvars") #> docname_ docid_ segid_ #> 1 text1 text1 1 #> attr(,"meta") #> attr(,"meta")$system #> attr(,"meta")$system$`package-version` #> [1] ‘2.1.1’ #> #> attr(,"meta")$system$`r-version` #> [1] ‘4.0.2’ #> #> attr(,"meta")$system$system #> sysname machine user #> "Darwin" "x86_64" "kbenoit" #> #> attr(,"meta")$system$directory #> [1] "/Users/kbenoit/Dropbox (Personal)/GitHub/quanteda/quanteda/docs/reference" #> #> attr(,"meta")$system$created #> [1] "2020-07-27" #> #> #> attr(,"meta")$object #> attr(,"meta")$object$unit #> [1] "documents" #> #> attr(,"meta")$object$what #> [1] "word" #> #> attr(,"meta")$object$ngram #> [1] 1 #> #> attr(,"meta")$object$skip #> [1] 0 #> #> attr(,"meta")$object$concatenator #> [1] "_" #> #> attr(,"meta")$object$summary #> attr(,"meta")$object$summary$hash #> character(0) #> #> attr(,"meta")$object$summary$data #> NULL #> #> #> #> attr(,"meta")$user #> list() #>#> $text1 #> [1] 1 1 #> #> attr(,"padding") #> [1] FALSE #> attr(,"types") #> [1] "test" #> attr(,"docvars") #> docname_ docid_ segid_ #> 1 text1 text1 1 #> attr(,"meta") #> attr(,"meta")$system #> attr(,"meta")$system$`package-version` #> [1] ‘2.1.1’ #> #> attr(,"meta")$system$`r-version` #> [1] ‘4.0.2’ #> #> attr(,"meta")$system$system #> sysname machine user #> "Darwin" "x86_64" "kbenoit" #> #> attr(,"meta")$system$directory #> [1] "/Users/kbenoit/Dropbox (Personal)/GitHub/quanteda/quanteda/docs/reference" #> #> attr(,"meta")$system$created #> [1] "2020-07-27" #> #> #> attr(,"meta")$object #> attr(,"meta")$object$unit #> [1] "documents" #> #> attr(,"meta")$object$what #> [1] "dictionary" #> #> attr(,"meta")$object$ngram #> [1] 1 #> #> attr(,"meta")$object$skip #> [1] 0 #> #> attr(,"meta")$object$concatenator #> [1] "_" #> #> attr(,"meta")$object$summary #> attr(,"meta")$object$summary$hash #> character(0) #> #> attr(,"meta")$object$summary$data #> NULL #> #> #> #> attr(,"meta")$user #> list() #>#> $text1 #> [1] 1 2 #> #> 
attr(,"types") #> [1] "One" "three" #> attr(,"padding") #> [1] FALSE #> attr(,"docvars") #> docname_ docid_ segid_ #> 1 text1 text1 1 #> attr(,"meta") #> attr(,"meta")$system #> attr(,"meta")$system$`package-version` #> [1] ‘2.1.1’ #> #> attr(,"meta")$system$`r-version` #> [1] ‘4.0.2’ #> #> attr(,"meta")$system$system #> sysname machine user #> "Darwin" "x86_64" "kbenoit" #> #> attr(,"meta")$system$directory #> [1] "/Users/kbenoit/Dropbox (Personal)/GitHub/quanteda/quanteda/docs/reference" #> #> attr(,"meta")$system$created #> [1] "2020-07-27" #> #> #> attr(,"meta")$object #> attr(,"meta")$object$unit #> [1] "documents" #> #> attr(,"meta")$object$what #> [1] "word" #> #> attr(,"meta")$object$ngram #> [1] 1 #> #> attr(,"meta")$object$skip #> [1] 0 #> #> attr(,"meta")$object$concatenator #> [1] "_" #> #> attr(,"meta")$object$summary #> attr(,"meta")$object$summary$hash #> character(0) #> #> attr(,"meta")$object$summary$data #> NULL #> #> #> #> attr(,"meta")$user #> list() #>#> $text1 #> [1] 1 0 2 0 0 #> #> attr(,"types") #> [1] "One" "three" #> attr(,"padding") #> [1] TRUE #> attr(,"docvars") #> docname_ docid_ segid_ #> 1 text1 text1 1 #> attr(,"meta") #> attr(,"meta")$system #> attr(,"meta")$system$`package-version` #> [1] ‘2.1.1’ #> #> attr(,"meta")$system$`r-version` #> [1] ‘4.0.2’ #> #> attr(,"meta")$system$system #> sysname machine user #> "Darwin" "x86_64" "kbenoit" #> #> attr(,"meta")$system$directory #> [1] "/Users/kbenoit/Dropbox (Personal)/GitHub/quanteda/quanteda/docs/reference" #> #> attr(,"meta")$system$created #> [1] "2020-07-27" #> #> #> attr(,"meta")$object #> attr(,"meta")$object$unit #> [1] "documents" #> #> attr(,"meta")$object$what #> [1] "word" #> #> attr(,"meta")$object$ngram #> [1] 1 #> #> attr(,"meta")$object$skip #> [1] 0 #> #> attr(,"meta")$object$concatenator #> [1] "_" #> #> attr(,"meta")$object$summary #> attr(,"meta")$object$summary$hash #> character(0) #> #> attr(,"meta")$object$summary$data #> NULL #> #> #> #> attr(,"meta")$user #> 
list() #>#> $text1 #> [1] 1 2 3 4 5 6 7 #> #> attr(,"types") #> [1] "One_two" "two_three" "three_four" "four_." #> [5] "One_two_three" "two_three_four" "three_four_." #> attr(,"padding") #> [1] FALSE #> attr(,"docvars") #> docname_ docid_ segid_ #> 1 text1 text1 1 #> attr(,"meta") #> attr(,"meta")$system #> attr(,"meta")$system$`package-version` #> [1] ‘2.1.1’ #> #> attr(,"meta")$system$`r-version` #> [1] ‘4.0.2’ #> #> attr(,"meta")$system$system #> sysname machine user #> "Darwin" "x86_64" "kbenoit" #> #> attr(,"meta")$system$directory #> [1] "/Users/kbenoit/Dropbox (Personal)/GitHub/quanteda/quanteda/docs/reference" #> #> attr(,"meta")$system$created #> [1] "2020-07-27" #> #> #> attr(,"meta")$object #> attr(,"meta")$object$unit #> [1] "documents" #> #> attr(,"meta")$object$what #> [1] "word" #> #> attr(,"meta")$object$ngram #> [1] 2 3 #> #> attr(,"meta")$object$skip #> [1] 0 #> #> attr(,"meta")$object$concatenator #> [1] "_" #> #> attr(,"meta")$object$summary #> attr(,"meta")$object$summary$hash #> character(0) #> #> attr(,"meta")$object$summary$data #> NULL #> #> #> #> attr(,"meta")$user #> list() #>