This function recompiles a serialized tokens object when the vocabulary has been changed in a way that either makes some of its types identical (such as lowercasing when a lowercased version of the type already exists in the type table) or introduces gaps in the integer map of the types. It also reindexes the types attribute to account for types that may have become duplicates through a procedure such as stemming or lowercasing, or for new tokens added through compounding.
tokens_recompile(x, method = c("C++", "R"))
| Argument | Description |
|---|---|
| x | the tokens object to be recompiled |
| method | the recompilation method to use: either `"C++"` or `"R"` |
# lowercasing toks1 <- tokens(c(one = "a b c d A B C D", two = "A B C d")) attr(toks1, "types") <- char_tolower(attr(toks1, "types")) unclass(toks1)#> $one #> [1] 1 2 3 4 5 6 7 8 #> #> $two #> [1] 5 6 7 4 #> #> attr(,"types") #> [1] "a" "b" "c" "d" "a" "b" "c" "d" #> attr(,"padding") #> [1] FALSE #> attr(,"what") #> [1] "word" #> attr(,"ngrams") #> [1] 1 #> attr(,"skip") #> [1] 0 #> attr(,"concatenator") #> [1] "_" #> attr(,"docvars") #> data frame with 0 columns and 2 rows#> $one #> [1] 1 2 3 4 1 2 3 4 #> #> $two #> [1] 1 2 3 4 #> #> attr(,"types") #> [1] "a" "b" "c" "d" #> attr(,"padding") #> [1] FALSE #> attr(,"what") #> [1] "word" #> attr(,"ngrams") #> [1] 1 #> attr(,"skip") #> [1] 0 #> attr(,"concatenator") #> [1] "_" #> attr(,"docvars") #> data frame with 0 columns and 2 rows#> $text1 #> [1] 1 2 3 4 5 6 #> #> attr(,"types") #> [1] "Stemming" "stemmed" "many" "word" "stems" "." #> attr(,"padding") #> [1] FALSE #> attr(,"what") #> [1] "word" #> attr(,"ngrams") #> [1] 1 #> attr(,"skip") #> [1] 0 #> attr(,"concatenator") #> [1] "_" #> attr(,"docvars") #> data frame with 0 columns and 1 row#> $text1 #> [1] 1 2 3 4 2 5 #> #> attr(,"types") #> [1] "Stem" "stem" "mani" "word" "." #> attr(,"padding") #> [1] FALSE #> attr(,"what") #> [1] "word" #> attr(,"ngrams") #> [1] 1 #> attr(,"skip") #> [1] 0 #> attr(,"concatenator") #> [1] "_" #> attr(,"docvars") #> data frame with 0 columns and 1 row#> $text1 #> [1] 1 2 3 4 5 #> #> attr(,"types") #> [1] "One" "two" "three" "four" "." #> attr(,"padding") #> [1] FALSE #> attr(,"what") #> [1] "word" #> attr(,"ngrams") #> [1] 1 #> attr(,"skip") #> [1] 0 #> attr(,"concatenator") #> [1] "_" #> attr(,"docvars") #> data frame with 0 columns and 1 row#> $text1 #> [1] 1 2 3 4 5 #> #> attr(,"types") #> [1] "One" "two" "three" "four" "." 
#> attr(,"padding") #> [1] FALSE #> attr(,"what") #> [1] "word" #> attr(,"ngrams") #> [1] 1 #> attr(,"skip") #> [1] 0 #> attr(,"concatenator") #> [1] "_" #> attr(,"docvars") #> data frame with 0 columns and 1 row#> $text1 #> [1] 1 1 #> #> attr(,"padding") #> [1] FALSE #> attr(,"types") #> [1] "test" #> attr(,"what") #> [1] "dictionary" #> attr(,"dictionary") #> attr(,"dictionary")$test #> [1] "one" "three" #> #> attr(,"dictionary")attr(,"concatenator") #> [1] " " #> attr(,"ngrams") #> [1] 1 #> attr(,"skip") #> [1] 0 #> attr(,"concatenator") #> [1] "_" #> attr(,"docvars") #> data frame with 0 columns and 1 row#> $text1 #> [1] 1 2 #> #> attr(,"types") #> [1] "One" "three" #> attr(,"padding") #> [1] FALSE #> attr(,"what") #> [1] "word" #> attr(,"ngrams") #> [1] 1 #> attr(,"skip") #> [1] 0 #> attr(,"concatenator") #> [1] "_" #> attr(,"docvars") #> data frame with 0 columns and 1 row#> $text1 #> [1] 1 0 2 0 0 #> #> attr(,"types") #> [1] "One" "three" #> attr(,"padding") #> [1] TRUE #> attr(,"what") #> [1] "word" #> attr(,"ngrams") #> [1] 1 #> attr(,"skip") #> [1] 0 #> attr(,"concatenator") #> [1] "_" #> attr(,"docvars") #> data frame with 0 columns and 1 row#> $text1 #> [1] 1 2 3 4 5 6 7 #> #> attr(,"types") #> [1] "One_two" "two_three" "three_four" "four_." #> [5] "One_two_three" "two_three_four" "three_four_." #> attr(,"padding") #> [1] FALSE #> attr(,"what") #> [1] "word" #> attr(,"ngrams") #> [1] 2 3 #> attr(,"skip") #> [1] 0 #> attr(,"concatenator") #> [1] "_" #> attr(,"docvars") #> data frame with 0 columns and 1 row