tokens_recompile.Rd
This function recompiles a serialized tokens object after its vocabulary has been changed in a way that either makes some of its types identical (for example, lowercasing a type when its lowercased form already exists in the type table) or introduces gaps in the integer map of the types. It also re-indexes the types attribute to account for types that have become duplicates through a procedure such as stemming or lowercasing, or for new tokens added through compounding.
tokens_recompile(x, method = c("C++", "R"), gap = TRUE, dup = TRUE)
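Argument | Description |
---|---|
x | the tokens object to be recompiled |
method | which implementation performs the recompilation: `"C++"` (the default) or `"R"` |
gap | if `TRUE`, remove gaps in the integer map of the types (unused token IDs) |
dup | if `TRUE`, merge types that have become duplicates into a single ID |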
# lowercasing
toks1 <- tokens(c(one = "a b c d A B C D", two = "A B C d"))
attr(toks1, "types") <- char_tolower(attr(toks1, "types"))
unclass(toks1)
#> $one
#> [1] 1 2 3 4 5 6 7 8
#>
#> $two
#> [1] 5 6 7 4
#>
#> attr(,"types")
#> [1] "a" "b" "c" "d" "a" "b" "c" "d"
#> attr(,"padding")
#> [1] FALSE
#> attr(,"what")
#> [1] "word"
#> attr(,"ngrams")
#> [1] 1
#> attr(,"skip")
#> [1] 0
#> attr(,"concatenator")
#> [1] "_"
#> attr(,"docvars")
#> data frame with 0 columns and 2 rows

#> $one
#> [1] 1 2 3 4 1 2 3 4
#>
#> $two
#> [1] 1 2 3 4
#>
#> attr(,"types")
#> [1] "a" "b" "c" "d"
#> attr(,"padding")
#> [1] FALSE
#> attr(,"what")
#> [1] "word"
#> attr(,"ngrams")
#> [1] 1
#> attr(,"skip")
#> [1] 0
#> attr(,"concatenator")
#> [1] "_"
#> attr(,"docvars")
#> data frame with 0 columns and 2 rows

#> $text1
#> [1] 1 2 3 4 5 6
#>
#> attr(,"types")
#> [1] "Stemming" "stemmed" "many" "word" "stems" "."
#> attr(,"padding")
#> [1] FALSE
#> attr(,"what")
#> [1] "word"
#> attr(,"ngrams")
#> [1] 1
#> attr(,"skip")
#> [1] 0
#> attr(,"concatenator")
#> [1] "_"
#> attr(,"docvars")
#> data frame with 0 columns and 1 row

#> $text1
#> [1] 1 2 3 4 2 5
#>
#> attr(,"types")
#> [1] "Stem" "stem" "mani" "word" "."
#> attr(,"padding")
#> [1] FALSE
#> attr(,"what")
#> [1] "word"
#> attr(,"ngrams")
#> [1] 1
#> attr(,"skip")
#> [1] 0
#> attr(,"concatenator")
#> [1] "_"
#> attr(,"docvars")
#> data frame with 0 columns and 1 row

#> $text1
#> [1] 1 2 3 4 5
#>
#> attr(,"types")
#> [1] "One" "two" "three" "four" "."
#> attr(,"padding")
#> [1] FALSE
#> attr(,"what")
#> [1] "word"
#> attr(,"ngrams")
#> [1] 1
#> attr(,"skip")
#> [1] 0
#> attr(,"concatenator")
#> [1] "_"
#> attr(,"docvars")
#> data frame with 0 columns and 1 row

#> $text1
#> [1] 1 2 3 4 5
#>
#> attr(,"types")
#> [1] "One" "two" "three" "four" "."
#> attr(,"padding")
#> [1] FALSE
#> attr(,"what")
#> [1] "word"
#> attr(,"ngrams")
#> [1] 1
#> attr(,"skip")
#> [1] 0
#> attr(,"concatenator")
#> [1] "_"
#> attr(,"docvars")
#> data frame with 0 columns and 1 row

#> $text1
#> [1] 1 1
#>
#> attr(,"padding")
#> [1] FALSE
#> attr(,"types")
#> [1] "test"
#> attr(,"what")
#> [1] "dictionary"
#> attr(,"dictionary")
#> Dictionary object with 1 key entry.
#> - [test]:
#>   - one, three
#> attr(,"ngrams")
#> [1] 1
#> attr(,"skip")
#> [1] 0
#> attr(,"concatenator")
#> [1] "_"
#> attr(,"docvars")
#> data frame with 0 columns and 1 row

#> $text1
#> [1] 1 2
#>
#> attr(,"types")
#> [1] "One" "three"
#> attr(,"padding")
#> [1] FALSE
#> attr(,"what")
#> [1] "word"
#> attr(,"ngrams")
#> [1] 1
#> attr(,"skip")
#> [1] 0
#> attr(,"concatenator")
#> [1] "_"
#> attr(,"docvars")
#> data frame with 0 columns and 1 row

#> $text1
#> [1] 1 0 2 0 0
#>
#> attr(,"types")
#> [1] "One" "three"
#> attr(,"padding")
#> [1] TRUE
#> attr(,"what")
#> [1] "word"
#> attr(,"ngrams")
#> [1] 1
#> attr(,"skip")
#> [1] 0
#> attr(,"concatenator")
#> [1] "_"
#> attr(,"docvars")
#> data frame with 0 columns and 1 row

#> $text1
#> [1] 1 2 3 4 5 6 7
#>
#> attr(,"types")
#> [1] "One_two" "two_three" "three_four" "four_."
#> [5] "One_two_three" "two_three_four" "three_four_."
#> attr(,"padding")
#> [1] FALSE
#> attr(,"what")
#> [1] "word"
#> attr(,"ngrams")
#> [1] 2 3
#> attr(,"skip")
#> [1] 0
#> attr(,"concatenator")
#> [1] "_"
#> attr(,"docvars")
#> data frame with 0 columns and 1 row