This function recompiles a serialized tokens object when the vocabulary has been changed in a way that makes some of its types identical (for example, lowercasing a type when its lowercased form already exists in the type table) or that introduces gaps in the integer map of the types. It also re-indexes the types attribute to account for types that have become duplicates through a procedure such as stemming or lowercasing, or for new tokens added through compounding.

Usage

tokens_recompile(x, method = c("C++", "R"), gap = TRUE, dup = TRUE)

Arguments

x

the tokens object to be recompiled

method

"C++" for C++ implementation or "R" for an older R-based method

gap

if TRUE, remove gaps between token IDs

dup

if TRUE, merge duplicated token types into the same ID
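
The gap and dup steps can be switched off independently. A minimal sketch, assuming the internal quanteda:::tokens_recompile() call used in the examples below (printed output omitted):

library(quanteda)
toks <- tokens(c(one = "a b c d A B C D", two = "A B C d"))
attr(toks, "types") <- char_tolower(attr(toks, "types"))    # leaves duplicate entries in the type table
quanteda:::tokens_recompile(toks, gap = FALSE, dup = TRUE)  # merge duplicated types only
quanteda:::tokens_recompile(toks, gap = TRUE, dup = FALSE)  # close ID gaps only, keep duplicates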

Examples

# lowercasing
toks1 <- tokens(c(one = "a b c d A B C D", two = "A B C d"))
attr(toks1, "types") <- char_tolower(attr(toks1, "types"))
unclass(toks1)
#> $one
#> [1] 1 2 3 4 5 6 7 8
#> 
#> $two
#> [1] 5 6 7 4
#> 
#> attr(,"types")
#> [1] "a" "b" "c" "d" "a" "b" "c" "d"
#> attr(,"padding")
#> [1] FALSE
#> attr(,"what")
#> [1] "word"
#> attr(,"ngrams")
#> [1] 1
#> attr(,"skip")
#> [1] 0
#> attr(,"concatenator")
#> [1] "_"
#> attr(,"docvars")
#> data frame with 0 columns and 2 rows
unclass(quanteda:::tokens_recompile(toks1))
#> $one
#> [1] 1 2 3 4 1 2 3 4
#> 
#> $two
#> [1] 1 2 3 4
#> 
#> attr(,"types")
#> [1] "a" "b" "c" "d"
#> attr(,"padding")
#> [1] FALSE
#> attr(,"what")
#> [1] "word"
#> attr(,"ngrams")
#> [1] 1
#> attr(,"skip")
#> [1] 0
#> attr(,"concatenator")
#> [1] "_"
#> attr(,"docvars")
#> data frame with 0 columns and 2 rows
# stemming
toks2 <- tokens("Stemming stemmed many word stems.")
unclass(toks2)
#> $text1
#> [1] 1 2 3 4 5 6
#> 
#> attr(,"types")
#> [1] "Stemming" "stemmed" "many" "word" "stems" "."
#> attr(,"padding")
#> [1] FALSE
#> attr(,"what")
#> [1] "word"
#> attr(,"ngrams")
#> [1] 1
#> attr(,"skip")
#> [1] 0
#> attr(,"concatenator")
#> [1] "_"
#> attr(,"docvars")
#> data frame with 0 columns and 1 row
unclass(tokens_wordstem(toks2))
#> $text1
#> [1] 1 2 3 4 2 5
#> 
#> attr(,"types")
#> [1] "Stem" "stem" "mani" "word" "."
#> attr(,"padding")
#> [1] FALSE
#> attr(,"what")
#> [1] "word"
#> attr(,"ngrams")
#> [1] 1
#> attr(,"skip")
#> [1] 0
#> attr(,"concatenator")
#> [1] "_"
#> attr(,"docvars")
#> data frame with 0 columns and 1 row
# compounding
toks3 <- tokens("One two three four.")
unclass(toks3)
#> $text1
#> [1] 1 2 3 4 5
#> 
#> attr(,"types")
#> [1] "One" "two" "three" "four" "."
#> attr(,"padding")
#> [1] FALSE
#> attr(,"what")
#> [1] "word"
#> attr(,"ngrams")
#> [1] 1
#> attr(,"skip")
#> [1] 0
#> attr(,"concatenator")
#> [1] "_"
#> attr(,"docvars")
#> data frame with 0 columns and 1 row
unclass(tokens_compound(toks3, "two three"))
#> $text1
#> [1] 1 2 3 4 5
#> 
#> attr(,"types")
#> [1] "One" "two" "three" "four" "."
#> attr(,"padding")
#> [1] FALSE
#> attr(,"what")
#> [1] "word"
#> attr(,"ngrams")
#> [1] 1
#> attr(,"skip")
#> [1] 0
#> attr(,"concatenator")
#> [1] "_"
#> attr(,"docvars")
#> data frame with 0 columns and 1 row
# lookup
dict <- dictionary(list(test = c("one", "three")))
unclass(tokens_lookup(toks3, dict))
#> $text1
#> [1] 1 1
#> 
#> attr(,"padding")
#> [1] FALSE
#> attr(,"types")
#> [1] "test"
#> attr(,"what")
#> [1] "dictionary"
#> attr(,"dictionary")
#> Dictionary object with 1 key entry.
#> - [test]:
#>   - one, three
#> attr(,"ngrams")
#> [1] 1
#> attr(,"skip")
#> [1] 0
#> attr(,"concatenator")
#> [1] "_"
#> attr(,"docvars")
#> data frame with 0 columns and 1 row
# empty pads
unclass(tokens_select(toks3, dict))
#> $text1
#> [1] 1 2
#> 
#> attr(,"types")
#> [1] "One" "three"
#> attr(,"padding")
#> [1] FALSE
#> attr(,"what")
#> [1] "word"
#> attr(,"ngrams")
#> [1] 1
#> attr(,"skip")
#> [1] 0
#> attr(,"concatenator")
#> [1] "_"
#> attr(,"docvars")
#> data frame with 0 columns and 1 row
unclass(tokens_select(toks3, dict, pad = TRUE))
#> $text1
#> [1] 1 0 2 0 0
#> 
#> attr(,"types")
#> [1] "One" "three"
#> attr(,"padding")
#> [1] TRUE
#> attr(,"what")
#> [1] "word"
#> attr(,"ngrams")
#> [1] 1
#> attr(,"skip")
#> [1] 0
#> attr(,"concatenator")
#> [1] "_"
#> attr(,"docvars")
#> data frame with 0 columns and 1 row
# ngrams
unclass(tokens_ngrams(toks3, n = 2:3))
#> $text1
#> [1] 1 2 3 4 5 6 7
#> 
#> attr(,"types")
#> [1] "One_two" "two_three" "three_four" "four_."
#> [5] "One_two_three" "two_three_four" "three_four_."
#> attr(,"padding")
#> [1] FALSE
#> attr(,"what")
#> [1] "word"
#> attr(,"ngrams")
#> [1] 2 3
#> attr(,"skip")
#> [1] 0
#> attr(,"concatenator")
#> [1] "_"
#> attr(,"docvars")
#> data frame with 0 columns and 1 row