This function recompiles a hashed tokens object when the vocabulary has been changed in a way that makes some of its types identical, such as lowercasing when a lowercased version of the type already exists in the hash table, or that introduces gaps in the integer map of the types. It also reindexes the types attribute to account for types that may have become duplicates through a procedure such as stemming or lowercasing, or for new tokens added through compounding.

tokens_recompile(x, method = c("C++", "R"))

Arguments

x

the tokens object to be recompiled

method

"C++" for C++ implementation or "R" for an older R-based method

Examples

# lowercasing
toks1 <- tokens(c(one = "a b c d A B C D", two = "A B C d"))
attr(toks1, "types") <- char_tolower(attr(toks1, "types"))
unclass(toks1)
#> $one
#> [1] 1 2 3 4 5 6 7 8
#>
#> $two
#> [1] 5 6 7 4
#>
#> attr(,"types")
#> [1] "a" "b" "c" "d" "a" "b" "c" "d"
#> attr(,"padding")
#> [1] FALSE
#> attr(,"what")
#> [1] "word"
#> attr(,"ngrams")
#> [1] 1
#> attr(,"skip")
#> [1] 0
#> attr(,"concatenator")
#> [1] "_"
#> attr(,"docvars")
#> data frame with 0 columns and 2 rows
unclass(quanteda:::tokens_recompile(toks1))
#> $one
#> [1] 1 2 3 4 1 2 3 4
#>
#> $two
#> [1] 1 2 3 4
#>
#> attr(,"types")
#> [1] "a" "b" "c" "d"
#> attr(,"padding")
#> [1] FALSE
#> attr(,"what")
#> [1] "word"
#> attr(,"ngrams")
#> [1] 1
#> attr(,"skip")
#> [1] 0
#> attr(,"concatenator")
#> [1] "_"
#> attr(,"docvars")
#> data frame with 0 columns and 2 rows
# stemming
toks2 <- tokens("Stemming stemmed many word stems.")
unclass(toks2)
#> $text1
#> [1] 1 2 3 4 5 6
#>
#> attr(,"types")
#> [1] "Stemming" "stemmed" "many" "word" "stems" "."
#> attr(,"padding")
#> [1] FALSE
#> attr(,"what")
#> [1] "word"
#> attr(,"ngrams")
#> [1] 1
#> attr(,"skip")
#> [1] 0
#> attr(,"concatenator")
#> [1] "_"
#> attr(,"docvars")
#> data frame with 0 columns and 1 row
unclass(quanteda:::tokens_recompile(tokens_wordstem(toks2)))
#> $text1
#> [1] 1 2 3 4 2 5
#>
#> attr(,"types")
#> [1] "Stem" "stem" "mani" "word" "."
#> attr(,"padding")
#> [1] FALSE
#> attr(,"what")
#> [1] "word"
#> attr(,"ngrams")
#> [1] 1
#> attr(,"skip")
#> [1] 0
#> attr(,"concatenator")
#> [1] "_"
#> attr(,"docvars")
#> data frame with 0 columns and 1 row
# compounding
toks3 <- tokens("One two three four.")
unclass(toks3)
#> $text1
#> [1] 1 2 3 4 5
#>
#> attr(,"types")
#> [1] "One" "two" "three" "four" "."
#> attr(,"padding")
#> [1] FALSE
#> attr(,"what")
#> [1] "word"
#> attr(,"ngrams")
#> [1] 1
#> attr(,"skip")
#> [1] 0
#> attr(,"concatenator")
#> [1] "_"
#> attr(,"docvars")
#> data frame with 0 columns and 1 row
unclass(tokens_compound(toks3, "two three"))
#> $text1
#> [1] 1 2 3 4 5
#>
#> attr(,"types")
#> [1] "One" "two" "three" "four" "."
#> attr(,"padding")
#> [1] FALSE
#> attr(,"what")
#> [1] "word"
#> attr(,"ngrams")
#> [1] 1
#> attr(,"skip")
#> [1] 0
#> attr(,"concatenator")
#> [1] "_"
#> attr(,"docvars")
#> data frame with 0 columns and 1 row
# lookup
dict <- dictionary(list(test = c("one", "three")))
unclass(tokens_lookup(toks3, dict))
#> $text1
#> [1] 1 1
#>
#> attr(,"padding")
#> [1] FALSE
#> attr(,"types")
#> [1] "test"
#> attr(,"what")
#> [1] "dictionary"
#> attr(,"dictionary")
#> attr(,"dictionary")$test
#> [1] "one" "three"
#>
#> attr(,"dictionary")attr(,"concatenator")
#> [1] " "
#> attr(,"ngrams")
#> [1] 1
#> attr(,"skip")
#> [1] 0
#> attr(,"concatenator")
#> [1] "_"
#> attr(,"docvars")
#> data frame with 0 columns and 1 row
# empty pads
unclass(tokens_select(toks3, dict))
#> $text1
#> [1] 1 2
#>
#> attr(,"types")
#> [1] "One" "three"
#> attr(,"padding")
#> [1] FALSE
#> attr(,"what")
#> [1] "word"
#> attr(,"ngrams")
#> [1] 1
#> attr(,"skip")
#> [1] 0
#> attr(,"concatenator")
#> [1] "_"
#> attr(,"docvars")
#> data frame with 0 columns and 1 row
unclass(tokens_select(toks3, dict, pad = TRUE))
#> $text1
#> [1] 1 0 2 0 0
#>
#> attr(,"types")
#> [1] "One" "three"
#> attr(,"padding")
#> [1] TRUE
#> attr(,"what")
#> [1] "word"
#> attr(,"ngrams")
#> [1] 1
#> attr(,"skip")
#> [1] 0
#> attr(,"concatenator")
#> [1] "_"
#> attr(,"docvars")
#> data frame with 0 columns and 1 row
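As a further sketch (output not reproduced here, and not part of the original examples), the padded object above can itself be passed to tokens_recompile(); the 0 pads are expected to be kept in place while the remaining types are renumbered. The name toks3_pad is only an illustrative convenience.

# sketch: recompile the padded object; the 0 pads should remain as padding
toks3_pad <- tokens_select(toks3, dict, pad = TRUE)
unclass(quanteda:::tokens_recompile(toks3_pad))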
# ngrams
unclass(tokens_ngrams(toks3, n = 2:3))
#> $text1
#> [1] 1 2 3 4 5 6 7
#>
#> attr(,"types")
#> [1] "One_two" "two_three" "three_four" "four_."
#> [5] "One_two_three" "two_three_four" "three_four_."
#> attr(,"padding")
#> [1] FALSE
#> attr(,"what")
#> [1] "word"
#> attr(,"ngrams")
#> [1] 2 3
#> attr(,"skip")
#> [1] 0
#> attr(,"concatenator")
#> [1] "_"
#> attr(,"docvars")
#> data frame with 0 columns and 1 row
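The method argument is not exercised in the examples above; as a sketch (output omitted, and the two implementations are expected to produce the same result), it can be set explicitly to compare the default C++ implementation with the older R-based one:

# compare the default C++ implementation with the older R-based method
identical(
    unclass(quanteda:::tokens_recompile(toks1, method = "C++")),
    unclass(quanteda:::tokens_recompile(toks1, method = "R"))
)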