This function recompiles a hashed tokens object when the vocabulary has been changed in a way that makes some of its types identical — for example, lowercasing when a lowercased version of the type already exists in the hash table — or that introduces gaps in the integer map of the types. It also reindexes the types attribute to account for types that may have become duplicates through a procedure such as stemming or lowercasing, or for the addition of new tokens through compounding.

tokens_hashed_recompile(x, method = c("C++", "R"))

Arguments

x

the tokens object to be recompiled

method

"C++" for C++ implementation or "R" for an older R-based method

Examples

# lowercasing
toks1 <- tokens(c(one = "a b c d A B C D", two = "A B C d"))
attr(toks1, "types") <- char_tolower(attr(toks1, "types"))
unclass(toks1)
#> $one
#> [1] 1 2 3 4 5 6 7 8
#> 
#> $two
#> [1] 5 6 7 4
#> 
#> attr(,"types")
#> [1] "a" "b" "c" "d" "a" "b" "c" "d"
#> attr(,"what")
#> [1] "word"
#> attr(,"ngrams")
#> [1] 1
#> attr(,"skip")
#> [1] 0
#> attr(,"concatenator")
#> [1] "_"
#> attr(,"padding")
#> [1] FALSE
unclass(quanteda:::tokens_hashed_recompile(toks1))
#> $one
#> [1] 1 2 3 4 1 2 3 4
#> 
#> $two
#> [1] 1 2 3 4
#> 
#> attr(,"types")
#> [1] "a" "b" "c" "d"
#> attr(,"padding")
#> [1] FALSE
#> attr(,"what")
#> [1] "word"
#> attr(,"ngrams")
#> [1] 1
#> attr(,"skip")
#> [1] 0
#> attr(,"concatenator")
#> [1] "_"
# stemming
toks2 <- tokens("Stemming stemmed many word stems.")
unclass(toks2)
#> [[1]]
#> [1] 1 2 3 4 5 6
#> 
#> attr(,"types")
#> [1] "Stemming" "stemmed" "many" "word" "stems" "."
#> attr(,"what")
#> [1] "word"
#> attr(,"ngrams")
#> [1] 1
#> attr(,"skip")
#> [1] 0
#> attr(,"concatenator")
#> [1] "_"
#> attr(,"padding")
#> [1] FALSE
unclass(quanteda:::tokens_hashed_recompile(tokens_wordstem(toks2)))
#> [[1]]
#> [1] 1 2 3 4 2 5
#> 
#> attr(,"types")
#> [1] "Stem" "stem" "mani" "word" "."
#> attr(,"padding")
#> [1] FALSE
#> attr(,"what")
#> [1] "word"
#> attr(,"ngrams")
#> [1] 1
#> attr(,"skip")
#> [1] 0
#> attr(,"concatenator")
#> [1] "_"
# compounding
toks3 <- tokens("One two three four.")
unclass(toks3)
#> [[1]]
#> [1] 1 2 3 4 5
#> 
#> attr(,"types")
#> [1] "One" "two" "three" "four" "."
#> attr(,"what")
#> [1] "word"
#> attr(,"ngrams")
#> [1] 1
#> attr(,"skip")
#> [1] 0
#> attr(,"concatenator")
#> [1] "_"
#> attr(,"padding")
#> [1] FALSE
unclass(tokens_compound(toks3, "two three"))
#> [[1]]
#> [1] 1 2 3 4 5
#> 
#> attr(,"types")
#> [1] "One" "two" "three" "four" "."
#> attr(,"what")
#> [1] "word"
#> attr(,"ngrams")
#> [1] 1
#> attr(,"skip")
#> [1] 0
#> attr(,"concatenator")
#> [1] "_"
#> attr(,"padding")
#> [1] FALSE
# lookup
dict <- dictionary(list(test = c("one", "three")))
unclass(tokens_lookup(toks3, dict))
#> [[1]]
#> [1] 1 1
#> 
#> attr(,"padding")
#> [1] FALSE
#> attr(,"types")
#> [1] "test"
#> attr(,"what")
#> [1] "dictionary"
#> attr(,"ngrams")
#> [1] 1
#> attr(,"skip")
#> [1] 0
#> attr(,"concatenator")
#> [1] "_"
#> attr(,"dictionary")
#> attr(,"dictionary")$test
#> [1] "one" "three"
#> 
#> attr(,"dictionary")attr(,"concatenator")
#> [1] " "
# empty pads
unclass(tokens_select(toks3, dict))
#> [[1]]
#> [1] 1 2
#> 
#> attr(,"types")
#> [1] "One" "three"
#> attr(,"padding")
#> [1] FALSE
#> attr(,"what")
#> [1] "word"
#> attr(,"ngrams")
#> [1] 1
#> attr(,"skip")
#> [1] 0
#> attr(,"concatenator")
#> [1] "_"
unclass(tokens_select(toks3, dict, pad = TRUE))
#> [[1]]
#> [1] 1 0 2 0 0
#> 
#> attr(,"types")
#> [1] "One" "three"
#> attr(,"padding")
#> [1] TRUE
#> attr(,"what")
#> [1] "word"
#> attr(,"ngrams")
#> [1] 1
#> attr(,"skip")
#> [1] 0
#> attr(,"concatenator")
#> [1] "_"
# ngrams
unclass(tokens_ngrams(toks3, n = 2:3))
#> [[1]]
#> [1] 1 2 3 4 5 6 7
#> 
#> attr(,"types")
#> [1] "One_two"       "two_three"     "three_four"    "four_."
#> [5] "One_two_three" "two_three_four" "three_four_."
#> attr(,"padding")
#> [1] FALSE
#> attr(,"what")
#> [1] "word"
#> attr(,"ngrams")
#> [1] 2 3
#> attr(,"skip")
#> [1] 0
#> attr(,"concatenator")
#> [1] "_"