Convert tokens into equivalence classes defined by values of a dictionary object.
tokens_lookup(x, dictionary, levels = 1:5,
  valuetype = c("glob", "regex", "fixed"), case_insensitive = TRUE,
  capkeys = !exclusive, exclusive = TRUE, nomatch = NULL,
  verbose = quanteda_options("verbose"))
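A minimal sketch of a basic call, assuming the quanteda package is attached; it uses the package's built-in `data_corpus_inaugural` corpus, as in the examples further down. Each token matching one of the glob patterns is replaced by the key `freedom`, and because `exclusive = TRUE` by default, all other tokens are dropped.

```r
library("quanteda")

# tokenize the inaugural addresses, then replace any token matching a
# dictionary value ("glob" patterns) with its key, dropping everything else
toks <- tokens(data_corpus_inaugural)
dict <- dictionary(list(freedom = c("free*", "libert*")))
toks_freedom <- tokens_lookup(toks, dict, valuetype = "glob")
```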
Argument | Description |
---|---|
`x` | the tokens object to which the dictionary or thesaurus will be applied |
`dictionary` | the dictionary-class object that will be applied to `x` |
`levels` | integers specifying the levels of entries in a hierarchical dictionary that will be applied. The top level is 1, and subsequent levels describe lower nesting levels. Values may be combined, even if these levels are not contiguous, e.g. `levels = c(1:3)` will collapse the second level into the first, but record the third level (if present) collapsed below the first. (See examples.) |
`valuetype` | the type of pattern matching: `"glob"` for "glob"-style wildcard expressions, `"regex"` for regular expressions, or `"fixed"` for exact matching |
`case_insensitive` | ignore the case of dictionary values if `TRUE` |
`capkeys` | if `TRUE`, convert dictionary keys to uppercase to distinguish them from other features |
`exclusive` | if `TRUE`, remove all features not in the dictionary; otherwise, replace matched values with their dictionary keys while leaving other features unaffected (see the sketch after this table) |
`nomatch` | an optional character naming a new key for tokens that do not match any of the dictionary values. If `NULL` (the default), unmatched tokens are not recorded |
`verbose` | print status messages if `TRUE` |
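Because `capkeys` defaults to `!exclusive`, a non-exclusive lookup upper-cases the keys so they stay distinguishable from the untouched tokens. A minimal sketch of the `exclusive`/`capkeys` interaction, reusing a sentence from the examples below:

```r
library("quanteda")

toks <- tokens(c(d1 = "The United States has the Atlantic Ocean and the Pacific Ocean."))
dict <- dictionary(list(oceans = c("Atlantic", "Pacific")))

# exclusive = TRUE (the default): only the dictionary keys are kept
tokens_lookup(toks, dict)

# exclusive = FALSE: matched tokens are replaced in place and the rest are
# kept; capkeys = !exclusive then upper-cases the key to "OCEANS"
tokens_lookup(toks, dict, exclusive = FALSE)
```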
toks <- tokens(data_corpus_inaugural)
dict <- dictionary(list(country = "united states",
                        law = c('law*', 'constitution'),
                        freedom = c('free*', 'libert*')))
dfm(tokens_lookup(toks, dict, valuetype = 'glob', verbose = TRUE))
#> Document-feature matrix of: 58 documents, 3 features (14.9% sparse).
dfm(tokens_lookup(toks, dict, valuetype = 'glob', verbose = TRUE, nomatch = 'NONE'))
#> Document-feature matrix of: 58 documents, 4 features (11.2% sparse).

dict_fix <- dictionary(list(country = "united states",
                            law = c('law', 'constitution'),
                            freedom = c('freedom', 'liberty')))
# dfm(applyDictionary(toks, dict_fix, valuetype = 'fixed'))
dfm(tokens_lookup(toks, dict_fix, valuetype = 'fixed'))
#> Document-feature matrix of: 58 documents, 3 features (21.3% sparse).

# hierarchical dictionary example
txt <- c(d1 = "The United States has the Atlantic Ocean and the Pacific Ocean.",
         d2 = "Britain and Ireland have the Irish Sea and the English Channel.")
toks <- tokens(txt)
dict <- dictionary(list(US = list(Countries = c("States"),
                                  oceans = c("Atlantic", "Pacific")),
                        Europe = list(Countries = c("Britain", "Ireland"),
                                      oceans = list(west = "Irish Sea",
                                                    east = "English Channel"))))
tokens_lookup(toks, dict, levels = 1)
#> tokens from 2 documents.
#> d1 :
#> [1] "US" "US" "US"
#>
#> d2 :
#> [1] "Europe" "Europe" "Europe" "Europe"
#>
tokens_lookup(toks, dict, levels = 2)
#> tokens from 2 documents.
#> d1 :
#> [1] "Countries" "oceans"    "oceans"
#>
#> d2 :
#> [1] "Countries" "Countries" "oceans"    "oceans"
#>
tokens_lookup(toks, dict, levels = 1:2)
#> tokens from 2 documents.
#> d1 :
#> [1] "US.Countries" "US.oceans"    "US.oceans"
#>
#> d2 :
#> [1] "Europe.Countries" "Europe.Countries" "Europe.oceans"    "Europe.oceans"
#>
tokens_lookup(toks, dict, levels = 3)
#> tokens from 2 documents.
#> d1 :
#> [1] "" "" ""
#>
#> d2 :
#> [1] ""     ""     "west" "east"
#>
tokens_lookup(toks, dict, levels = c(1, 3))
#> tokens from 2 documents.
#> d1 :
#> [1] "US" "US" "US"
#>
#> d2 :
#> [1] "Europe"      "Europe"      "Europe.west" "Europe.east"
#>
tokens_lookup(toks, dict, levels = c(2, 3))
#> tokens from 2 documents.
#> d1 :
#> [1] "Countries" "oceans"    "oceans"
#>
#> d2 :
#> [1] "Countries"   "Countries"   "oceans.west" "oceans.east"
#>

# show unmatched tokens
tokens_lookup(toks, dict, nomatch = "_UNMATCHED")
#> tokens from 2 documents.
#> d1 :
#>  [1] "_UNMATCHED"   "_UNMATCHED"   "US.Countries" "_UNMATCHED"   "_UNMATCHED"
#>  [6] "US.oceans"    "_UNMATCHED"   "_UNMATCHED"   "_UNMATCHED"   "US.oceans"
#> [11] "_UNMATCHED"   "_UNMATCHED"
#>
#> d2 :
#>  [1] "Europe.Countries"   "_UNMATCHED"         "Europe.Countries"
#>  [4] "_UNMATCHED"         "_UNMATCHED"         "Europe.oceans.west"
#>  [7] "_UNMATCHED"         "_UNMATCHED"         "Europe.oceans.east"
#> [10] "_UNMATCHED"
#>
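The examples above use `"glob"` and `"fixed"` matching; the sketch below, with illustrative text and patterns not taken from the original examples, shows `valuetype = "regex"` together with `case_insensitive`:

```r
library("quanteda")

toks_regex <- tokens(c(d1 = "Liberty and FREEDOM are not the same as license."))
dict_regex <- dictionary(list(freedom = c("^free", "^libert")))

# each value is treated as a regular expression matched against the tokens;
# with case_insensitive = TRUE (the default) "Liberty" and "FREEDOM" both match
tokens_lookup(toks_regex, dict_regex, valuetype = "regex")

# with case_insensitive = FALSE the capitalized forms no longer match,
# so nothing is converted
tokens_lookup(toks_regex, dict_regex, valuetype = "regex", case_insensitive = FALSE)
```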