This function selects or discards features from a variety of objects,
such as tokenized texts, a dfm, or a list of collocations. The most common
usage for removeFeatures
will be to eliminate stop words from a text
or text-based object, or to select only features from a list of regular
expressions.
selectFeatures(x, features, ...)

# S3 method for dfm
selectFeatures(x, features, selection = c("keep", "remove"),
  valuetype = c("glob", "regex", "fixed"), case_insensitive = TRUE,
  verbose = FALSE, ...)

# S3 method for tokenizedTexts
selectFeatures(x, features, selection = c("keep", "remove"),
  valuetype = c("glob", "regex", "fixed"), case_insensitive = TRUE,
  padding = FALSE, indexing = FALSE, verbose = FALSE, ...)

# S3 method for tokens
selectFeatures(x, features, selection = c("keep", "remove"),
  valuetype = c("glob", "regex", "fixed"), case_insensitive = TRUE,
  padding = FALSE, ...)

# S3 method for collocations
selectFeatures(x, features, selection = c("keep", "remove"),
  valuetype = c("glob", "regex", "fixed"), case_insensitive = TRUE,
  verbose = TRUE, pos = 1:3, ...)

x | object whose features will be selected |
---|---|
features | one of: a character vector of features to be selected, a dfm whose features will be used for selection, or a dictionary class object whose values (not keys) will provide the features to be selected. For dfm objects, see details in the Value section below. |
... | supplementary arguments passed to the underlying functions in stri_detect_regex (from the stringi package) |
selection | whether to keep or remove the features |
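valuetype | the type of pattern matching: "glob" for "glob"-style wildcard expressions; "regex" for regular expressions; or "fixed" for exact matching |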
case_insensitive | ignore the case of dictionary values if TRUE |
verbose | if TRUE, print messages about how many features were removed |
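padding | (only for tokenizedTexts and tokens objects) if TRUE, leave an empty string where the removed tokens previously existed, so that the positions of the remaining tokens are preserved |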
indexing | use dfm-based index to efficiently process large tokenizedTexts object |
pos | indexes of word position if called on collocations: remove if word pos is a stopword |
A dfm after the feature selection has been applied.
When features
is a dfm object, then the returned object
will be identical in its feature set to the dfm supplied as the
features
argument. This means that any features in x
not in
features
will be discarded, and that any features found in the
dfm supplied as features
but not found in x
will be added
with all zero counts. This is useful when you have trained a model on one dfm, and
need to project this onto a test set whose features must be identical.
This function selects features based on their labels. To select
features based on the values of the document-feature matrix, use
trim
.
# Examples for selectFeatures().
#
# The three not_run({ ... }) blocks are kept as they appear on the help page.
# Each statement is on its own line so that the inline `#` comments no longer
# swallow the code that follows them.

not_run({
  data(SOTUCorpus, package = "quantedaData")
  toks <- tokenize(SOTUCorpus, remove_punct = TRUE)
  # toks <- tokenize(tokenize(SOTUCorpus, what='sentence', simplify = TRUE), remove_punct = TRUE)

  # head to head, old v. new
  system.time(
    selectFeaturesOLD(toks, stopwords("english"), "remove", verbose = FALSE)
  )
  system.time(
    selectFeatures(toks, stopwords("english"), "remove", verbose = FALSE)
  )
  system.time(
    selectFeaturesOLD(toks, c("and", "of"), "remove", verbose = FALSE,
                      valuetype = "regex")
  )
  system.time(
    selectFeatures(toks, c("and", "of"), "remove", verbose = FALSE,
                   valuetype = "regex")
  )

  microbenchmark::microbenchmark(
    old = selectFeaturesOLD(toks, stopwords("english"), "remove",
                            verbose = FALSE),
    new = selectFeatures(toks, stopwords("english"), "remove",
                         verbose = FALSE),
    times = 5, unit = "relative"
  )

  # In the two benchmarks below, "old" labels selectFeaturesOLD() and "new"
  # labels selectFeatures(), matching the benchmark above.
  microbenchmark::microbenchmark(
    old = selectFeaturesOLD(toks, c("and", "of"), "remove", verbose = FALSE,
                            valuetype = "regex"),
    new = selectFeatures(toks, c("and", "of"), "remove", verbose = FALSE,
                         valuetype = "regex"),
    times = 2, unit = "relative"
  )

  types <- unique(unlist(toks))
  numbers <- types[stringi::stri_detect_regex(types, '[0-9]')]
  microbenchmark::microbenchmark(
    old = selectFeaturesOLD(toks, numbers, "remove", verbose = FALSE,
                            valuetype = "fixed"),
    new = selectFeatures(toks, numbers, "remove", verbose = FALSE,
                         valuetype = "fixed"),
    times = 2, unit = "relative"
  )

  # removing tokens before dfm, versus after
  microbenchmark::microbenchmark(
    pre = dfm(selectFeaturesOLD(toks, stopwords("english"), "remove"),
              verbose = FALSE),
    post = dfm(toks, remove = stopwords("english"), verbose = FALSE),
    times = 5, unit = "relative"
  )

  ## with simple examples
  toks <- tokenize(c("This is a sentence.", "This is a second sentence."),
                   remove_punct = TRUE)
  selectFeatures(toks, c("is", "a", "this"), selection = "remove",
                 valuetype = "fixed", padding = TRUE, case_insensitive = TRUE)

  # how case_insensitive works
  selectFeatures(toks, c("is", "a", "this"), selection = "remove",
                 valuetype = "fixed", padding = TRUE, case_insensitive = FALSE)
  selectFeatures(toks, c("is", "a", "this"), selection = "remove",
                 valuetype = "fixed", padding = TRUE, case_insensitive = TRUE)
  selectFeatures(toks, c("is", "a", "this"), selection = "remove",
                 valuetype = "glob", padding = TRUE, case_insensitive = TRUE)
  selectFeatures(toks, c("is", "a", "this"), selection = "remove",
                 valuetype = "glob", padding = TRUE, case_insensitive = FALSE)

  # with longer texts
  toks <- tokenize(data_corpus_inaugural[1:2])
  selectFeatures(toks, stopwords("english"), "remove")
  selectFeatures(toks, stopwords("english"), "keep")
  selectFeatures(toks, stopwords("english"), "remove", padding = TRUE)
  selectFeatures(toks, stopwords("english"), "keep", padding = TRUE)
  selectFeatures(tokenize(data_corpus_inaugural[2]), stopwords("english"),
                 "remove", padding = TRUE)
})

not_run({
  toksh <- tokens(c(doc1 = "This is a SAMPLE text",
                    doc2 = "this sample text is better"))
  feats <- c("this", "sample", "is")

  # keeping features
  selectFeatures(toksh, feats, selection = "keep")
  selectFeatures(toksh, feats, selection = "keep", padding = TRUE)
  selectFeatures(toksh, feats, selection = "keep", case_insensitive = FALSE)
  selectFeatures(toksh, feats, selection = "keep", padding = TRUE,
                 case_insensitive = FALSE)

  # removing features
  selectFeatures(toksh, feats, selection = "remove")
  selectFeatures(toksh, feats, selection = "remove", padding = TRUE)
  selectFeatures(toksh, feats, selection = "remove", case_insensitive = FALSE)
  selectFeatures(toksh, feats, selection = "remove", padding = TRUE,
                 case_insensitive = FALSE)
})

not_run({
  ## example for collocations
  (myCollocs <- collocations(data_corpus_inaugural[1:3], n = 20))
  selectFeatures(myCollocs, stopwords("english"), "remove")
})