Construct a sparse document-feature matrix from a character, corpus, tokens, or even another dfm object.
dfm(x, tolower = TRUE, stem = FALSE, select = NULL, remove = NULL, dictionary = NULL, thesaurus = NULL, valuetype = c("glob", "regex", "fixed"), groups = NULL, verbose = quanteda_options("verbose"), ...)
x | a character, corpus, tokens, or dfm object |
---|---|
tolower | convert all features to lowercase |
stem | if TRUE, stem words |
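select | a pattern of user-supplied features to keep, while
excluding all others. This can be used in lieu of a dictionary if there
are only specific features that a user wishes to keep. To extract only
Twitter usernames, for example, set select = "@*" and make sure that remove_twitter = FALSE is passed as an additional argument to tokens. The pattern matching type is set by valuetype. |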
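remove | a pattern of user-supplied features to ignore, such as
"stop words". To access one possible list (from any list you wish), use
stopwords(). The pattern matching type is set by valuetype. For the behaviour of remove when constructing ngrams, see the note on ngrams below. |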
dictionary | a dictionary object to apply to the tokens when creating the dfm |
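thesaurus | a dictionary object that will be applied as if
exclusive = FALSE, so that features not matching any thesaurus entry are kept rather than dropped. See also tokens_lookup. |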
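valuetype | the type of pattern matching: "glob" for "glob"-style wildcard expressions; "regex" for regular expressions; or "fixed" for exact matching. |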
groups | either: a character vector containing the names of document variables to be used for grouping; or a factor or object that can be coerced into a factor equal in length or rows to the number of documents. See groups for details. |
verbose | display messages if TRUE |
... | additional arguments passed to tokens; not used when x is a dfm |
a dfm-class object
The default behavior for remove/select when constructing ngrams using
dfm(x, ngrams > 1) is to remove/select any ngram constructed from a
matching feature. If you wish to remove these features before constructing
ngrams, you will need to first tokenize the texts, then remove the features
to be ignored, then construct the ngrams from the remaining tokens, and
finally construct the dfm using this modified tokens object. See the code
examples for an illustration.
When x is a dfm, groups provides a convenient and fast method of combining
and refactoring the documents of the dfm according to the groups.
## for a corpus corpus_post80inaug <- corpus_subset(data_corpus_inaugural, Year > 1980) dfm(corpus_post80inaug)#> Document-feature matrix of: 10 documents, 3,260 features (77.4% sparse).dfm(corpus_post80inaug, tolower = FALSE)#> Document-feature matrix of: 10 documents, 3,478 features (77.7% sparse).# grouping documents by docvars in a corpus dfm(corpus_post80inaug, groups = "President", verbose = TRUE)#>#>#>#>#>#> #>#> Document-feature matrix of: 5 documents, 3,260 features (64.2% sparse).# with English stopwords and stemming dfm(corpus_post80inaug, remove = stopwords("english"), stem = TRUE, verbose = TRUE)#>#>#>#>#>#>#>#>#>#> #>#> Document-feature matrix of: 10 documents, 2,303 features (75.1% sparse).# works for both words in ngrams too dfm("Banking industry", stem = TRUE, ngrams = 2, verbose = FALSE)#> Document-feature matrix of: 1 document, 1 feature (0% sparse). #> 1 x 1 sparse Matrix of class "dfmSparse" #> features #> docs bank_industri #> text1 1# with dictionaries corpus_post1900inaug <- corpus_subset(data_corpus_inaugural, Year > 1900) mydict <- dictionary(list(christmas = c("Christmas", "Santa", "holiday"), opposition = c("Opposition", "reject", "notincorpus"), taxing = "taxing", taxation = "taxation", taxregex = "tax*", country = "states")) dfm(corpus_post1900inaug, dictionary = mydict)#> Document-feature matrix of: 30 documents, 6 features (73.3% sparse).# removing stopwords testText <- "The quick brown fox named Seamus jumps over the lazy dog also named Seamus, with the newspaper from a boy named Seamus, in his mouth." 
testCorpus <- corpus(testText) # note: "also" is not in the default stopwords("english") featnames(dfm(testCorpus, select = stopwords("english")))#> [1] "the" "over" "with" "from" "a" "in" "his"# for ngrams featnames(dfm(testCorpus, ngrams = 2, select = stopwords("english"), remove_punct = TRUE))#> character(0)#> [1] "the" "over" "with" "from" "a" "in" "his"# removing stopwords before constructing ngrams tokensAll <- tokens(char_tolower(testText), remove_punct = TRUE) tokensNoStopwords <- removeFeatures(tokensAll, stopwords("english")) tokensNgramsNoStopwords <- tokens_ngrams(tokensNoStopwords, 2) featnames(dfm(tokensNgramsNoStopwords, verbose = FALSE))#> [1] "quick_brown" "brown_fox" "fox_named" "named_seamus" #> [5] "seamus_jumps" "jumps_lazy" "lazy_dog" "dog_also" #> [9] "also_named" "seamus_newspaper" "newspaper_boy" "boy_named" #> [13] "seamus_mouth"# keep only certain words dfm(testCorpus, select = "*s", verbose = FALSE) # keep only words ending in "s"#> Document-feature matrix of: 1 document, 3 features (0% sparse). #> 1 x 3 sparse Matrix of class "dfmSparse" #> features #> docs seamus jumps his #> text1 3 1 1dfm(testCorpus, select = "s$", valuetype = "regex", verbose = FALSE)#> Document-feature matrix of: 1 document, 3 features (0% sparse). #> 1 x 3 sparse Matrix of class "dfmSparse" #> features #> docs seamus jumps his #> text1 3 1 1# testing Twitter functions testTweets <- c("My homie @justinbieber #justinbieber shopping in #LA yesterday #beliebers", "2all the ha8ers including my bro #justinbieber #emabiggestfansjustinbieber", "Justin Bieber #justinbieber #belieber #fetusjustin #EMABiggestFansJustinBieber") dfm(testTweets, select = "#*", remove_twitter = FALSE) # keep only hashtags#> Document-feature matrix of: 3 documents, 6 features (50% sparse). 
#> 3 x 6 sparse Matrix of class "dfmSparse" #> features #> docs #justinbieber #la #beliebers #emabiggestfansjustinbieber #belieber #> text1 1 1 1 0 0 #> text2 1 0 0 1 0 #> text3 1 0 0 1 1 #> features #> docs #fetusjustin #> text1 0 #> text2 0 #> text3 1dfm(testTweets, select = "^#.*$", valuetype = "regex", remove_twitter = FALSE)#> Document-feature matrix of: 3 documents, 6 features (50% sparse). #> 3 x 6 sparse Matrix of class "dfmSparse" #> features #> docs #justinbieber #la #beliebers #emabiggestfansjustinbieber #belieber #> text1 1 1 1 0 0 #> text2 1 0 0 1 0 #> text3 1 0 0 1 1 #> features #> docs #fetusjustin #> text1 0 #> text2 0 #> text3 1# for a dfm dfm1 <- dfm(data_corpus_irishbudget2010) dfm2 <- dfm(dfm1, groups = ifelse(docvars(data_corpus_irishbudget2010, "party") %in% c("FF", "Green"), "Govt", "Opposition"), tolower = FALSE, verbose = TRUE)#>#>#>#> #>