Convert a quanteda dfm object to a format usable by other text
analysis packages. The general function `convert` provides easy
conversion from a dfm to the document-term representations used in all other
text analysis packages for which conversions are defined. See also
`convert-wrappers` for convenience functions for specific package converters.
convert(x, to = c("lda", "tm", "stm", "austin", "topicmodels", "lsa", "matrix", "data.frame"), docvars = NULL, ...)
x | dfm to be converted |
---|---|
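to | target conversion format, consisting of the name of the package into whose document-term matrix representation the dfm will be converted: one of "lda", "tm", "stm", "austin", "topicmodels", or "lsa". The remaining values shown in the usage above, "matrix" and "data.frame", convert to the corresponding R object rather than to a package-specific format.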
|
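docvars | optional data.frame of document variables used as the `meta` information in conversion to the stm package format. This aids in selecting only the document variables that correspond to documents with non-zero counts.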
|
... | unused |
A converted object determined by the value of `to` (see above).
See the documentation of the conversion target package for more detailed
descriptions of the return formats.
There also exist a variety of converter shortcut commands, designed to mimic the idioms of the packages into whose format they convert. See convert-wrappers for details.
mycorpus <- corpus_subset(data_corpus_inaugural, Year > 1970) quantdfm <- dfm(mycorpus, verbose = FALSE) # austin's wfm format identical(dim(quantdfm), dim(convert(quantdfm, to = "austin")))#> [1] TRUE# stm package format stmdfm <- convert(quantdfm, to = "stm") str(stmdfm)#> List of 3 #> $ documents:List of 12 #> ..$ 1973-Nixon : int [1:2, 1:515] 1 34 2 96 3 1 4 5 6 3 ... #> ..$ 1977-Carter : int [1:2, 1:501] 1 18 2 65 3 7 4 4 7 52 ... #> ..$ 1981-Reagan : int [1:2, 1:850] 1 19 2 174 3 7 4 3 6 5 ... #> ..$ 1985-Reagan : int [1:2, 1:876] 1 24 2 177 3 13 4 7 6 3 ... #> ..$ 1989-Bush : int [1:2, 1:756] 1 15 2 166 3 14 4 16 6 5 ... #> ..$ 1993-Clinton: int [1:2, 1:605] 2 139 3 6 4 5 7 81 9 4 ... #> ..$ 1997-Clinton: int [1:2, 1:726] 1 26 2 131 3 13 4 7 6 3 ... #> ..$ 2001-Bush : int [1:2, 1:592] 1 4 2 110 3 4 4 7 6 1 ... #> ..$ 2005-Bush : int [1:2, 1:735] 1 2 2 120 3 3 4 8 6 2 ... #> ..$ 2009-Obama : int [1:2, 1:900] 1 44 2 130 3 22 4 4 5 1 ... #> ..$ 2013-Obama : int [1:2, 1:786] 1 13 2 99 3 14 4 5 7 89 ... #> ..$ 2017-Trump : int [1:2, 1:547] 1 11 2 96 3 9 4 8 7 88 ... #> $ vocab : chr [1:3462] "-" "," ";" ":" ... #> $ meta :'data.frame': 12 obs. of 3 variables: #> ..$ Year : num [1:12] 1973 1977 1981 1985 1989 ... #> ..$ President: chr [1:12] "Nixon" "Carter" "Reagan" "Reagan" ... #> ..$ FirstName: chr [1:12] "Richard Milhous" "Jimmy" "Ronald" "Ronald" ...# illustrate what happens with zero-length documents quantdfm2 <- dfm(c(punctOnly = "!!!", mycorpus[-1]), verbose = FALSE) rowSums(quantdfm2)#> Error in rowSums(quantdfm2): 'x' must be an array of at least two dimensions#> List of 3 #> $ documents:List of 12 #> ..$ punctOnly : int [1:2, 1] 5 3 #> ..$ 1977-Carter : int [1:2, 1:501] 1 18 2 65 3 7 4 4 7 52 ... #> ..$ 1981-Reagan : int [1:2, 1:850] 1 19 2 174 3 7 4 3 6 5 ... #> ..$ 1985-Reagan : int [1:2, 1:876] 1 24 2 177 3 13 4 7 6 3 ... #> ..$ 1989-Bush : int [1:2, 1:756] 1 15 2 166 3 14 4 16 6 5 ... #> ..$ 1993-Clinton: int [1:2, 1:605] 2 139 3 6 4 5 7 81 9 4 ... 
#> ..$ 1997-Clinton: int [1:2, 1:726] 1 26 2 131 3 13 4 7 6 3 ... #> ..$ 2001-Bush : int [1:2, 1:592] 1 4 2 110 3 4 4 7 6 1 ... #> ..$ 2005-Bush : int [1:2, 1:735] 1 2 2 120 3 3 4 8 6 2 ... #> ..$ 2009-Obama : int [1:2, 1:900] 1 44 2 130 3 22 4 4 5 1 ... #> ..$ 2013-Obama : int [1:2, 1:786] 1 13 2 99 3 14 4 5 7 89 ... #> ..$ 2017-Trump : int [1:2, 1:547] 1 11 2 96 3 9 4 8 7 88 ... #> $ vocab : chr [1:3376] "-" "," ";" ":" ... #> $ meta :'data.frame': 12 obs. of 3 variables: #> ..$ Year : num [1:12] 1973 1977 1981 1985 1989 ... #> ..$ President: chr [1:12] "Nixon" "Carter" "Reagan" "Reagan" ... #> ..$ FirstName: chr [1:12] "Richard Milhous" "Jimmy" "Ronald" "Ronald" ...not_run({ #' # tm's DocumentTermMatrix format tmdfm <- convert(quantdfm, to = "tm") str(tmdfm) # topicmodels package format str(convert(quantdfm, to = "topicmodels")) # lda package format ldadfm <- convert(quantdfm, to = "lda") str(ldadfm) })