Tidy a Corpus object from the tm package. Returns a data frame
with one row per document, including a text column containing
the document's text, and one column for each local (per-document)
metadata tag. For corpus objects from the quanteda package,
see tidy.corpus.
# S3 method for Corpus
tidy(x, collapse = "\n", ...)
| Argument | Description |
|---|---|
| x | A Corpus object, such as a VCorpus or PCorpus |
| collapse | A string that should be used to collapse the lines of text within each document (if a document has multiple lines). Give NULL to not collapse strings, in which case the text column will end up as a list column if there are multi-line documents. |
| ... | Extra arguments, not used |
library(dplyr) # displaying tbl_dfs

if (requireNamespace("tm", quietly = TRUE)) {
  library(tm)

  # tm package examples
  txt <- system.file("texts", "txt", package = "tm")
  ovid <- VCorpus(DirSource(txt, encoding = "UTF-8"),
                  readerControl = list(language = "lat"))

  ovid
  tidy(ovid)

  # choose different options for collapsing text within each
  # document
  tidy(ovid, collapse = "")$text
  tidy(ovid, collapse = NULL)$text

  # another example from Reuters articles
  reut21578 <- system.file("texts", "crude", package = "tm")
  reuters <- VCorpus(DirSource(reut21578),
                     readerControl = list(reader = readReut21578XMLasPlain))
  reuters

  tidy(reuters)
}
#> # A tibble: 20 x 17
#>    author                     datetimestamp       description
#>    <chr>                      <dttm>              <chr>
#>  1 <NA>                       1987-02-26 10:00:56
#>  2 BY TED D'AFFLISIO, Reuters 1987-02-26 10:34:11
#>  3 <NA>                       1987-02-26 11:18:00
#>  4 <NA>                       1987-02-26 11:21:01
#>  5 <NA>                       1987-02-26 12:00:57
#>  6 <NA>                       1987-02-28 20:25:46
#>  7 By Jeremy Clift, Reuters   1987-02-28 20:39:14
#>  8 <NA>                       1987-02-28 22:27:27
#>  9 <NA>                       1987-03-01 01:22:30
#> 10 <NA>                       1987-03-01 11:31:44
#> 11 <NA>                       1987-03-01 18:05:49
#> 12 <NA>                       1987-03-02 00:39:23
#> 13 <NA>                       1987-03-02 00:43:22
#> 14 <NA>                       1987-03-02 00:43:41
#> 15 <NA>                       1987-03-02 01:25:42
#> 16 <NA>                       1987-03-02 04:20:05
#> 17 <NA>                       1987-03-02 04:28:26
#> 18 <NA>                       1987-03-02 05:13:46
#> 19 By BERNICE NAPACH, Reuters 1987-03-02 07:38:34
#> 20 <NA>                       1987-03-02 07:49:06
#> # ... with 14 more variables: heading <chr>, id <chr>, language <chr>,
#> #   origin <chr>, topics <chr>, lewissplit <chr>, cgisplit <chr>, oldid <chr>,
#> #   topics_cat <list>, places <list>, people <chr>, orgs <chr>,
#> #   exchanges <chr>, text <chr>