# Attach quanteda. library() is used rather than require() so that a missing
# package stops the script here with a clear error, instead of returning FALSE
# and failing later at the first quanteda call.
library(quanteda)
# Recorded console output from the original run:
## Package version: 1.3.18
## Parallel computing: 2 of 8 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
##
## View
# Raise quanteda's thread setting to 8; the startup message recorded above
# reports 8 threads available, of which 2 are used by default.
quanteda_options(threads = 8)
# Record the R version, platform and package versions used for the timings.
sessionInfo()
## R version 3.4.4 (2018-03-15)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: KDE neon User Edition 5.14
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/openblas/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] quanteda_1.3.18
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.0 pillar_1.3.0 compiler_3.4.4
## [4] plyr_1.8.4 bindr_0.1.1 tools_3.4.4
## [7] stopwords_0.9.0 digest_0.6.18 lubridate_1.7.4
## [10] evaluate_0.12 tibble_1.4.2 gtable_0.2.0
## [13] lattice_0.20-35 pkgconfig_2.0.2 rlang_0.3.0.1
## [16] fastmatch_1.1-0 Matrix_1.2-12 yaml_2.2.0
## [19] xfun_0.4 bindrcpp_0.2.2 stringr_1.3.1
## [22] dplyr_0.7.8 knitr_1.21 grid_3.4.4
## [25] tidyselect_0.2.5 glue_1.3.0 data.table_1.11.8
## [28] R6_2.3.0 rmarkdown_1.11 spacyr_0.9.91
## [31] ggplot2_3.1.0 purrr_0.2.5 magrittr_1.5
## [34] scales_1.0.0 htmltools_0.3.6 assertthat_0.2.0
## [37] colorspace_1.3-2 stringi_1.2.4 lazyeval_0.2.1
## [40] RcppParallel_4.4.2 munsell_0.5.0 crayon_1.3.4
# Location of the serialised corpus. The original absolute path is kept as the
# default; set the GUARDIAN_CORPUS_RDS environment variable to run the
# benchmark against a copy stored elsewhere.
corp_path <- Sys.getenv(
  "GUARDIAN_CORPUS_RDS",
  unset = "/home/kohei/Documents/Brexit/Data/data_corpus_guardian.RDS"
)
# Fail early with the offending path rather than a generic connection error.
if (!file.exists(corp_path)) {
  stop("Corpus file not found: ", corp_path, call. = FALSE)
}
corp <- readRDS(corp_path)
# Time corpus_sample() drawing a sample of size 5000 from `corp`; the result
# is kept in `corp2`.
system.time({
  corp2 <- corpus_sample(corp, size = 5000)
})
## user system elapsed
## 0.01 0.00 0.01
# Time tokens() on the full corpus `corp` (not the sample `corp2`); the
# tokens object is kept in `toks` for the steps below.
system.time({
  toks <- tokens(corp)
})
## user system elapsed
## 214.611 6.907 198.440
# Time tokens_sample() drawing a sample of size 5000 from `toks`.
system.time({
  toks2 <- tokens_sample(toks, size = 5000)
})
## user system elapsed
## 0.880 0.019 0.882
# Time tokens_remove() with the English stopword list applied to `toks`.
# stopwords("en") is evaluated inside the timed expression, as in the
# original measurement.
system.time({
  toks3 <- tokens_remove(toks, pattern = stopwords("en"))
})
## user system elapsed
## 25.145 0.491 9.432
# Time dfm() building a document-feature matrix from the full tokens object
# `toks`; the matrix is kept in `mt`.
system.time({
  mt <- dfm(toks)
})
## user system elapsed
## 23.147 3.014 25.301
# Time dfm_group() on `mt`, grouping by "date".
# NOTE(review): "date" is presumably a document variable of the corpus;
# confirm against the data.
system.time({
  mt2 <- dfm_group(mt, groups = "date")
})
## user system elapsed
## 9.426 1.753 11.183
# Time dfm_remove() with the English stopword list applied to `mt`.
# stopwords("en") is evaluated inside the timed expression, as in the
# original measurement.
system.time({
  mt3 <- dfm_remove(mt, pattern = stopwords("en"))
})
## user system elapsed
## 2.360 0.223 2.586