This document tests the performance of dfm construction from hashed tokens() objects versus the classic tokenize() method.
# Attach quanteda with library() rather than require(): a missing package then
# stops the script here instead of only returning FALSE and failing later at
# the first quanteda call.
library(quanteda, quietly = TRUE, warn.conflicts = FALSE)
## quanteda version 0.9.8.9017
# Load the SOTUCorpus dataset from the quantedaData package.
data(SOTUCorpus, package = "quantedaData")
# Tokenize the corpus once with each method so the benchmarks can reuse both
# results: `toks` is the input labelled "classic" and `toksh` the input
# labelled "hashed" in the timings.
toks <- tokenize(SOTUCorpus)
toksh <- tokens(SOTUCorpus)
When already tokenized:
# Time dfm() on inputs that are already tokenized: "hashed" uses `toksh`
# (built with tokens()), "classic" uses `toks` (built with tokenize()).
# Each expression is evaluated 20 times and timings are reported in
# relative units.
microbenchmark::microbenchmark(hashed = dfm(toksh, verbose = FALSE),
classic = dfm(toks, verbose = FALSE),
times = 20, unit = "relative")
## Unit: relative
## expr min lq mean median uq max neval
## hashed 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 20
## classic 4.470103 4.383913 3.709468 4.335245 3.107245 3.122707 20
Combining tokenization with dfm construction (as happens when dfm() is called on a character or corpus):
# Time tokenization and dfm() together, starting from the raw corpus:
# "hashed" tokenizes with tokens(), "classic" with tokenize().
# Each expression is evaluated 20 times and timings are reported in
# relative units.
microbenchmark::microbenchmark(hashed = dfm(tokens(SOTUCorpus), verbose = FALSE),
classic = dfm(tokenize(SOTUCorpus), verbose = FALSE),
times = 20, unit = "relative")
## Unit: relative
## expr min lq mean median uq max neval
## hashed 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 20
## classic 1.239813 1.217851 1.217632 1.229421 1.217117 1.146923 20
Constructing the sparseMatrix from i, j, x v. from i, p, x
There is not much difference, but ipx() could be taking longer because of the transpose operation.
#' Build a dfmSparse from tokenized texts using (i, j, x) triplets.
#'
#' @param x Tokenized texts: a list-like object with one vector of tokens per
#'   document. `names(x)` is used for the document names.
#' @return A "dfmSparse" object with documents in rows and features in
#'   columns; features are ordered by first appearance in `x`.
ijx <- function(x) {
  # row index: each document's position, repeated once per token it contains
  nTokens <- lengths(x)
  i <- rep(seq_along(nTokens), nTokens)
  # column index: each token's position among the unique features
  allFeatures <- unlist(x)
  uniqueFeatures <- unique(allFeatures)
  j <- match(allFeatures, uniqueFeatures)
  # Give the dimensions explicitly. When they are inferred from max(i),
  # trailing documents without any tokens never appear in `i`, so the matrix
  # would have fewer rows than there are document names.
  new("dfmSparse", Matrix::sparseMatrix(i = i, j = j, x = 1L,
                                        dims = c(length(x),
                                                 length(uniqueFeatures)),
                                        dimnames = list(docs = names(x),
                                                        features = uniqueFeatures)))
}
#' Build a dfmSparse from tokenized texts using the (i, p, x) pointer form.
#'
#' @param x Tokenized texts accepted by `ntoken()`, with one vector of tokens
#'   per document. `names(x)` is used for the document names.
#' @return A "dfmSparse" object with documents in rows and features in
#'   columns, obtained by transposing a features-by-documents sparse matrix.
ipx <- function(x) {
  # features: every token mapped to its position among the unique features
  tokenVec <- unlist(x)
  vocab <- unique(tokenVec)
  featureIndex <- match(tokenVec, vocab)
  # documents: zero-based pointers, i.e. running token totals with a leading 0
  docPointers <- cumsum(c(0, ntoken(x)))
  # assemble as features x documents first ...
  featuresByDocs <- Matrix::sparseMatrix(
    i = featureIndex, p = docPointers, x = 1L,
    dimnames = list(features = vocab, docs = names(x))
  )
  # ... then transpose so that documents end up in the rows
  new("dfmSparse", t(featuresByDocs))
}
# Compare the two sparse-matrix constructions on the same tokenized input.
# The expressions are left unnamed, so the output rows are labelled with the
# calls themselves. Each is evaluated 50 times; timings are in relative units.
microbenchmark::microbenchmark(ijx(toks), ipx(toks),
times = 50, unit = "relative")
## Unit: relative
## expr min lq mean median uq max neval
## ijx(toks) 1.000000 1.000000 1.00000 1.000000 1.00000 1.000000 50
## ipx(toks) 1.025118 1.024159 1.04169 1.007245 1.08364 1.604885 50