Work with the SOTUCorpus of 230 US presidential State of the Union addresses.

require(quanteda)
## Loading required package: quanteda
## quanteda version 0.9.8.8
## 
## Attaching package: 'quanteda'
## The following object is masked from 'package:base':
## 
##     sample
data(SOTUCorpus, package = "quantedaData")
# tokenize to the standard format: a list of character vectors, one per document
toks <- tokenize(SOTUCorpus)
# flatten this into one long character vector
toksUL <- unlist(toks, use.names = FALSE)
ndoc(SOTUCorpus)
## [1] 230

Purpose

To compare the performance of different methods of “hashing” the tokens, currently stored as a list of character vectors, into an integer representation.

Format

Although the details differ slightly depending on the object type, each function returns, more or less, the same three pieces: an integer (or factor) representation of the tokens, the vector of unique types (the vocabulary), and a document index recording where each document's tokens start.
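
As a quick illustration, here is the target structure on a tiny made-up two-document token list (all object names below are just for the example):

toy <- list(d1 = c("a", "b", "a"), d2 = c("b", "c"))
# the vocabulary of unique types: "a" "b" "c"
toy_types <- unique(unlist(toy, use.names = FALSE))
# the tokens as integer codes into that vocabulary: 1 2 1 2 3
toy_tokens <- fastmatch::fmatch(unlist(toy, use.names = FALSE), toy_types)
# where each document starts in the flattened vector
toy_docindex <- c(d1 = 1, d2 = 4)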

Functions to compare

# quick token hashing to convert characters into integers,
# returned as a list of integer vectors, one per document
hashtokens_list <- function(x) {
    # doclengths <- cumsum(lengths(x)) + 1
    # docindex <- c(1, doclengths[-length(doclengths)])
    # names(docindex) <- names(x)
    vocabulary <- unique(unlist(x, use.names = FALSE))
    tokens <- lapply(x, fastmatch::fmatch, vocabulary)
    list(tokens = tokens, types = vocabulary, docindex = NULL)
}
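
Because the integer codes index into the vocabulary, the original character tokens of any document should be recoverable by subsetting; a quick sanity check (output not shown, variable names are only illustrative):

hashed <- hashtokens_list(toks)
# TRUE if document 1 round-trips through the hashed representation
all(hashed$types[hashed$tokens[[1]]] == toks[[1]])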

# quick token hashing to convert characters into integers,
# returned as a single integer vector, plus a document index
hashtokens_vector <- function(x) {
    tokens <- unlist(x, use.names = FALSE)
    doclengths <- cumsum(lengths(x)) + 1
    docindex <- c(1, doclengths[-length(doclengths)])
    names(docindex) <- names(x)
    vocabulary <- unique(tokens)
    tokensi <- fastmatch::fmatch(tokens, vocabulary)
    list(tokens = tokensi, types = vocabulary, docindex = docindex)
}
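
Here the docindex records the starting position of each document in the flattened vector, so a document's codes run from its start to just before the next document's start (or to the end of the vector for the last document). A sketch, where get_doc() is only an illustrative helper:

hv <- hashtokens_vector(toks)
# append a sentinel "one past the end" position, so that document i
# occupies positions starts[i] to starts[i + 1] - 1
get_doc <- function(hashed, i) {
    starts <- c(hashed$docindex, length(hashed$tokens) + 1)
    hashed$tokens[starts[i]:(starts[i + 1] - 1)]
}
# TRUE if document 2 round-trips through the flat representation
all(hv$types[get_doc(hv, 2)] == toks[[2]])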

# convert the tokens into a data.table with a docname column and a token column
makedt <- function(x) {
    docname <- rep(names(x), lengths(x))
    data.table::data.table(docname = docname,
                           token = unlist(x, use.names = FALSE))
}
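
With the document name carried in the table, per-document access is a simple subset on the docname column (output not shown):

tokdt <- makedt(toks)
# the tokens of the first document, selected by its document name
head(tokdt[docname == names(toks)[1], token])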

# convert the tokens into a data.table with a single column of tokens,
# plus a separate docindex
makedt_docindex <- function(x) {
    doclengths <- cumsum(lengths(x)) + 1
    docindex <- c(1, doclengths[-length(doclengths)])
    names(docindex) <- names(x)
    list(tokens = data.table::data.table(token = unlist(x, use.names = FALSE)), 
         docindex = docindex)
}
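
Without a docname column, per-document access goes through the separate docindex, slicing a range of rows out of the single-column table; a rough sketch for the first document (the last document would end at nrow() instead):

dt2 <- makedt_docindex(toks)
from <- dt2$docindex[1]
to <- dt2$docindex[2] - 1
head(dt2$tokens[from:to, token])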

# use factor() to index the types; vectorized 
factortoks <- function(x) {
    tokens <- unlist(x, use.names = FALSE)
    types <- unique(tokens)
    doclengths <- cumsum(lengths(x)) + 1
    docindex <- c(1, doclengths[-length(doclengths)])
    names(docindex) <- names(x)
    list(tokens = factor(tokens, levels = types),
         types = NULL,
         docindex = docindex) 
}
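
A factor bundles the integer codes and the types in one object, and because the levels are set to unique(tokens), the codes coincide with the fmatch()-based versions; both pieces can be pulled back out:

ft <- factortoks(toks)
# the underlying integer codes and the vocabulary, respectively
head(as.integer(ft$tokens))
head(levels(ft$tokens))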

Speed comparisons

microbenchmark::microbenchmark(factor = factortoks(toks),
                               hash_list = hashtokens_list(toks),
                               hash_vec = hashtokens_vector(toks),
                               dt = makedt(toks),
                               dt2 = makedt_docindex(toks),
                               times = 5, unit = "relative")
## Unit: relative
##       expr       min       lq     mean   median       uq      max neval
##     factor 15.114048 6.117156 4.095976 5.814442 2.693821 2.907854     5
##  hash_list  9.152124 3.693185 2.182315 3.481243 1.341597 1.246665     5
##   hash_vec  8.523220 3.607275 2.502042 3.589716 1.446334 2.071463     5
##         dt  2.914193 1.468429 1.329771 1.536697 1.063894 1.278209     5
##        dt2  1.000000 1.000000 1.000000 1.000000 1.000000 1.000000     5

Object size comparisons

format(object.size(factortoks(toks)), units = "Mb")
## [1] "9.4 Mb"
format(object.size(hashtokens_list(toks)), units = "Mb")
## [1] "9.4 Mb"
format(object.size(hashtokens_vector(toks)), units = "Mb")
## [1] "9.4 Mb"
format(object.size(makedt(toks)), units = "Mb")
## [1] "31.5 Mb"
format(object.size(makedt_docindex(toks)), units = "Mb")
## [1] "16.6 Mb"
format(object.size(toks), units = "Mb")
## [1] "35.7 Mb"

Conclusions

The data.table methods are the fastest, but the version carrying a docname column is almost as large as the original list-of-character-vectors format (31.5 Mb vs 35.7 Mb). The version with the separate docindex is smaller, but at 16.6 Mb it is still roughly 1.8 times the size of the hashed-token objects (9.4 Mb).

For the hashed methods, the list approach is actually slightly faster than the fully vectorized method, and the resulting objects are the same size. Since the list approach also avoids the bookkeeping needed to keep a separate docindex consistent, it seems the better way to go.

Going forward, it remains to be tested which representation is the most efficient when performing actual operations on the tokens.
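
One candidate operation, sketched here for the hashed-list representation, is counting type frequencies for a single document with tabulate() (variable names are only illustrative):

hashed <- hashtokens_list(toks)
# frequency of each type in document 1, indexed by position in the vocabulary
freqs <- tabulate(hashed$tokens[[1]], nbins = length(hashed$types))
# the most frequent types in document 1
head(sort(setNames(freqs, hashed$types), decreasing = TRUE))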