# Benchmark single-cell lookup on a fully populated K x V matrix held four
# ways: column-compressed sparse, triplet sparse, direct access to the @x slot,
# and a base dense matrix.
library(Matrix)

set.seed(94253)
K <- 100
V <- 100000

# sparse = TRUE is needed here: by default Matrix() only returns a sparse
# object when more than half of the entries are zero, and runif() yields no
# zeros, so without it "sparseCmat" would silently be a dense dgeMatrix.
sparseCmat <- Matrix(runif(K * V), nrow = K, ncol = V, sparse = TRUE)
densemat <- as.matrix(sparseCmat)
sparseTmat <- as(sparseCmat, "TsparseMatrix")

microbenchmark::microbenchmark(
  sparseC = sparseCmat[3, 2],
  sparseT = sparseTmat[3, 2],
  # Every cell is stored, so @x is laid out column by column and cell (3, 2)
  # sits at offset (2 - 1) * K + 3.
  brandon = sparseCmat@x[(2 - 1) * K + 3],
  densem = densemat[3, 2],
  unit = "relative",
  times = 200
)
## Unit: relative
## expr min lq mean median uq
## sparseC 3.207522e+04 1.881232e+04 1.254118e+04 8.555601e+03 12407.01369
## sparseT 6.325018e+05 3.740093e+05 1.763886e+05 1.576997e+05 139595.75414
## brandon 6.830986e-01 6.594927e-01 1.273947e+00 1.330862e+00 1.32417
## densem 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.00000
## max neval
## 10227.194680 200
## 59350.785462 200
## 1.204846 200
## 1.000000 200
# Repeat the lookup benchmark on a document-feature matrix built from
# quanteda's inaugCorpus, comparing name-based and position-based indexing.
# library() (not require()) so that a missing package stops the script here
# instead of failing later at dfm().
library(quanteda, quietly = TRUE, warn.conflicts = FALSE)
## quanteda version 0.9.8.5
myDfm <- as(dfm(inaugCorpus, verbose = FALSE), "CsparseMatrix")

# Plant a sentinel value so its position in the @x slot can be found afterwards.
myDfm[56, "month"] <- 99999
idx <- which(myDfm@x == 99999) # get index of this inserted value

myDfmTriplet <- as(myDfm, "TsparseMatrix")
myDfmDense <- as.matrix(myDfm)

# Resolve the column position once so the "num" cases skip the name lookup.
colidx <- which(colnames(myDfm) == "month")

microbenchmark::microbenchmark(
  sparseCname = myDfm[56, "month"],
  sparseTname = myDfmTriplet[56, "month"],
  densemname = myDfmDense[56, "month"],
  sparseCnum = myDfm[56, colidx],
  sparseTnum = myDfmTriplet[56, colidx],
  densemnum = myDfmDense[56, colidx],
  direct = myDfm@x[idx],
  unit = "relative",
  times = 500
)
## Unit: relative
## expr min lq mean median uq
## sparseCname 1720.265625 424.011088 346.309349 252.509689 222.780639
## sparseTname 6419.395833 1548.009504 970.562821 923.715502 813.870085
## densemname 44.916667 15.281943 17.306882 10.697005 20.625891
## sparseCnum 1586.067708 386.073390 243.627398 228.106283 198.335709
## sparseTnum 6036.046875 1502.758184 957.727768 880.393130 768.400552
## densemnum 3.635417 1.661035 1.843584 1.753376 1.939526
## direct 1.000000 1.000000 1.000000 1.000000 1.000000
## max neval
## 593.473943 500
## 256.025788 500
## 29.755604 500
## 98.735124 500
## 725.077580 500
## 1.078979 500
## 1.000000 500
If your data is dense, then indexing using the sparse matrix method for "[" is much less efficient than it is for dense matrix objects. But for sparse objects, the differences are negligible.
Yes, the direct way of indexing the cell is always going to be faster, but its location has to be computed in some way before the position in the x slot can be known. In the Matrix source this is line 309 of sparseMatrix.R, and it first detects the type of sparse Matrix, then coerces the sparse column matrix to a triplet sparse matrix, then invokes a lower-level index method. But I must have something slightly wrong here, since the "[" method for the sparse triplet format is actually slower!