Fit the Latent Semantic Analysis scaling model to a dfm, which may be weighted (for instance using dfm_tfidf).
textmodel_lsa(x, nd = 10, margin = c("both", "documents", "features"))
Argument | Description |
---|---|
x | the dfm on which the model will be fit |
nd | the number of dimensions to be included in output |
margin | margin to be smoothed by the SVD |
svds in the RSpectra package is applied to enable the fast computation of the SVD.
The number of dimensions nd retained in LSA is an empirical issue. While a reduction in the number of dimensions \(k\) (set here by nd) can remove much of the noise, keeping too few dimensions or factors may lose important information.
Rosario, Barbara. 2000. "Latent Semantic Indexing: An overview". Technical report INFOSYS 240 Spring Paper, University of California, Berkeley.
Deerwester, S., Dumais, S. T., Furnas, G. W., Landauer, T. K., & Harshman, R. 1990. "Indexing by latent semantic analysis". Journal of the American Society for Information Science 41(6), 391.
predict.textmodel_lsa, coef.textmodel_lsa
ie_dfm <- dfm(data_corpus_irishbudget2010) # create an LSA space and return its truncated representation in the low-rank space ie_lsa <- textmodel_lsa(ie_dfm[1:10, ])#> Warning: all singular values are requested, svd() is used insteadhead(ie_lsa$docs)#> [,1] [,2] [,3] [,4] #> 2010_BUDGET_01_Brian_Lenihan_FF -0.5132082 0.6611990 0.5010158 0.03718041 #> 2010_BUDGET_02_Richard_Bruton_FG -0.2774006 -0.3444475 0.1538104 0.84969109 #> 2010_BUDGET_03_Joan_Burton_LAB -0.3840362 -0.3455358 -0.1080534 -0.22254097 #> 2010_BUDGET_04_Arthur_Morgan_SF -0.4381501 -0.2675310 0.1958565 -0.42928912 #> 2010_BUDGET_05_Brian_Cowen_FF -0.3932116 0.3587097 -0.7698150 0.14403049 #> 2010_BUDGET_06_Enda_Kenny_FG -0.2611641 -0.1547760 -0.1003581 -0.12282063 #> [,5] [,6] [,7] #> 2010_BUDGET_01_Brian_Lenihan_FF -0.18932417 0.024642794 -0.04354314 #> 2010_BUDGET_02_Richard_Bruton_FG 0.13605925 -0.009346201 0.11169768 #> 2010_BUDGET_03_Joan_Burton_LAB -0.62996056 0.022839615 0.51620557 #> 2010_BUDGET_04_Arthur_Morgan_SF 0.65830177 -0.206942503 0.15992742 #> 2010_BUDGET_05_Brian_Cowen_FF 0.19068539 -0.097840896 0.08922500 #> 2010_BUDGET_06_Enda_Kenny_FG 0.05878167 0.813209501 -0.37318871 #> [,8] [,9] [,10] #> 2010_BUDGET_01_Brian_Lenihan_FF 0.03511621 -0.02558590 0.082457683 #> 2010_BUDGET_02_Richard_Bruton_FG 0.12502463 -0.10974219 0.004679789 #> 2010_BUDGET_03_Joan_Burton_LAB 0.04871506 -0.02433495 -0.071523773 #> 2010_BUDGET_04_Arthur_Morgan_SF 0.10149400 0.01181985 0.039985771 #> 2010_BUDGET_05_Brian_Cowen_FF -0.19256676 0.01576936 -0.110120661 #> 2010_BUDGET_06_Enda_Kenny_FG 0.08277396 -0.23320209 -0.131952742# matrix in low_rank LSA space ie_lsa$matrix_low_rank[,1:5]#> when i presented the supplementary #> 2010_BUDGET_01_Brian_Lenihan_FF 5 73 1.000000e+00 539 7.000000e+00 #> 2010_BUDGET_02_Richard_Bruton_FG 2 6 1.725749e-14 305 1.214406e-13 #> 2010_BUDGET_03_Joan_Burton_LAB 11 40 1.110657e-14 428 1.812092e-13 #> 2010_BUDGET_04_Arthur_Morgan_SF 21 26 -2.171103e-12 501 1.000000e+00 #> 
2010_BUDGET_05_Brian_Cowen_FF 4 17 -1.101752e-12 394 7.704948e-14 #> 2010_BUDGET_06_Enda_Kenny_FG 12 25 1.000000e+00 304 1.000000e+00 #> 2010_BUDGET_07_Kieran_ODonnell_FG 5 11 -2.284291e-12 193 5.337258e-13 #> 2010_BUDGET_08_Eamon_Gilmore_LAB 6 10 2.470182e-12 270 -8.729094e-13 #> 2010_BUDGET_09_Michael_Higgins_LAB 3 7 -1.068381e-13 78 -4.569123e-13 #> 2010_BUDGET_10_Ruairi_Quinn_LAB 5 19 1.203898e-13 80 3.574918e-14# fold queries into the space generated by ie_dfm[1:10,] # and return its truncated versions of its representation in the new low-rank space new_lsa <- predict(ie_lsa, ie_dfm[11:14, ]) new_lsa$docs_newspace#> 4 x 10 Matrix of class "dgeMatrix" #> [,1] [,2] [,3] #> 2010_BUDGET_11_John_Gormley_Green -0.06232233 0.02556855 0.01586808 #> 2010_BUDGET_12_Eamon_Ryan_Green -0.09764584 -0.05532927 -0.03798847 #> 2010_BUDGET_13_Ciaran_Cuffe_Green -0.07289841 -0.01397222 -0.08691196 #> 2010_BUDGET_14_Caoimhghin_OCaolain_SF -0.24271908 -0.05221856 0.14035456 #> [,4] [,5] [,6] #> 2010_BUDGET_11_John_Gormley_Green 0.002090294 0.008423089 -0.062365633 #> 2010_BUDGET_12_Eamon_Ryan_Green 0.290792321 -0.059380796 -0.222737473 #> 2010_BUDGET_13_Ciaran_Cuffe_Green 0.108245813 0.031632546 -0.002166229 #> 2010_BUDGET_14_Caoimhghin_OCaolain_SF -0.140740721 0.095472404 0.004089615 #> [,7] [,8] [,9] #> 2010_BUDGET_11_John_Gormley_Green -0.01828161 -0.06628157 0.01334491 #> 2010_BUDGET_12_Eamon_Ryan_Green -0.05317940 -0.01139819 0.28550581 #> 2010_BUDGET_13_Ciaran_Cuffe_Green -0.01630824 0.04101057 0.07250855 #> 2010_BUDGET_14_Caoimhghin_OCaolain_SF -0.01793895 0.06060947 -0.07710551 #> [,10] #> 2010_BUDGET_11_John_Gormley_Green -0.04928801 #> 2010_BUDGET_12_Eamon_Ryan_Green -0.19176318 #> 2010_BUDGET_13_Ciaran_Cuffe_Green -0.18028126 #> 2010_BUDGET_14_Caoimhghin_OCaolain_SF 0.23586845