Load libraries
library(polmineR)
library(data.table)
library(pbapply)
library(topicmodels)
library(ldatuning)
library(DT)
Prepare document-term-matrix
speeches <- as.speeches("GERMAPARLMINI", s_attribute_name = "speaker")
# count_bundle <- count(speeches, p_attribute = c("word", "pos"), verbose = FALSE)
as.list.bundle <- function(x, ...) x@objects
count_bundle <- lapply(speeches, count, p_attribute = c("word", "pos")) %>%
lapply(subset, pos %in% "NN") %>%
as.bundle(x2)
dtm <- as.DocumentTermMatrix(count_bundle, p_attribute = "word", col = "count", verbose = FALSE)
# remove short documents
docs_to_drop_length <- which(slam::row_sums(dtm) < 100) # drop docs with less than 100 words
if (length(docs_to_drop_length) > 0L) dtm <- dtm[-docs_to_drop_length,]
# remove noisy words
noise_to_drop <- noise(colnames(dtm), specialChars = NULL, stopwordsLanguage = "de", verbose = FALSE)
noise_to_drop[["stopwords"]] <- c(
noise_to_drop[["stopwords"]],
paste(
toupper(substr(noise_to_drop[["stopwords"]], 1, 1)),
substr(noise_to_drop[["stopwords"]], 2, nchar(noise_to_drop[["stopwords"]])),
sep = ""
)
)
dtm <- dtm[,-which(unique(unlist(noise_to_drop)) %in% colnames(dtm))]
# remove rare words
terms_to_drop_rare <- which(slam::col_sums(dtm) <= 3L)
if (length(terms_to_drop_rare) > 0L) dtm <- dtm[,-terms_to_drop_rare]
# remove documents that are empty now
empty_docs <- which(slam::row_sums(dtm) == 0L)
if (length(empty_docs) > 0L) dtm <- dtm[-empty_docs,]
Calculate topic models
control <- list(burnin = 500, iter = 1000L, keep = 100, verbose = FALSE)
topics <- seq.int(from = 10, to = 100, by = 10)
models <- lapply(
setNames(topics, topics),
function(k){
lda <- LDA(dtm, k = k, method = "Gibbs", control = control)
saveRDS(object = lda, file = sprintf("~/Lab/tmp/germaparlsample/germaparlsample_lda_%d.rds", k))
lda
}
)
Topic model optimization
result <- data.frame(topics)
result[["Griffiths2004"]] <- Griffiths2004(models, as.vector(control))
result[["CaoJuan2009"]] <- CaoJuan2009(models)
result[["Arun2010"]] <- Arun2010(models, dtm)
result[["Deveaud2014"]] <- Deveaud2014(models)
FindTopicsNumber_plot(result)
Inspect top words for k = 30
options("DT.fillContainer" = TRUE)
DT::datatable(terms(models[["30"]], 10), options = list(dom ="t", scrollY = "350px"))