Create LDA Topic Model for GermaParlSample

Load libraries

library(polmineR)
library(data.table)
library(pbapply)
library(topicmodels)
library(ldatuning)
library(DT)

Prepare document-term-matrix

speeches <- as.speeches("GERMAPARLMINI", s_attribute_name = "speaker")
# count_bundle <- count(speeches, p_attribute = c("word", "pos"), verbose = FALSE)
as.list.bundle <- function(x, ...) x@objects
count_bundle <- lapply(speeches, count, p_attribute = c("word", "pos")) %>%
  lapply(subset, pos %in% "NN") %>%
  as.bundle(x2)

dtm <- as.DocumentTermMatrix(count_bundle, p_attribute = "word", col = "count", verbose = FALSE)

# remove short documents
docs_to_drop_length <- which(slam::row_sums(dtm) < 100) # drop docs with less than 100 words
if (length(docs_to_drop_length) > 0L) dtm <- dtm[-docs_to_drop_length,]

# remove noisy words
noise_to_drop <- noise(colnames(dtm), specialChars = NULL, stopwordsLanguage = "de", verbose = FALSE)
noise_to_drop[["stopwords"]] <- c(
  noise_to_drop[["stopwords"]],
  paste(
    toupper(substr(noise_to_drop[["stopwords"]], 1, 1)),
    substr(noise_to_drop[["stopwords"]], 2, nchar(noise_to_drop[["stopwords"]])),
    sep = ""
  )
)
dtm <- dtm[,-which(unique(unlist(noise_to_drop)) %in% colnames(dtm))]

# remove rare words
terms_to_drop_rare <- which(slam::col_sums(dtm) <= 3L)
if (length(terms_to_drop_rare) > 0L) dtm <- dtm[,-terms_to_drop_rare]

# remove documents that are empty now
empty_docs <- which(slam::row_sums(dtm) == 0L)
if (length(empty_docs) > 0L) dtm <- dtm[-empty_docs,]

Calculate topic models

control <- list(burnin = 500, iter = 1000L, keep = 100, verbose = FALSE)
topics <- seq.int(from = 10, to = 100, by = 10)

models <- lapply(
  setNames(topics, topics),
  function(k){
    lda <- LDA(dtm, k = k, method = "Gibbs", control = control)
    saveRDS(object = lda, file = sprintf("~/Lab/tmp/germaparlsample/germaparlsample_lda_%d.rds", k))
    lda
  }
)

Topic model optimization

result <- data.frame(topics)
result[["Griffiths2004"]] <- Griffiths2004(models, as.vector(control))
result[["CaoJuan2009"]] <- CaoJuan2009(models)
result[["Arun2010"]] <- Arun2010(models, dtm)
result[["Deveaud2014"]] <- Deveaud2014(models)

FindTopicsNumber_plot(result)

Inspect top words for k = 30

options("DT.fillContainer" = TRUE) 
DT::datatable(terms(models[["30"]], 10), options = list(dom ="t", scrollY = "350px"))

Create LDA Topic Model for GermaParlSample

2020-05-12

Load libraries

Prepare document-term-matrix

Calculate topic models

Topic model optimization

Inspect top words for k = 30