# Install caret on first use and attach it BEFORE confusionMatrix() is called.
# library() is used instead of require() so that a missing package stops the
# script here instead of surfacing later as "could not find function".
list.of.packages <- c("caret")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[, "Package"])]
if (length(new.packages) > 0) install.packages(new.packages)
library(caret)

# Cross-tabulate the labels stored in the test dfm against the labels
# predicted by the fitted Naive Bayes model.
actual_class <- dfmat_matched$sentiment
predicted_class <- predict(tmod_nb, newdata = dfmat_matched) # this is where the prediction happens
tab_class <- table(actual_class, predicted_class)
tab_class
confusionMatrix(tab_class, mode = "everything", positive = "pos")

# Inspect the document-feature matrix; View() needs an interactive session.
if (interactive()) View(dfmt_movie)
print(dfmt_movie)
# Packages used throughout the session.
library(quanteda)
library(quanteda.textstats)
library(quanteda.textplots)
library(quanteda.textmodels)
library(tidyverse)
library(caret) # library() rather than require(): fail loudly if caret is missing

# Movie review corpus; show the summary of the first five documents.
corp_movies <- data_corpus_moviereviews
summary(corp_movies, 5)

# German manifesto corpus.
# Neither base R nor the packages attached above provide a download()
# function, and load() only reads save()/RData files, so the previous calls
# could not produce corp_ger. An .rds file is read with readRDS().
# NOTE(review): assumes the file is a single serialized object, as the .rds
# extension suggests - confirm.
path_ger <- "~/Desktop/data_corpus_germanifestos.rds"
if (!file.exists(path_ger)) {
  # No local copy yet: fetch it into the session's temporary directory.
  path_ger <- file.path(tempdir(), "data_corpus_germanifestos.rds")
  download.file(
    url = "https://www.dropbox.com/s/uysdoep4unfz3zp/data_corpus_germanifestos.rds?dl=1",
    destfile = path_ger,
    mode = "wb" # binary mode keeps the file intact on Windows
  )
}
corp_ger <- readRDS(path_ger)
library(quanteda)
library(quanteda.textstats)
library(quanteda.textplots)
library(quanteda.textmodels)
library(tidyverse)
library(caret) # library() rather than require(): fail loudly if caret is missing

corp_movies <- data_corpus_moviereviews
summary(corp_movies, 5)
corp_movies[["cv000_29416.txt"]]

# Draw 1500 document positions without replacement for the training set.
# The pool is derived from the corpus size instead of a hard-coded 1:2000.
set.seed(300)
id_train <- sample(seq_len(ndoc(corp_movies)), 1500, replace = FALSE)
head(id_train, 10)

# create docvar with ID
corp_movies$id_numeric <- seq_len(ndoc(corp_movies))

# tokenize texts: drop punctuation and numbers, remove English stopwords, stem
# (`remove_numbers` is spelled out; `remove_number` only worked through
# partial argument matching)
toks_movies <- tokens(corp_movies, remove_punct = TRUE, remove_numbers = TRUE) %>%
  tokens_remove(pattern = stopwords("en")) %>%
  tokens_wordstem()
dfmt_movie <- dfm(toks_movies)
print(dfmt_movie)

# get training set
dfmat_training <- dfm_subset(dfmt_movie, id_numeric %in% id_train)
# get test set (documents not in id_train)
dfmat_test <- dfm_subset(dfmt_movie, !id_numeric %in% id_train)

# Fit Naive Bayes on the training dfm with the sentiment docvar as label.
tmod_nb <- textmodel_nb(dfmat_training, dfmat_training$sentiment)
summary(tmod_nb)

# Restrict the test dfm to the features of the training dfm before predicting.
dfmat_matched <- dfm_match(dfmat_test, features = featnames(dfmat_training))
actual_class <- dfmat_matched$sentiment
predicted_class <- predict(tmod_nb, newdata = dfmat_matched) # this is where the prediction happens
tab_class <- table(actual_class, predicted_class)
tab_class
confusionMatrix(tab_class, mode = "everything", positive = "pos")

# Hand checks of the confusionMatrix() output, typed from the printed table.
# NOTE(review): the numbers are copied by hand; presumably 213 and 205 are the
# correctly classified documents per class and 45/37 the errors - confirm
# against tab_class.
418/(418+45+37) # share of correct predictions among all test documents
213/250
205/250
205/(205+37)
corp_movies <- data_corpus_moviereviews
summary(corp_movies, 5)
corp_movies[["cv000_29416.txt"]]

# generate 1500 numbers without replacement
# Resetting the seed right before sample() makes the split reproducible;
# calling sample() again without set.seed() gives a different id_train each
# time, so only the seeded draw is kept.
set.seed(300)
id_train <- sample(seq_len(ndoc(corp_movies)), 1500, replace = FALSE)
head(id_train, 10)

# create docvar with ID
corp_movies$id_numeric <- seq_len(ndoc(corp_movies))

# tokenize texts (`remove_numbers` spelled out instead of relying on partial
# argument matching of `remove_number`)
toks_movies <- tokens(corp_movies, remove_punct = TRUE, remove_numbers = TRUE) %>%
  tokens_remove(pattern = stopwords("en")) %>%
  tokens_wordstem()
dfmt_movie <- dfm(toks_movies)
print(dfmt_movie)

# get training set
dfmat_training <- dfm_subset(dfmt_movie, id_numeric %in% id_train)
# get test set (documents not in id_train)
dfmat_test <- dfm_subset(dfmt_movie, !id_numeric %in% id_train)

# Train on the training documents, labelled by their sentiment docvar.
tmod_nb <- textmodel_nb(dfmat_training, dfmat_training$sentiment)
summary(tmod_nb)

# Align test features with the training features, then predict and tabulate.
dfmat_matched <- dfm_match(dfmat_test, features = featnames(dfmat_training))
actual_class <- dfmat_matched$sentiment
predicted_class <- predict(tmod_nb, newdata = dfmat_matched) # this is where the prediction happens
tab_class <- table(actual_class, predicted_class)
tab_class
confusionMatrix(tab_class, mode = "everything", positive = "pos")

# Hand check typed from the printed table: correct predictions / test documents.
(213+205)/500
library(rvest)

url <- "https://en.wikipedia.org/wiki/COVID-19_pandemic_by_country_and_territory"

# Download and parse the article once and reuse the parsed document, instead
# of requesting the same page for every selector that is tried.
page_covid <- read_html(url)

# Select the table with a CSS selector anchored on the section id.
# An earlier attempt used the positional XPath
#   /html/body/div[3]/div[3]/div[5]/div[1]/div[15]/div[5]/table
# (div[16] was also noted); such paths depend on the exact page layout.
table_covid <- page_covid %>%
  html_node("#covid-19-cases-deaths-and-rates-by-location > div:nth-child(6) > table:nth-child(2)") %>%
  html_table()

# View() opens the data viewer and needs an interactive session.
if (interactive()) View(table_covid)
# Overview page that links to the individual New Year's speeches.
url <- "https://www.admin.ch/gov/de/start/dokumentation/reden/neujahrsansprachen.html"
announce_page <- read_html(url)
announce_page

# XPath to the first speech link:
#   /html/body/div[2]/div[3]/div/div[2]/div/div[1]/div[4]/article/ul/li[1]/a
# Using li[*] instead of li[1] selects the link of every list item; the href
# attribute of each link is kept.
announce_page_href <- html_attr(
  html_nodes(
    announce_page,
    xpath = '/html/body/div[2]/div[3]/div/div[2]/div/div[1]/div[4]/article/ul/li[*]/a'
  ),
  'href'
)
announce_page_href

# Start with the first listed speech only: build its URL from the first href.
url_one <- paste0("https://www.admin.ch", announce_page_href[1])
url_one

# The speech text is taken from this container:
#   /html/body/div[2]/div[3]/div/div[2]/div/div[1]/div[4]
one_speech <- html_text(
  html_nodes(
    read_html(url_one),
    xpath = '/html/body/div[2]/div[3]/div/div[2]/div/div[1]/div[4]'
  )
)
one_speech
library(tidyverse)

# Download every speech in `hrefs` and return one row per extracted text node
# with the columns `url` (the relative link) and `text`.
#   hrefs: character vector of links relative to https://www.admin.ch
#   xpath: XPath of the node that holds the speech text on those pages
# Rows are collected in a preallocated list and bound once, rather than
# growing a data frame inside the loop.
scrape_speech_texts <- function(hrefs, xpath) {
  pages <- vector("list", length(hrefs))
  for (i in seq_along(hrefs)) {
    url_one <- paste0("https://www.admin.ch", hrefs[i])
    message(url_one)
    text <- read_html(url_one) %>%
      html_nodes(xpath = xpath) %>%
      html_text()
    if (length(text) == 0) {
      # data.frame() would otherwise fail with an unhelpful row-count error
      stop("No text found at ", url_one, " for XPath ", xpath, call. = FALSE)
    }
    pages[[i]] <- data.frame(url = hrefs[i], text = text)
  }
  bind_rows(data.frame(), pages)
}

# The first seven listed speeches are read from div[4], all remaining ones
# from div[3]. The earlier loop that read every page from div[4] is dropped.
# The upper bound follows the number of links found instead of a fixed 46.
all_texts1 <- scrape_speech_texts(
  head(announce_page_href, 7),
  xpath = '/html/body/div[2]/div[3]/div/div[2]/div/div[1]/div[4]'
)
all_texts2 <- scrape_speech_texts(
  tail(announce_page_href, -7),
  xpath = '/html/body/div[2]/div[3]/div/div[2]/div/div[1]/div[3]'
)

all_texts <- rbind(all_texts1, all_texts2)
if (interactive()) View(all_texts)
# Link texts from the overview page; the year and speaker are parsed from
# them further down.
speaker <-
  html_nodes(announce_page, xpath = '/html/body/div[2]/div[3]/div/div[2]/div/div[1]/div[4]/article/ul/li[*]/a') %>%
  html_text()
speaker

# One link text per scraped row is required; stop instead of recycling or
# failing with a less specific message.
stopifnot(length(speaker) == nrow(all_texts))
all_texts$speaker <- speaker
all_texts$text[2]

# First, we'll delete "boilerplate text"
# fixed = TRUE: the sentence is literal text, so its final "." must not act
# as a regex wildcard.
all_texts$text <- gsub("Das Abspielen von Audiodaten ist mit dem aktuellen Browser nicht möglich.", "", all_texts$text, fixed = TRUE)
all_texts$text <- gsub("Es gilt das gesprochene Wort", "", all_texts$text, ignore.case = TRUE)
all_texts$text[2]
all_texts$text[10]

# Remove the first date of the form "<day>. Januar <year>"; the dot is escaped
# so it only matches a literal ".".
all_texts$text <- str_replace(all_texts$text, pattern = "\\d+\\. Januar \\d+", replacement = "")
all_texts$text[10]

# NOTE(review): the leading "." here is a regex wildcard (any character
# followed by three spaces and "-"); left unchanged because the intended
# literal is not clear from the code - confirm.
all_texts$text <- str_replace(all_texts$text, pattern = ".   -", replacement = "")
# Newlines become spaces, then surrounding whitespace is trimmed.
all_texts$text <- str_replace_all(all_texts$text, pattern = "\n", replacement = " ")
all_texts$text <- trimws(all_texts$text, which = "both", whitespace = "[ \t\r\n]")
# Drop everything from the first "Download" to the end of the text.
all_texts$text <- str_replace_all(all_texts$text, "Download.*", "")
all_texts$text[10] # Already looks much better

# Let's clean the speaker column too
# And create a column indicating the year a speech was given
# (must run before the digits are stripped from `speaker` below)
all_texts$year <- str_sub(all_texts$speaker, 1, 4) # extracts first 4 characters from the speaker column; here that's the year
nchar(all_texts$speaker[1]) # number of characters; length() of a single element is always 1

# Strip digits, both dash variants and the fixed title prefix; [a-zA-Z]* also
# consumes letters directly attached to "Bundespräsident".
all_texts$speaker <- str_replace_all(all_texts$speaker, pattern = "\\d+", replacement = "")
all_texts$speaker <- str_replace_all(all_texts$speaker, pattern = "- ", replacement = "")
all_texts$speaker <- str_replace_all(all_texts$speaker, pattern = "– ", replacement = "")
all_texts$speaker <- str_replace_all(all_texts$speaker, pattern = "Neujahrsansprache von Bundespräsident[a-zA-Z]*", replacement = "")
all_texts$speaker <- trimws(all_texts$speaker, which = "both", whitespace = "[ \t\r\n]")

# For the first two entries, drop the first whitespace-delimited token.
# Original note: for whitespace that doesn't act like whitespace.
all_texts$speaker[1:2] <- sub("^\\S+\\s+", "", all_texts$speaker[1:2])

if (interactive()) View(all_texts)
# Each example string is defined before the calls that use it; the earlier
# calls ran before the object existed.

sentence <- c("I like apples more than oranges.")
# Verify that the character vector called "sentence" does indeed contain the word "apples"
str_detect(sentence, "apple")
# NOTE: `*` only quantifies the final "e", so this matches "appl" followed by
# zero or more "e".
str_detect(sentence, "apple*")
# Extract the word "oranges" from the sentence
str_extract_all(sentence, "oranges")

# From sentence_upper extract all uppercase letters (I A O)
sentence_upper <- c("I like Apples more than Oranges.")
str_extract_all(sentence_upper, "[:upper:]")
str_extract_all(sentence_upper, "[A-Z]")
# For comparison: lowercase letters only, then all letters
str_extract_all(sentence_upper, "[a-z]")
str_extract_all(sentence_upper, "[a-zA-Z]")

# Replace the first occurrence of "apples"
str_replace(sentence, "apples", "cherries")

# From sentence_nr, only extract the first number (2)
sentence_nr <- c("Today I ate 2 apples and 3 tangerines.")
str_extract(sentence_nr, "[:digit:]")
str_extract(sentence_nr, "[1-9]") # NOTE: this class does not match "0"
# Inside [...] the "+" is a literal plus sign, not a quantifier:
str_extract(sentence_nr, "[^A-Za-z\\s+]") # first char that is no letter, whitespace or "+"
str_extract(sentence_nr, "[^A-Za-z]")     # first non-letter
str_extract(sentence_nr, "[A-Za-z\\s+]")  # first letter, whitespace or "+"
# All digits instead of only the first one
str_extract_all(sentence_nr, "[:digit:]")
####
library(quanteda.textstats)
library(stm)
library(quanteda)

# Inaugural addresses with Year between 1940 and 1965 (inclusive).
corp <- data_corpus_inaugural
corp_recent <- corpus_subset(corp, Year >= 1940 & Year <= 1965)
summary(corp_recent)

# Variant 1: punctuation removed while tokenizing, stopwords removed from the dfm.
toks_recent <- tokens(corp_recent, remove_punct = TRUE)
dfmat <- dfm(toks_recent) %>%
  dfm_remove(pattern = stopwords("en"))
print(dfmat)

# Jaccard: (elements in common)/(elements in common + elements not in common)
tstat1 <- textstat_simil(dfmat, method = "jaccard", margin = "documents")
tstat1
# Cosine of the angle between the document vectors
tstat2 <- textstat_simil(dfmat, method = "cosine", margin = "documents")
tstat2
dat2 <- as.matrix(tstat2)
corrplot::corrplot(dat2, method = "circle")

# Variant 2: default tokenization (punctuation kept).
# dfm() is given a tokens object; calling it directly on a corpus is
# deprecated in quanteda 3 and no longer supported in later releases.
dfmat <- dfm(tokens(corp_recent)) %>%
  dfm_remove(pattern = stopwords("en"))
print(dfmat)
tstat1 <- textstat_simil(dfmat, method = "jaccard", margin = "documents")
tstat1
tstat2 <- textstat_simil(dfmat, method = "cosine", margin = "documents")
tstat2
# Chunk default: echo the code in the knitted document.
knitr::opts_chunk$set(echo = TRUE)

library(quanteda)
library(quanteda.textmodels)
library(quanteda.textplots)
library(quanteda.textstats)
library(tidyverse)
library(ggplot2)
library(stm)

# Subset first, then print: the object was previously printed before it was
# created.
corp <- data_corpus_inaugural
corp_speaches <- corpus_subset(corp, Year >= 1940 & Year <= 1965)
print(corp_speaches)

# Tokens without punctuation -> dfm without English stopwords.
toks_recent <- tokens(corp_speaches, remove_punct = TRUE)
dfmat <- dfm(toks_recent) %>%
  dfm_remove(pattern = stopwords("en"))
print(dfmat)

# Cosine between document vectors.
tstat <- textstat_simil(dfmat, method = "cosine", margin = "documents")
tstat

# Two representations of the same similarities: a data frame for ggplot2 and
# a matrix for corrplot.
dat <- as.data.frame(tstat)
dat2 <- as.matrix(tstat)

# 1st option: heat map of the document pairs
ggplot(dat, aes(x = document1, y = document2, fill = cosine)) +
  geom_tile()
# 2nd option
corrplot::corrplot(dat2, method = "circle")
# 1. In your own words, describe how the Jaccard similarity works. (1p)
# Jaccard similarity tells us how similar two documents are by counting the words they have in common and comparing that count to the total number of distinct words that appear in either document.
# 2. Subset the corpus containing the inaugural speeches to contain only speeches given between 1940 and 1965. Calculate the cosine similarity between the speeches, and briefly explain your results. Which speeches are most, which speeches are least similar according to this measure? (2p)
# Attach corrplot before it is used for the visualization below.
library(corrplot)

corp <- data_corpus_inaugural
# The task asks for speeches between 1940 and 1965, so both bounds are
# inclusive; `Year < 1965` dropped any speech dated 1965.
corp_recent <- corpus_subset(corp, Year >= 1940 & Year <= 1965)

# Tokens without punctuation -> dfm without English stopwords.
toks_recent <- tokens(corp_recent, remove_punct = TRUE)
dfmat <- dfm(toks_recent) %>%
  dfm_remove(pattern = stopwords("en"))

# Cosine between document vectors.
tstat2 <- textstat_simil(dfmat, method = "cosine", margin = "documents")
tstat2

# 3. Visualize these correlations. You may use either ggplot2 or corrplot. (1p)
corrplot(as.matrix(tstat2), method = "circle")
library(quanteda)
library(quanteda.textmodels)
library(quanteda.textplots)
library(quanteda.textstats)
library(tidyverse)

## 2. Working with a corpus #####
corp <- data_corpus_inaugural # this is already a corpus, so no need to transform it
head(docvars(corp))
### EXERCISE: how to get the last n entries?
tail(docvars(corp))
summary(data_corpus_inaugural)
# Two equivalent ways to read one document variable
docvars(corp, field = "Year")
corp$Year

## 3. Tokenizing ####
# TRUE is spelled out; T is an ordinary variable that can be reassigned.
toks_inaug <- tokens(corp, remove_punct = TRUE)
# Various elements can be removed when tokenizing: punctuation, symbols, numbers, URLs...
print(toks_inaug)
# NOTE(review): corp_neujahr is not created anywhere in the visible part of
# this file - confirm it exists before this line runs.
toks_neujahr <- tokens(corp_neujahr, remove_punct = TRUE)

## 4. Document-feature matrix ####
dfmat_inaug <- dfm(toks_inaug)
print(dfmat_inaug)
library(ggplot2)

# The working directory is set BEFORE the relative path is read; the first
# read.csv() previously ran before setwd().
# NOTE(review): a user-specific absolute path; kept because later relative
# paths may rely on it.
setwd("/Users/antheaalberto/switchdrive/RISE/Veranstaltungen/2021_2022_Crash_Courses/RStudio-DataViz")
df <- read.csv("view_election.csv")
names(df)
table(df$country_name)

# which() drops rows where the condition is NA instead of returning all-NA rows.
df <- df[which(df$country_name == "Switzerland"), ]
range(df$election_date)
# NOTE(review): this is a text comparison; it works as a date filter only if
# election_date is stored as yyyy-mm-dd strings - confirm.
x <- df[which(df$election_date >= "2019-01-01"), ] # only the most recent election

# Parties with more than one seat, plus the labels and colours shared by the
# plots below.
parties_2019 <- x[which(x$seats > 1), ]
labs_2019 <- labs(x = "",
                  y = "Sitze im Parlament",
                  title = "Nationalratswahl 2019")
# Manual fill colours, one per plotted party (eight values are supplied).
party_colours <- c("yellow", "khaki1", "royalblue", "darkseagreen",
                   "chartreuse", "orange", "red", "green4")

# 1. Default bar chart
ggplot(parties_2019, aes(x = party_name_short, y = seats, fill = party_name_short)) +
  geom_col() +
  labs_2019

# 2. Minimal theme, legend hidden
ggplot(parties_2019, aes(x = party_name_short, y = seats, fill = party_name_short)) +
  geom_col(show.legend = FALSE) +
  theme_minimal() +
  labs_2019

# 3. Manual fill colours
ggplot(parties_2019, aes(x = party_name_short, y = seats, fill = party_name_short)) +
  geom_col(show.legend = FALSE) +
  theme_minimal() +
  scale_fill_manual(values = party_colours) +
  labs_2019

# 4. Bars ordered by descending seat count
ggplot(parties_2019, aes(x = reorder(party_name_short, -seats), y = seats, fill = party_name_short)) +
  geom_col(show.legend = FALSE) +
  theme_minimal() +
  scale_fill_manual(values = party_colours) +
  labs_2019
library(rvest)
library(tidyverse)

# Overview page that links to the individual speeches.
url <- "https://www.admin.ch/gov/de/start/dokumentation/reden/neujahrsansprachen.html"
page <- read_html(url)
page

# /html/body/div[2]/div[3]/div/div[2]/div/div[1]/div[4]/article/ul/li[1]/a
# Above is the XPath to the first speech; li[*] selects every list item.
page_href <- html_nodes(page,
                        xpath = '/html/body/div[2]/div[3]/div/div[2]/div/div[1]/div[4]/article/ul/li[*]/a') %>%
  html_attr('href')
page_href

# Try a single speech first.
urlone <- paste0("https://www.admin.ch", page_href[1])
urlone
one_speech <- read_html(urlone) %>%
  html_nodes(xpath = '/html/body/div[2]/div[3]/div/div[2]/div/div[1]/div[4]') %>%
  html_text()
one_speech
length(page_href)

# Download every speech in `hrefs` and return one row per extracted text node
# with the columns `url` (the relative link) and `text`.
#   hrefs: character vector of links relative to https://www.admin.ch
#   xpath: XPath of the node that holds the speech text on those pages
scrape_speech_texts <- function(hrefs, xpath) {
  pages <- vector("list", length(hrefs))
  for (i in seq_along(hrefs)) {
    url_one <- paste0("https://www.admin.ch", hrefs[i])
    message(url_one)
    text <- read_html(url_one) %>%
      html_nodes(xpath = xpath) %>%
      html_text()
    if (length(text) == 0) {
      # data.frame() would otherwise fail with an unhelpful row-count error
      stop("No text found at ", url_one, " for XPath ", xpath, call. = FALSE)
    }
    pages[[i]] <- data.frame(url = hrefs[i], text = text)
  }
  bind_rows(data.frame(), pages)
}

# The first eight listed speeches are read from div[4], the remaining ones
# from div[3]. The bounds follow length(page_href) instead of a fixed 47.
# The earlier loop stored its result in `all_texts2` instead of `df2`, which
# left df2 empty; both parts are now built the same way.
df1 <- scrape_speech_texts(
  head(page_href, 8),
  xpath = '/html/body/div[2]/div[3]/div/div[2]/div/div[1]/div[4]'
)
df2 <- scrape_speech_texts(
  tail(page_href, -8),
  xpath = '/html/body/div[2]/div[3]/div/div[2]/div/div[1]/div[3]'
)

df <- rbind(df1, df2)
if (interactive()) View(df)
# Compound growth of an investment, printed once per year.

# Bad: the names say nothing about what the values mean.
xxx <- 10000
yyy <- 0.1
zzz <- 1:10

# NOTE: unlike Python's range(), R's range() returns c(min, max), so
# `for (iii in range(zzz))` runs only for the two end points. Loop over the
# vector itself.
range(zzz)
for (iii in zzz) {
  # `^` is R's power operator (`**` is accepted as a synonym)
  print(xxx * (1 + yyy)^iii)
}

# Better: descriptive names, all defined before the loop that uses them.
investements <- 10000
yearly_return <- 0.1
years <- 1:10
for (year in years) {
  print(investements * (1 + yearly_return)^year)
}
# Show the current working directory, then switch to the workshop folder.
# NOTE(review): the path is absolute and user-specific, and the setwd() call
# is repeated; the second call sets the same directory again.
getwd()
setwd("/Users/antheaalberto/switchdrive/Research and Infrastructure Support/Veranstaltungen/20230217_WWZ")
setwd("/Users/antheaalberto/switchdrive/Research and Infrastructure Support/Veranstaltungen/20230217_WWZ")
# Naming exercise: the same compound-growth loop written twice.

# Version to avoid - placeholder names hide the meaning of every value.
xxx <- 1e4
yyy <- 0.1
zzz <- seq_len(10)
for (iii in zzz) print((1 + yyy)^iii * xxx)

# Preferred version - each name states what the value represents.
investements <- 1e4
yearly_return <- 0.1
years <- seq_len(10)
for (year in years) print((1 + yearly_return)^year * investements)
