Load libraries:
op <- options(warn = (-1)) # suppress warnings
library(readtext)
library(stringr)
options(op) # reset the default value
Directory and file names:
# Project root, the folder that holds the word list, and the word-list file.
path <- file.path(
  "/Users", "bmcgillivray", "Documents", "OneDrive",
  "The Alan Turing Institute", "Martina Astrid Rodda - MAR dphil project"
)
path_wl <- file.path(path, "TAL paper", "ancient lexicography")
word_list_file_name <- "word lists for philomen.txt"
Set parameters for initial exploration:
# Parameter values selecting the first evaluation file to be read.
window <- 1
freq_threshold <- 1
lexicon <- "SCHMIDT"
Read word list:
# Read the whole word-list file as UTF-8 and keep only its text content.
word_list_file <- readtext(
  file = paste(path_wl, word_list_file_name, sep = "/"),
  encoding = "UTF-8"
)
word_list_text <- word_list_file[["text"]]
Process word list:
# Parse the raw word-list text into a data frame with one row per lemma and
# logical `frequent` / `polysemous` flags.

# Drop the leading text through "asterisk" followed by a newline (lazy match
# from the start of the text). The pattern and regex engine are unchanged.
word_list_text <- gsub("^.+?asterisk\n", "", word_list_text)
word_list <- strsplit(word_list_text, "\n")[[1]]
word_list <- word_list[word_list != ""] # remove empty rows
word_list <- gsub("*", "", word_list, fixed = TRUE) # remove *
word_list <- gsub("\t$", "", word_list, perl = TRUE) # remove trailing tab
if (length(word_list) == 0) {
  stop("Word list is empty after removing the header and blank lines",
       call. = FALSE)
}

# Each remaining line is expected to hold three tab-separated fields.
word_list <- str_split(word_list, "\t", 3) # split by tab
if (any(lengths(word_list) != 3)) {
  # Fail loudly: matrix() below would otherwise shift values across rows.
  stop("Word list line(s) without exactly 3 tab-separated fields: ",
       paste(which(lengths(word_list) != 3), collapse = ", "),
       call. = FALSE)
}
word_list <- data.frame(matrix(unlist(word_list), ncol = 3, byrow = TRUE))
colnames(word_list) <- c("ID", "lemma", "cat")

# The category field has the form "<frequency class>/<polysemy class>".
word_list2 <- str_split(word_list$cat, "/", 2) # split by /
if (any(lengths(word_list2) != 2)) {
  stop("Word list category without a '/' separator in row(s): ",
       paste(which(lengths(word_list2) != 2), collapse = ", "),
       call. = FALSE)
}
word_list2 <- data.frame(matrix(unlist(word_list2), ncol = 2, byrow = TRUE))
colnames(word_list2) <- c("frequency_class", "polysemy_class")
# "F" marks the frequent class and "P" the polysemous class; any other value
# yields FALSE.
word_list2$frequent <- word_list2$frequency_class == "F"
word_list2$polysemous <- word_list2$polysemy_class == "P"

word_list <- data.frame(
  lemma = word_list$lemma,
  word_list2[, c("frequent", "polysemous")]
)
word_list
## lemma frequent polysemous
## 1 ὄμμα TRUE TRUE
## 2 ἔργον TRUE TRUE
## 3 νόμος TRUE TRUE
## 4 γυνή TRUE TRUE
## 5 τραῦμα TRUE TRUE
## 6 μνήμη TRUE TRUE
## 7 θεραπεία TRUE TRUE
## 8 ἀνήρ TRUE TRUE
## 9 ἄνεμος TRUE FALSE
## 10 ἔλεος TRUE FALSE
## 11 ἐπιθυμία TRUE FALSE
## 12 ἐλπίς TRUE FALSE
## 13 ἡσυχία TRUE FALSE
## 14 εὐχή TRUE FALSE
## 15 ἔχθρα TRUE FALSE
## 16 κίνησις TRUE FALSE
## 17 ἴς FALSE TRUE
## 18 τάγμα FALSE TRUE
## 19 κάρα FALSE TRUE
## 20 ὀξύτης FALSE TRUE
## 21 ῥόος FALSE TRUE
## 22 ἄδεια FALSE TRUE
## 23 ῥάβδος FALSE TRUE
## 24 κλῆσις FALSE TRUE
## 25 ὀδύνη FALSE FALSE
## 26 ἠχή FALSE FALSE
## 27 βραδυτής FALSE FALSE
## 28 ὕπνος FALSE FALSE
## 29 ψυχρότης FALSE FALSE
## 30 οἰκειότης FALSE FALSE
## 31 μῖσος FALSE FALSE
## 32 κρήνη FALSE FALSE
Function for creating name of evaluation file from parameters:
# Build the directory and file name of an evaluation summary file.
#
# Args:
#   path_var: root directory under which "Evaluation/output" lives.
#   window_var, freq_threshold_var: values inserted after "w" and "_t" in the
#     semantic-space folder and file names.
#   lexicon_var: lexicon name inserted after "Lexicon_".
#
# Returns:
#   A list with elements `path_ev` (directory) and `evaluation_file_name`.
#   The inputs and both results are also printed as progress messages.
path_ev_fun <- function(path_var, window_var, freq_threshold_var, lexicon_var) {
  print(paste(
    "Creating name of evaluation file for",
    path_var, window_var, freq_threshold_var, lexicon_var
  ))

  space_dir <- paste0("semantic-space-w", window_var, "_t", freq_threshold_var)
  lexicon_dir <- paste0("Lexicon_", lexicon_var)
  path_ev <- paste(path_var, "Evaluation", "output", space_dir, lexicon_dir,
                   sep = "/")
  evaluation_file_name <- paste0(
    "summary_overlap_Lexicon_", lexicon_var,
    "_semantic-space_w", window_var, "_t", freq_threshold_var,
    "_neighbours.txt"
  )

  print(paste("Path_ev:", path_ev))
  print(paste("Evaluation_file_name:", evaluation_file_name))

  list(path_ev = path_ev, evaluation_file_name = evaluation_file_name)
}
Function for reading evaluation file:
# Read the evaluation summary file for one parameter combination.
#
# Args:
#   path_var, window_var, freq_threshold_var, lexicon_var: forwarded to
#     path_ev_fun(), which determines the directory and file name.
#
# Returns:
#   The `text` column of the object returned by readtext() for that file.
read_ev_file_fun <- function(path_var, window_var, freq_threshold_var, lexicon_var) {
  print(paste("Reading evaluation file for ", path_var, window_var, freq_threshold_var, lexicon_var, sep = " "))
  ev_list <- path_ev_fun(path_var, window_var, freq_threshold_var, lexicon_var)
  evaluation_path <- paste(ev_list$path_ev, ev_list$evaluation_file_name, sep = "/")
  evaluation_file <- readtext(file = evaluation_path, encoding = "UTF-8")
  evaluation_text <- evaluation_file$text
  # `sep` now belongs to paste(); it used to be passed to print() by mistake.
  # NOTE(review): this reports the number of elements, not the number of
  # characters, so it prints 1 for a single file; message kept unchanged.
  print(paste("Length of evaluation text:", length(evaluation_text[[1]]), sep = " "))
  evaluation_text
}
Function for processing evaluation text:
# Read one evaluation summary file and turn it into a data frame.
#
# Args:
#   path_var, window_var, freq_threshold_var, lexicon_var: forwarded to
#     read_ev_file_fun() to locate and read the file.
#
# Returns:
#   A data frame with one row per data line of the file. Column names come
#   from the file's first (header) line; `precision` and `recall` are
#   converted to numeric.
#
# Stops with an error if the file has no header line or if any line does not
# split into exactly 6 tab-separated fields.
process_evaluation_text_fun <- function(path_var, window_var, freq_threshold_var, lexicon_var) {
  print(paste("Processing evaluation text for ", path_var, window_var, freq_threshold_var, lexicon_var, sep = " "))
  evaluation_text <- read_ev_file_fun(path_var, window_var, freq_threshold_var, lexicon_var)

  # Cut off everything from "Mean of precision" onwards (pattern unchanged).
  evaluation_text <- gsub("Mean of precision.+$", "", evaluation_text)
  evaluation_lines <- strsplit(evaluation_text, "\n")[[1]]
  # Blank lines would become one-field records and shift the matrix below.
  evaluation_lines <- evaluation_lines[evaluation_lines != ""]
  if (length(evaluation_lines) == 0) {
    stop("Evaluation file contains no header line", call. = FALSE)
  }

  fields <- str_split(evaluation_lines, "\t", 6) # split by tab
  if (any(lengths(fields) != 6)) {
    stop("Evaluation line(s) without exactly 6 tab-separated fields: ",
         paste(which(lengths(fields) != 6), collapse = ", "),
         call. = FALSE)
  }

  cnames <- fields[[1]]
  evaluation <- data.frame(matrix(unlist(fields), ncol = 6, byrow = TRUE))
  colnames(evaluation) <- cnames
  # Drop the header row. A negative index also works for a header-only file,
  # where 2:nrow(evaluation) would count backwards (2:1).
  evaluation <- evaluation[-1, , drop = FALSE]
  evaluation$precision <- as.numeric(as.character(evaluation$precision))
  evaluation$recall <- as.numeric(as.character(evaluation$recall))
  # dim() has two elements, so this prints two strings (rows, then columns).
  print(paste("Dimension of evaluation data frame:", dim(evaluation), sep = " "))
  evaluation
}
Create evaluation dataset:
# Evaluation data for the initial parameter combination set above.
evaluation <- process_evaluation_text_fun(
  path_var = path,
  window_var = window,
  freq_threshold_var = freq_threshold,
  lexicon_var = lexicon
)
## [1] "Processing evaluation text for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 1 1 SCHMIDT"
## [1] "Reading evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 1 1 SCHMIDT"
## [1] "Creating name of evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 1 1 SCHMIDT"
## [1] "Path_ev: /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project/Evaluation/output/semantic-space-w1_t1/Lexicon_SCHMIDT"
## [1] "Evaluation_file_name: summary_overlap_Lexicon_SCHMIDT_semantic-space_w1_t1_neighbours.txt"
## [1] "Length of evaluation text: 1"
## [1] "Dimension of evaluation data frame: 1029"
## [2] "Dimension of evaluation data frame: 6"
Combine word list with evaluation dataset:
# Join the word-list flags onto the evaluation metrics by lemma.
# NOTE(review): the name `data` shadows utils::data(); it is kept because the
# later analysis code refers to it.
data = merge(word_list, evaluation, by = c("lemma"))
# Histograms of both metrics.
hist(data$precision)
hist(data$recall)
# Shapiro-Wilk normality test for precision.
shapiro.test(data$precision)
##
## Shapiro-Wilk normality test
##
## data: data$precision
## W = 0.63833, p-value = 1.172e-07
# Shapiro-Wilk normality test for recall.
shapiro.test(data$recall)
##
## Shapiro-Wilk normality test
##
## data: data$recall
## W = 0.62816, p-value = 8.599e-08
The distributions are not normal. However, the t-test does not assume that the population is normally distributed; it assumes that the means of the different samples are. By the central limit theorem, means of samples from a population with finite variance approach a normal distribution regardless of the distribution of the population. Rules of thumb say that sample means are approximately normally distributed as long as the sample size is at least 20 or 30. For a t-test to be valid on a smaller sample, the population distribution would have to be approximately normal: the t-test is invalid for small samples from non-normal distributions, but it is valid for large samples from non-normal distributions. In our case we have 32 data points in total, i.e. 16 in each of the two groups compared below, so the t-test results should be interpreted with some caution.
# Side-by-side boxplots of precision and recall, split by the `frequent` flag.
par(mfrow = c(1,2))
boxplot(precision ~ frequent, data = data, main = "Precision by frequency")
boxplot(recall ~ frequent, data = data, main = "Recall by frequency")
# Back to a single-panel layout.
par(mfrow = c(1,1))
Frequency does not seem to make a difference to evaluation metrics.
Is there a difference between frequent and non-frequent lemmas in terms of their precision?
# Mean precision for non-frequent (FALSE) and frequent (TRUE) lemmas.
tapply(data$precision, data$frequent, mean)
## FALSE TRUE
## 0.0375 0.0375
# Standard deviation of precision in the same two groups.
tapply(data$precision, data$frequent, sd)
## FALSE TRUE
## 0.06191392 0.06191392
boxplot(data$precision ~ data$frequent)
# Unpaired Welch two-sample t-test, which is the default. `paired` is not
# passed because the formula method rejects that argument from R 4.4.0 on.
t.test(precision ~ frequent, data = data)
##
## Welch Two Sample t-test
##
## data: precision by frequent
## t = 0, df = 30, p-value = 1
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.04470509 0.04470509
## sample estimates:
## mean in group FALSE mean in group TRUE
## 0.0375 0.0375
No.
Is there a difference between frequent and non-frequent lemmas in terms of their recall?
# Mean recall for non-frequent (FALSE) and frequent (TRUE) lemmas.
tapply(data$recall, data$frequent, mean)
## FALSE TRUE
## 0.05797697 0.04195350
# Standard deviation of recall in the same two groups.
tapply(data$recall, data$frequent, sd)
## FALSE TRUE
## 0.1051991 0.0758737
boxplot(data$recall ~ data$frequent)
# Unpaired Welch two-sample t-test, which is the default. `paired` is not
# passed because the formula method rejects that argument from R 4.4.0 on.
t.test(recall ~ frequent, data = data)
##
## Welch Two Sample t-test
##
## data: recall by frequent
## t = 0.49415, df = 27.282, p-value = 0.6252
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.05047807 0.08252501
## sample estimates:
## mean in group FALSE mean in group TRUE
## 0.05797697 0.04195350
No.
The high p-value indicates that we cannot reject the null hypothesis that the two groups have the same mean.
# Side-by-side boxplots of precision and recall, split by the `polysemous` flag.
par(mfrow = c(1,2))
boxplot(precision ~ polysemous, data = data, main = "Precision by polysemy")
boxplot(recall ~ polysemous, data = data, main = "Recall by polysemy")
# Back to a single-panel layout.
par(mfrow = c(1,1))
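Judging from the boxplots, polysemy, rather than frequency, seems to make a difference to evaluation metrics, with polysemous lemmas performing worse than monosemous ones. The tests below check whether this difference is statistically significant.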
Is there a difference between polysemous and monosemous lemmas in terms of their precision?
# Mean precision for monosemous (FALSE) and polysemous (TRUE) lemmas.
tapply(data$precision, data$polysemous, mean)
## FALSE TRUE
## 0.04375 0.03125
# Standard deviation of precision in the same two groups.
tapply(data$precision, data$polysemous, sd)
## FALSE TRUE
## 0.06291529 0.06020797
boxplot(data$precision ~ data$polysemous)
# Unpaired Welch two-sample t-test, which is the default. `paired` is not
# passed because the formula method rejects that argument from R 4.4.0 on.
t.test(precision ~ polysemous, data = data)
##
## Welch Two Sample t-test
##
## data: precision by polysemous
## t = 0.57417, df = 29.942, p-value = 0.5701
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.03196507 0.05696507
## sample estimates:
## mean in group FALSE mean in group TRUE
## 0.04375 0.03125
No. The high p-value indicates that we cannot reject the null hypothesis that the two groups have the same mean.
Is there a difference between polysemous and monosemous lemmas in terms of their recall?
# Mean recall for monosemous (FALSE) and polysemous (TRUE) lemmas.
tapply(data$recall, data$polysemous, mean)
## FALSE TRUE
## 0.06329853 0.03663194
# Standard deviation of recall in the same two groups.
tapply(data$recall, data$polysemous, sd)
## FALSE TRUE
## 0.10637037 0.07257194
boxplot(data$recall ~ data$polysemous)
# Unpaired Welch two-sample t-test, which is the default. `paired` is not
# passed because the formula method rejects that argument from R 4.4.0 on.
t.test(recall ~ polysemous, data = data)
##
## Welch Two Sample t-test
##
## data: recall by polysemous
## t = 0.82836, df = 26.477, p-value = 0.4149
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.03944728 0.09278045
## sample estimates:
## mean in group FALSE mean in group TRUE
## 0.06329853 0.03663194
No. The high p-value indicates that we cannot reject the null hypothesis that the two groups have the same mean.
# Linear model of precision on both lemma flags (additive, no interaction term).
ev.pr.lm <- lm(precision ~ frequent + polysemous, data = data)
summary(ev.pr.lm)
##
## Call:
## lm(formula = precision ~ frequent + polysemous, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.04375 -0.04375 -0.03125 0.05625 0.16875
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.375e-02 1.918e-02 2.281 0.030 *
## frequentTRUE -1.472e-18 2.214e-02 0.000 1.000
## polysemousTRUE -1.250e-02 2.214e-02 -0.565 0.577
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06263 on 29 degrees of freedom
## Multiple R-squared: 0.01087, Adjusted R-squared: -0.05735
## F-statistic: 0.1593 on 2 and 29 DF, p-value: 0.8534
Model diagnostics:
# Diagnostic plots for the precision model, arranged in a 2 x 2 grid.
par(mfrow = c(2,2))
plot(ev.pr.lm)
## hat values (leverages) are all = 0.09375
## and there are no factor predictors; no plot no. 5
# Back to a single-panel layout.
par(mfrow = c(1,1))
Not a good model.
# Linear model of recall on both lemma flags (additive, no interaction term).
ev.re.lm <- lm(recall ~ frequent + polysemous, data = data)
summary(ev.re.lm)
##
## Call:
## lm(formula = recall ~ frequent + polysemous, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.071310 -0.055287 -0.044644 0.008989 0.262023
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.07131 0.02824 2.525 0.0173 *
## frequentTRUE -0.01602 0.03261 -0.491 0.6268
## polysemousTRUE -0.02667 0.03261 -0.818 0.4201
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.09223 on 29 degrees of freedom
## Multiple R-squared: 0.03043, Adjusted R-squared: -0.03643
## F-statistic: 0.4552 on 2 and 29 DF, p-value: 0.6388
Model diagnostics:
# Diagnostic plots for the recall model, arranged in a 2 x 2 grid.
par(mfrow = c(2,2))
plot(ev.re.lm)
## hat values (leverages) are all = 0.09375
## and there are no factor predictors; no plot no. 5
# Back to a single-panel layout.
par(mfrow = c(1,1))
Not a good model.
Loop over all parameter combinations:
# Parameter grid; the commented-out lines are alternative, smaller grids.
windows <- c(1, 5, 10)
#windows = c(5,10)
freq_thresholds <- c(1, 20, 50, 100)
#freq_thresholds = c(20,50,100)
lexicons <- c("SCHMIDT", "AGWN", "POLLUX")
#lexicons = c("POLLUX")

# Empty accumulator with the columns collected for every combination.
evaluations <- data.frame(
  lemma = character(),
  precision = double(),
  recall = double(),
  window = integer(),
  freq_threshold = integer(),
  lexicon = factor(),
  stringsAsFactors = FALSE
)

# NOTE: the loop variables `window` and `lexicon`, and the per-run
# `evaluation`, overwrite the objects of the same name created earlier.
for (window in windows) {
  print(paste0("Window:", window))
  for (f in freq_thresholds) {
    print(paste0("Freq threshold:", f))
    for (lexicon in lexicons) {
      print(paste0("Lexicon:", lexicon))
      # AGWN at frequency threshold 1 is skipped for windows 1, 5 and 10.
      if (lexicon == "AGWN" && f == 1 && (window == 1 || window == 5 || window == 10)) {
        next
      }
      print(paste("Continue with", "window:", window, "Freq threshold:", f, "Lexicon:", lexicon))
      evaluation <- process_evaluation_text_fun(path, window, f, lexicon)
      evaluation <- evaluation[, c("lemma", "precision", "recall")]
      # Tag every row with the parameters of this run before appending.
      evaluation$window <- window
      evaluation$freq_threshold <- f
      evaluation$lexicon <- lexicon
      evaluations <- rbind(evaluations, evaluation)
    }
  }
}
## [1] "Window:1"
## [1] "Freq threshold:1"
## [1] "Lexicon:SCHMIDT"
## [1] "Continue with window: 1 Freq threshold: 1 Lexicon: SCHMIDT"
## [1] "Processing evaluation text for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 1 1 SCHMIDT"
## [1] "Reading evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 1 1 SCHMIDT"
## [1] "Creating name of evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 1 1 SCHMIDT"
## [1] "Path_ev: /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project/Evaluation/output/semantic-space-w1_t1/Lexicon_SCHMIDT"
## [1] "Evaluation_file_name: summary_overlap_Lexicon_SCHMIDT_semantic-space_w1_t1_neighbours.txt"
## [1] "Length of evaluation text: 1"
## [1] "Dimension of evaluation data frame: 1029"
## [2] "Dimension of evaluation data frame: 6"
## [1] "Lexicon:AGWN"
## [1] "Lexicon:POLLUX"
## [1] "Continue with window: 1 Freq threshold: 1 Lexicon: POLLUX"
## [1] "Processing evaluation text for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 1 1 POLLUX"
## [1] "Reading evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 1 1 POLLUX"
## [1] "Creating name of evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 1 1 POLLUX"
## [1] "Path_ev: /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project/Evaluation/output/semantic-space-w1_t1/Lexicon_POLLUX"
## [1] "Evaluation_file_name: summary_overlap_Lexicon_POLLUX_semantic-space_w1_t1_neighbours.txt"
## [1] "Length of evaluation text: 1"
## [1] "Dimension of evaluation data frame: 309"
## [2] "Dimension of evaluation data frame: 6"
## [1] "Freq threshold:20"
## [1] "Lexicon:SCHMIDT"
## [1] "Continue with window: 1 Freq threshold: 20 Lexicon: SCHMIDT"
## [1] "Processing evaluation text for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 1 20 SCHMIDT"
## [1] "Reading evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 1 20 SCHMIDT"
## [1] "Creating name of evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 1 20 SCHMIDT"
## [1] "Path_ev: /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project/Evaluation/output/semantic-space-w1_t20/Lexicon_SCHMIDT"
## [1] "Evaluation_file_name: summary_overlap_Lexicon_SCHMIDT_semantic-space_w1_t20_neighbours.txt"
## [1] "Length of evaluation text: 1"
## [1] "Dimension of evaluation data frame: 701"
## [2] "Dimension of evaluation data frame: 6"
## [1] "Lexicon:AGWN"
## [1] "Continue with window: 1 Freq threshold: 20 Lexicon: AGWN"
## [1] "Processing evaluation text for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 1 20 AGWN"
## [1] "Reading evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 1 20 AGWN"
## [1] "Creating name of evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 1 20 AGWN"
## [1] "Path_ev: /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project/Evaluation/output/semantic-space-w1_t20/Lexicon_AGWN"
## [1] "Evaluation_file_name: summary_overlap_Lexicon_AGWN_semantic-space_w1_t20_neighbours.txt"
## [1] "Length of evaluation text: 1"
## [1] "Dimension of evaluation data frame: 6864"
## [2] "Dimension of evaluation data frame: 6"
## [1] "Lexicon:POLLUX"
## [1] "Continue with window: 1 Freq threshold: 20 Lexicon: POLLUX"
## [1] "Processing evaluation text for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 1 20 POLLUX"
## [1] "Reading evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 1 20 POLLUX"
## [1] "Creating name of evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 1 20 POLLUX"
## [1] "Path_ev: /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project/Evaluation/output/semantic-space-w1_t20/Lexicon_POLLUX"
## [1] "Evaluation_file_name: summary_overlap_Lexicon_POLLUX_semantic-space_w1_t20_neighbours.txt"
## [1] "Length of evaluation text: 1"
## [1] "Dimension of evaluation data frame: 236"
## [2] "Dimension of evaluation data frame: 6"
## [1] "Freq threshold:50"
## [1] "Lexicon:SCHMIDT"
## [1] "Continue with window: 1 Freq threshold: 50 Lexicon: SCHMIDT"
## [1] "Processing evaluation text for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 1 50 SCHMIDT"
## [1] "Reading evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 1 50 SCHMIDT"
## [1] "Creating name of evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 1 50 SCHMIDT"
## [1] "Path_ev: /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project/Evaluation/output/semantic-space-w1_t50/Lexicon_SCHMIDT"
## [1] "Evaluation_file_name: summary_overlap_Lexicon_SCHMIDT_semantic-space_w1_t50_neighbours.txt"
## [1] "Length of evaluation text: 1"
## [1] "Dimension of evaluation data frame: 531"
## [2] "Dimension of evaluation data frame: 6"
## [1] "Lexicon:AGWN"
## [1] "Continue with window: 1 Freq threshold: 50 Lexicon: AGWN"
## [1] "Processing evaluation text for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 1 50 AGWN"
## [1] "Reading evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 1 50 AGWN"
## [1] "Creating name of evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 1 50 AGWN"
## [1] "Path_ev: /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project/Evaluation/output/semantic-space-w1_t50/Lexicon_AGWN"
## [1] "Evaluation_file_name: summary_overlap_Lexicon_AGWN_semantic-space_w1_t50_neighbours.txt"
## [1] "Length of evaluation text: 1"
## [1] "Dimension of evaluation data frame: 4666"
## [2] "Dimension of evaluation data frame: 6"
## [1] "Lexicon:POLLUX"
## [1] "Continue with window: 1 Freq threshold: 50 Lexicon: POLLUX"
## [1] "Processing evaluation text for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 1 50 POLLUX"
## [1] "Reading evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 1 50 POLLUX"
## [1] "Creating name of evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 1 50 POLLUX"
## [1] "Path_ev: /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project/Evaluation/output/semantic-space-w1_t50/Lexicon_POLLUX"
## [1] "Evaluation_file_name: summary_overlap_Lexicon_POLLUX_semantic-space_w1_t50_neighbours.txt"
## [1] "Length of evaluation text: 1"
## [1] "Dimension of evaluation data frame: 177"
## [2] "Dimension of evaluation data frame: 6"
## [1] "Freq threshold:100"
## [1] "Lexicon:SCHMIDT"
## [1] "Continue with window: 1 Freq threshold: 100 Lexicon: SCHMIDT"
## [1] "Processing evaluation text for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 1 100 SCHMIDT"
## [1] "Reading evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 1 100 SCHMIDT"
## [1] "Creating name of evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 1 100 SCHMIDT"
## [1] "Path_ev: /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project/Evaluation/output/semantic-space-w1_t100/Lexicon_SCHMIDT"
## [1] "Evaluation_file_name: summary_overlap_Lexicon_SCHMIDT_semantic-space_w1_t100_neighbours.txt"
## [1] "Length of evaluation text: 1"
## [1] "Dimension of evaluation data frame: 423"
## [2] "Dimension of evaluation data frame: 6"
## [1] "Lexicon:AGWN"
## [1] "Continue with window: 1 Freq threshold: 100 Lexicon: AGWN"
## [1] "Processing evaluation text for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 1 100 AGWN"
## [1] "Reading evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 1 100 AGWN"
## [1] "Creating name of evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 1 100 AGWN"
## [1] "Path_ev: /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project/Evaluation/output/semantic-space-w1_t100/Lexicon_AGWN"
## [1] "Evaluation_file_name: summary_overlap_Lexicon_AGWN_semantic-space_w1_t100_neighbours.txt"
## [1] "Length of evaluation text: 1"
## [1] "Dimension of evaluation data frame: 3329"
## [2] "Dimension of evaluation data frame: 6"
## [1] "Lexicon:POLLUX"
## [1] "Continue with window: 1 Freq threshold: 100 Lexicon: POLLUX"
## [1] "Processing evaluation text for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 1 100 POLLUX"
## [1] "Reading evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 1 100 POLLUX"
## [1] "Creating name of evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 1 100 POLLUX"
## [1] "Path_ev: /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project/Evaluation/output/semantic-space-w1_t100/Lexicon_POLLUX"
## [1] "Evaluation_file_name: summary_overlap_Lexicon_POLLUX_semantic-space_w1_t100_neighbours.txt"
## [1] "Length of evaluation text: 1"
## [1] "Dimension of evaluation data frame: 146"
## [2] "Dimension of evaluation data frame: 6"
## [1] "Window:5"
## [1] "Freq threshold:1"
## [1] "Lexicon:SCHMIDT"
## [1] "Continue with window: 5 Freq threshold: 1 Lexicon: SCHMIDT"
## [1] "Processing evaluation text for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 5 1 SCHMIDT"
## [1] "Reading evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 5 1 SCHMIDT"
## [1] "Creating name of evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 5 1 SCHMIDT"
## [1] "Path_ev: /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project/Evaluation/output/semantic-space-w5_t1/Lexicon_SCHMIDT"
## [1] "Evaluation_file_name: summary_overlap_Lexicon_SCHMIDT_semantic-space_w5_t1_neighbours.txt"
## [1] "Length of evaluation text: 1"
## [1] "Dimension of evaluation data frame: 1046"
## [2] "Dimension of evaluation data frame: 6"
## [1] "Lexicon:AGWN"
## [1] "Lexicon:POLLUX"
## [1] "Continue with window: 5 Freq threshold: 1 Lexicon: POLLUX"
## [1] "Processing evaluation text for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 5 1 POLLUX"
## [1] "Reading evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 5 1 POLLUX"
## [1] "Creating name of evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 5 1 POLLUX"
## [1] "Path_ev: /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project/Evaluation/output/semantic-space-w5_t1/Lexicon_POLLUX"
## [1] "Evaluation_file_name: summary_overlap_Lexicon_POLLUX_semantic-space_w5_t1_neighbours.txt"
## [1] "Length of evaluation text: 1"
## [1] "Dimension of evaluation data frame: 313"
## [2] "Dimension of evaluation data frame: 6"
## [1] "Freq threshold:20"
## [1] "Lexicon:SCHMIDT"
## [1] "Continue with window: 5 Freq threshold: 20 Lexicon: SCHMIDT"
## [1] "Processing evaluation text for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 5 20 SCHMIDT"
## [1] "Reading evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 5 20 SCHMIDT"
## [1] "Creating name of evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 5 20 SCHMIDT"
## [1] "Path_ev: /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project/Evaluation/output/semantic-space-w5_t20/Lexicon_SCHMIDT"
## [1] "Evaluation_file_name: summary_overlap_Lexicon_SCHMIDT_semantic-space_w5_t20_neighbours.txt"
## [1] "Length of evaluation text: 1"
## [1] "Dimension of evaluation data frame: 701"
## [2] "Dimension of evaluation data frame: 6"
## [1] "Lexicon:AGWN"
## [1] "Continue with window: 5 Freq threshold: 20 Lexicon: AGWN"
## [1] "Processing evaluation text for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 5 20 AGWN"
## [1] "Reading evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 5 20 AGWN"
## [1] "Creating name of evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 5 20 AGWN"
## [1] "Path_ev: /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project/Evaluation/output/semantic-space-w5_t20/Lexicon_AGWN"
## [1] "Evaluation_file_name: summary_overlap_Lexicon_AGWN_semantic-space_w5_t20_neighbours.txt"
## [1] "Length of evaluation text: 1"
## [1] "Dimension of evaluation data frame: 6865"
## [2] "Dimension of evaluation data frame: 6"
## [1] "Lexicon:POLLUX"
## [1] "Continue with window: 5 Freq threshold: 20 Lexicon: POLLUX"
## [1] "Processing evaluation text for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 5 20 POLLUX"
## [1] "Reading evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 5 20 POLLUX"
## [1] "Creating name of evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 5 20 POLLUX"
## [1] "Path_ev: /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project/Evaluation/output/semantic-space-w5_t20/Lexicon_POLLUX"
## [1] "Evaluation_file_name: summary_overlap_Lexicon_POLLUX_semantic-space_w5_t20_neighbours.txt"
## [1] "Length of evaluation text: 1"
## [1] "Dimension of evaluation data frame: 236"
## [2] "Dimension of evaluation data frame: 6"
## [1] "Freq threshold:50"
## [1] "Lexicon:SCHMIDT"
## [1] "Continue with window: 5 Freq threshold: 50 Lexicon: SCHMIDT"
## [1] "Processing evaluation text for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 5 50 SCHMIDT"
## [1] "Reading evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 5 50 SCHMIDT"
## [1] "Creating name of evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 5 50 SCHMIDT"
## [1] "Path_ev: /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project/Evaluation/output/semantic-space-w5_t50/Lexicon_SCHMIDT"
## [1] "Evaluation_file_name: summary_overlap_Lexicon_SCHMIDT_semantic-space_w5_t50_neighbours.txt"
## [1] "Length of evaluation text: 1"
## [1] "Dimension of evaluation data frame: 531"
## [2] "Dimension of evaluation data frame: 6"
## [1] "Lexicon:AGWN"
## [1] "Continue with window: 5 Freq threshold: 50 Lexicon: AGWN"
## [1] "Processing evaluation text for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 5 50 AGWN"
## [1] "Reading evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 5 50 AGWN"
## [1] "Creating name of evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 5 50 AGWN"
## [1] "Path_ev: /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project/Evaluation/output/semantic-space-w5_t50/Lexicon_AGWN"
## [1] "Evaluation_file_name: summary_overlap_Lexicon_AGWN_semantic-space_w5_t50_neighbours.txt"
## [1] "Length of evaluation text: 1"
## [1] "Dimension of evaluation data frame: 4666"
## [2] "Dimension of evaluation data frame: 6"
## [1] "Lexicon:POLLUX"
## [1] "Continue with window: 5 Freq threshold: 50 Lexicon: POLLUX"
## [1] "Processing evaluation text for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 5 50 POLLUX"
## [1] "Reading evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 5 50 POLLUX"
## [1] "Creating name of evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 5 50 POLLUX"
## [1] "Path_ev: /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project/Evaluation/output/semantic-space-w5_t50/Lexicon_POLLUX"
## [1] "Evaluation_file_name: summary_overlap_Lexicon_POLLUX_semantic-space_w5_t50_neighbours.txt"
## [1] "Length of evaluation text: 1"
## [1] "Dimension of evaluation data frame: 177"
## [2] "Dimension of evaluation data frame: 6"
## [1] "Freq threshold:100"
## [1] "Lexicon:SCHMIDT"
## [1] "Continue with window: 5 Freq threshold: 100 Lexicon: SCHMIDT"
## [1] "Processing evaluation text for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 5 100 SCHMIDT"
## [1] "Reading evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 5 100 SCHMIDT"
## [1] "Creating name of evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 5 100 SCHMIDT"
## [1] "Path_ev: /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project/Evaluation/output/semantic-space-w5_t100/Lexicon_SCHMIDT"
## [1] "Evaluation_file_name: summary_overlap_Lexicon_SCHMIDT_semantic-space_w5_t100_neighbours.txt"
## [1] "Length of evaluation text: 1"
## [1] "Dimension of evaluation data frame: 423"
## [2] "Dimension of evaluation data frame: 6"
## [1] "Lexicon:AGWN"
## [1] "Continue with window: 5 Freq threshold: 100 Lexicon: AGWN"
## [1] "Processing evaluation text for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 5 100 AGWN"
## [1] "Reading evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 5 100 AGWN"
## [1] "Creating name of evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 5 100 AGWN"
## [1] "Path_ev: /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project/Evaluation/output/semantic-space-w5_t100/Lexicon_AGWN"
## [1] "Evaluation_file_name: summary_overlap_Lexicon_AGWN_semantic-space_w5_t100_neighbours.txt"
## [1] "Length of evaluation text: 1"
## [1] "Dimension of evaluation data frame: 3329"
## [2] "Dimension of evaluation data frame: 6"
## [1] "Lexicon:POLLUX"
## [1] "Continue with window: 5 Freq threshold: 100 Lexicon: POLLUX"
## [1] "Processing evaluation text for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 5 100 POLLUX"
## [1] "Reading evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 5 100 POLLUX"
## [1] "Creating name of evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 5 100 POLLUX"
## [1] "Path_ev: /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project/Evaluation/output/semantic-space-w5_t100/Lexicon_POLLUX"
## [1] "Evaluation_file_name: summary_overlap_Lexicon_POLLUX_semantic-space_w5_t100_neighbours.txt"
## [1] "Length of evaluation text: 1"
## [1] "Dimension of evaluation data frame: 146"
## [2] "Dimension of evaluation data frame: 6"
## [1] "Window:10"
## [1] "Freq threshold:1"
## [1] "Lexicon:SCHMIDT"
## [1] "Continue with window: 10 Freq threshold: 1 Lexicon: SCHMIDT"
## [1] "Processing evaluation text for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 10 1 SCHMIDT"
## [1] "Reading evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 10 1 SCHMIDT"
## [1] "Creating name of evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 10 1 SCHMIDT"
## [1] "Path_ev: /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project/Evaluation/output/semantic-space-w10_t1/Lexicon_SCHMIDT"
## [1] "Evaluation_file_name: summary_overlap_Lexicon_SCHMIDT_semantic-space_w10_t1_neighbours.txt"
## [1] "Length of evaluation text: 1"
## [1] "Dimension of evaluation data frame: 1046"
## [2] "Dimension of evaluation data frame: 6"
## [1] "Lexicon:AGWN"
## [1] "Lexicon:POLLUX"
## [1] "Continue with window: 10 Freq threshold: 1 Lexicon: POLLUX"
## [1] "Processing evaluation text for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 10 1 POLLUX"
## [1] "Reading evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 10 1 POLLUX"
## [1] "Creating name of evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 10 1 POLLUX"
## [1] "Path_ev: /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project/Evaluation/output/semantic-space-w10_t1/Lexicon_POLLUX"
## [1] "Evaluation_file_name: summary_overlap_Lexicon_POLLUX_semantic-space_w10_t1_neighbours.txt"
## [1] "Length of evaluation text: 1"
## [1] "Dimension of evaluation data frame: 313"
## [2] "Dimension of evaluation data frame: 6"
## [1] "Freq threshold:20"
## [1] "Lexicon:SCHMIDT"
## [1] "Continue with window: 10 Freq threshold: 20 Lexicon: SCHMIDT"
## [1] "Processing evaluation text for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 10 20 SCHMIDT"
## [1] "Reading evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 10 20 SCHMIDT"
## [1] "Creating name of evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 10 20 SCHMIDT"
## [1] "Path_ev: /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project/Evaluation/output/semantic-space-w10_t20/Lexicon_SCHMIDT"
## [1] "Evaluation_file_name: summary_overlap_Lexicon_SCHMIDT_semantic-space_w10_t20_neighbours.txt"
## [1] "Length of evaluation text: 1"
## [1] "Dimension of evaluation data frame: 701"
## [2] "Dimension of evaluation data frame: 6"
## [1] "Lexicon:AGWN"
## [1] "Continue with window: 10 Freq threshold: 20 Lexicon: AGWN"
## [1] "Processing evaluation text for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 10 20 AGWN"
## [1] "Reading evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 10 20 AGWN"
## [1] "Creating name of evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 10 20 AGWN"
## [1] "Path_ev: /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project/Evaluation/output/semantic-space-w10_t20/Lexicon_AGWN"
## [1] "Evaluation_file_name: summary_overlap_Lexicon_AGWN_semantic-space_w10_t20_neighbours.txt"
## [1] "Length of evaluation text: 1"
## [1] "Dimension of evaluation data frame: 6865"
## [2] "Dimension of evaluation data frame: 6"
## [1] "Lexicon:POLLUX"
## [1] "Continue with window: 10 Freq threshold: 20 Lexicon: POLLUX"
## [1] "Processing evaluation text for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 10 20 POLLUX"
## [1] "Reading evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 10 20 POLLUX"
## [1] "Creating name of evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 10 20 POLLUX"
## [1] "Path_ev: /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project/Evaluation/output/semantic-space-w10_t20/Lexicon_POLLUX"
## [1] "Evaluation_file_name: summary_overlap_Lexicon_POLLUX_semantic-space_w10_t20_neighbours.txt"
## [1] "Length of evaluation text: 1"
## [1] "Dimension of evaluation data frame: 236"
## [2] "Dimension of evaluation data frame: 6"
## [1] "Freq threshold:50"
## [1] "Lexicon:SCHMIDT"
## [1] "Continue with window: 10 Freq threshold: 50 Lexicon: SCHMIDT"
## [1] "Processing evaluation text for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 10 50 SCHMIDT"
## [1] "Reading evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 10 50 SCHMIDT"
## [1] "Creating name of evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 10 50 SCHMIDT"
## [1] "Path_ev: /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project/Evaluation/output/semantic-space-w10_t50/Lexicon_SCHMIDT"
## [1] "Evaluation_file_name: summary_overlap_Lexicon_SCHMIDT_semantic-space_w10_t50_neighbours.txt"
## [1] "Length of evaluation text: 1"
## [1] "Dimension of evaluation data frame: 531"
## [2] "Dimension of evaluation data frame: 6"
## [1] "Lexicon:AGWN"
## [1] "Continue with window: 10 Freq threshold: 50 Lexicon: AGWN"
## [1] "Processing evaluation text for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 10 50 AGWN"
## [1] "Reading evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 10 50 AGWN"
## [1] "Creating name of evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 10 50 AGWN"
## [1] "Path_ev: /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project/Evaluation/output/semantic-space-w10_t50/Lexicon_AGWN"
## [1] "Evaluation_file_name: summary_overlap_Lexicon_AGWN_semantic-space_w10_t50_neighbours.txt"
## [1] "Length of evaluation text: 1"
## [1] "Dimension of evaluation data frame: 4666"
## [2] "Dimension of evaluation data frame: 6"
## [1] "Lexicon:POLLUX"
## [1] "Continue with window: 10 Freq threshold: 50 Lexicon: POLLUX"
## [1] "Processing evaluation text for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 10 50 POLLUX"
## [1] "Reading evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 10 50 POLLUX"
## [1] "Creating name of evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 10 50 POLLUX"
## [1] "Path_ev: /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project/Evaluation/output/semantic-space-w10_t50/Lexicon_POLLUX"
## [1] "Evaluation_file_name: summary_overlap_Lexicon_POLLUX_semantic-space_w10_t50_neighbours.txt"
## [1] "Length of evaluation text: 1"
## [1] "Dimension of evaluation data frame: 177"
## [2] "Dimension of evaluation data frame: 6"
## [1] "Freq threshold:100"
## [1] "Lexicon:SCHMIDT"
## [1] "Continue with window: 10 Freq threshold: 100 Lexicon: SCHMIDT"
## [1] "Processing evaluation text for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 10 100 SCHMIDT"
## [1] "Reading evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 10 100 SCHMIDT"
## [1] "Creating name of evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 10 100 SCHMIDT"
## [1] "Path_ev: /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project/Evaluation/output/semantic-space-w10_t100/Lexicon_SCHMIDT"
## [1] "Evaluation_file_name: summary_overlap_Lexicon_SCHMIDT_semantic-space_w10_t100_neighbours.txt"
## [1] "Length of evaluation text: 1"
## [1] "Dimension of evaluation data frame: 423"
## [2] "Dimension of evaluation data frame: 6"
## [1] "Lexicon:AGWN"
## [1] "Continue with window: 10 Freq threshold: 100 Lexicon: AGWN"
## [1] "Processing evaluation text for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 10 100 AGWN"
## [1] "Reading evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 10 100 AGWN"
## [1] "Creating name of evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 10 100 AGWN"
## [1] "Path_ev: /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project/Evaluation/output/semantic-space-w10_t100/Lexicon_AGWN"
## [1] "Evaluation_file_name: summary_overlap_Lexicon_AGWN_semantic-space_w10_t100_neighbours.txt"
## [1] "Length of evaluation text: 1"
## [1] "Dimension of evaluation data frame: 3329"
## [2] "Dimension of evaluation data frame: 6"
## [1] "Lexicon:POLLUX"
## [1] "Continue with window: 10 Freq threshold: 100 Lexicon: POLLUX"
## [1] "Processing evaluation text for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 10 100 POLLUX"
## [1] "Reading evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 10 100 POLLUX"
## [1] "Creating name of evaluation file for /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project 10 100 POLLUX"
## [1] "Path_ev: /Users/bmcgillivray/Documents/OneDrive/The Alan Turing Institute/Martina Astrid Rodda - MAR dphil project/Evaluation/output/semantic-space-w10_t100/Lexicon_POLLUX"
## [1] "Evaluation_file_name: summary_overlap_Lexicon_POLLUX_semantic-space_w10_t100_neighbours.txt"
## [1] "Length of evaluation text: 1"
## [1] "Dimension of evaluation data frame: 146"
## [2] "Dimension of evaluation data frame: 6"
# Store the lexicon label of every evaluation row as a factor.
evaluations[["lexicon"]] <- as.factor(evaluations[["lexicon"]])
Prepare final dataset:
# Join the lemma annotations with the evaluation scores on "lemma"; merge()
# keeps only lemmas present in both tables (its default, all = FALSE).
data <- merge(x = word_list, y = evaluations, by = "lemma")
data[["lexicon"]] <- as.factor(data[["lexicon"]])
# Size and per-column overview of the merged dataset.
dim(data)
## [1] 690 8
summary(data)
## lemma frequent polysemous precision
## ἔχθρα : 33 Mode :logical Mode :logical Min. :0.00000
## ἡσυχία : 33 FALSE:330 FALSE:360 1st Qu.:0.00000
## μῖσος : 33 TRUE :360 TRUE :330 Median :0.00000
## ἄδεια : 21 Mean :0.05565
## ἄνεμος : 21 3rd Qu.:0.10000
## ἀνήρ : 21 Max. :0.40000
## (Other):528
## recall window freq_threshold lexicon
## Min. :0.00000 Min. : 1.000 Min. : 1.00 AGWN :279
## 1st Qu.:0.00000 1st Qu.: 1.000 1st Qu.: 20.00 POLLUX : 36
## Median :0.00000 Median : 5.000 Median : 50.00 SCHMIDT:375
## Mean :0.05318 Mean : 5.333 Mean : 47.07
## 3rd Qu.:0.08333 3rd Qu.:10.000 3rd Qu.:100.00
## Max. :0.50000 Max. :10.000 Max. :100.00
##
# Inspect the distribution of each metric, then run a Shapiro-Wilk normality
# test on precision.
hist(data$precision)
hist(data$recall)
shapiro.test(data$precision)
##
## Shapiro-Wilk normality test
##
## data: data$precision
## W = 0.68646, p-value < 2.2e-16
# Shapiro-Wilk normality test on recall.
shapiro.test(data$recall)
##
## Shapiro-Wilk normality test
##
## data: data$recall
## W = 0.6561, p-value < 2.2e-16
The distributions are not normal. However, the t-test assumes that the means of the different samples are normally distributed; it does not assume that the population is normally distributed. By the central limit theorem, means of samples from a population with finite variance approach a normal distribution regardless of the distribution of the population. Rules of thumb say that the sample means are approximately normally distributed as long as the sample size is at least 20 or 30. For a t-test to be valid on a smaller sample, the population distribution would have to be approximately normal. The t-test is therefore invalid for small samples from non-normal distributions, but valid for large samples from non-normal distributions. In our case we have 690 data points, so the sample-size condition is met. Note, however, that these rows are not fully independent: each lemma appears in many rows (up to 33, one per parameter setting and lexicon), so the p-values below should be read with some caution.
# Side-by-side boxplots of both metrics, split by the `frequent` flag.
par(mfrow = c(1, 2))
boxplot(precision ~ frequent, data = data, main = "Precision by frequency")
boxplot(recall ~ frequent, data = data, main = "Recall by frequency")
# Back to a single-panel layout for the plots that follow.
par(mfrow = c(1, 1))
Frequency does not seem to make a difference to evaluation metrics.
Is there a statistically significant difference between frequent and non-frequent lemmas in terms of their precision?
# Mean and standard deviation of precision for non-frequent (FALSE) and
# frequent (TRUE) lemmas.
tapply(data$precision, data$frequent, mean)
## FALSE TRUE
## 0.06181818 0.05000000
tapply(data$precision, data$frequent, sd)
## FALSE TRUE
## 0.09515479 0.06881407
boxplot(data$precision ~ data$frequent)
# Welch two-sample t-test: unpaired with unequal variances, i.e. the defaults.
# `paired` is deliberately not passed: the formula method of t.test() rejects
# that argument from R 4.4.0 onwards, even when it is FALSE.
t.test(precision ~ frequent, data = data)
##
## Welch Two Sample t-test
##
## data: precision by frequent
## t = 1.855, df = 594.79, p-value = 0.0641
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.0006944971 0.0243308607
## sample estimates:
## mean in group FALSE mean in group TRUE
## 0.06181818 0.05000000
No.
Is there a statistically significant difference between frequent and non-frequent lemmas in terms of their recall?
# Mean and standard deviation of recall for non-frequent (FALSE) and
# frequent (TRUE) lemmas.
tapply(data$recall, data$frequent, mean)
## FALSE TRUE
## 0.05868311 0.04813696
tapply(data$recall, data$frequent, sd)
## FALSE TRUE
## 0.10258151 0.07662795
boxplot(data$recall ~ data$frequent)
# Welch two-sample t-test: unpaired with unequal variances, i.e. the defaults.
# `paired` is deliberately not passed: the formula method of t.test() rejects
# that argument from R 4.4.0 onwards, even when it is FALSE.
t.test(recall ~ frequent, data = data)
##
## Welch Two Sample t-test
##
## data: recall by frequent
## t = 1.5191, df = 606.28, p-value = 0.1293
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.003088139 0.024180439
## sample estimates:
## mean in group FALSE mean in group TRUE
## 0.05868311 0.04813696
No.
The high p-value indicates that we cannot reject the null hypothesis that the two groups have the same mean.
# Side-by-side boxplots of both metrics, split by the `polysemous` flag.
par(mfrow = c(1, 2))
boxplot(precision ~ polysemous, data = data, main = "Precision by polysemy")
boxplot(recall ~ polysemous, data = data, main = "Recall by polysemy")
# Back to a single-panel layout for the plots that follow.
par(mfrow = c(1, 1))
Polysemy, rather than frequency, seems to make a difference to evaluation metrics, with polysemous lemmas performing worse than monosemous ones.
Is there a statistically significant difference between polysemous and monosemous lemmas in terms of their precision?
# Mean and standard deviation of precision for monosemous (FALSE) and
# polysemous (TRUE) lemmas.
tapply(data$precision, data$polysemous, mean)
## FALSE TRUE
## 0.06916667 0.04090909
tapply(data$precision, data$polysemous, sd)
## FALSE TRUE
## 0.08652612 0.07554719
boxplot(data$precision ~ data$polysemous)
# Welch two-sample t-test: unpaired with unequal variances, i.e. the defaults.
# `paired` is deliberately not passed: the formula method of t.test() rejects
# that argument from R 4.4.0 onwards, even when it is FALSE.
t.test(precision ~ polysemous, data = data)
##
## Welch Two Sample t-test
##
## data: precision by polysemous
## t = 4.5785, df = 686.39, p-value = 5.561e-06
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.01613962 0.04037554
## sample estimates:
## mean in group FALSE mean in group TRUE
## 0.06916667 0.04090909
Yes. The low p-value indicates that we can reject the null hypothesis that the two groups have the same mean.
Is there a statistically significant difference between polysemous and monosemous lemmas in terms of their recall?
# Mean and standard deviation of recall for monosemous (FALSE) and
# polysemous (TRUE) lemmas.
tapply(data$recall, data$polysemous, mean)
## FALSE TRUE
## 0.06662527 0.03851405
tapply(data$recall, data$polysemous, sd)
## FALSE TRUE
## 0.09638540 0.08023477
boxplot(data$recall ~ data$polysemous)
# Welch two-sample t-test: unpaired with unequal variances, i.e. the defaults.
# `paired` is deliberately not passed: the formula method of t.test() rejects
# that argument from R 4.4.0 onwards, even when it is FALSE.
t.test(recall ~ polysemous, data = data)
##
## Welch Two Sample t-test
##
## data: recall by polysemous
## t = 4.176, df = 681.79, p-value = 3.351e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.01489414 0.04132829
## sample estimates:
## mean in group FALSE mean in group TRUE
## 0.06662527 0.03851405
Yes. The low p-value indicates that we can reject the null hypothesis that the two groups have the same mean.
# Linear model of precision on the two lemma-level flags (frequent,
# polysemous), followed by its coefficient table.
ev.pr.lm <- lm(precision ~ frequent + polysemous, data = data)
summary(ev.pr.lm)
##
## Call:
## lm(formula = precision ~ frequent + polysemous, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.07584 -0.06333 -0.03477 0.03667 0.35272
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.075839 0.005410 14.017 < 2e-16 ***
## frequentTRUE -0.012511 0.006196 -2.019 0.0439 *
## polysemousTRUE -0.028561 0.006196 -4.609 4.81e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08128 on 687 degrees of freedom
## Multiple R-squared: 0.03496, Adjusted R-squared: 0.03215
## F-statistic: 12.44 on 2 and 687 DF, p-value: 4.914e-06
Model diagnostics:
# Diagnostic plots of ev.pr.lm arranged in a 2 x 2 grid; the layout is reset
# to a single panel afterwards.
par(mfrow = c(2, 2))
plot(ev.pr.lm)
par(mfrow = c(1, 1))
Not a good model.
# Linear model of recall on the two lemma-level flags (frequent, polysemous),
# followed by its coefficient table.
ev.re.lm <- lm(recall ~ frequent + polysemous, data = data)
summary(ev.re.lm)
##
## Call:
## lm(formula = recall ~ frequent + polysemous, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.07262 -0.06138 -0.03300 0.02256 0.46700
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.072617 0.005919 12.269 < 2e-16 ***
## frequentTRUE -0.011234 0.006778 -1.657 0.0979 .
## polysemousTRUE -0.028384 0.006778 -4.187 3.19e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08892 on 687 degrees of freedom
## Multiple R-squared: 0.02823, Adjusted R-squared: 0.0254
## F-statistic: 9.978 on 2 and 687 DF, p-value: 5.349e-05
# Diagnostic plots of ev.re.lm arranged in a 2 x 2 grid; the layout is reset
# to a single panel afterwards.
par(mfrow = c(2, 2))
plot(ev.re.lm)
par(mfrow = c(1, 1))
Not a good model.
# Extended precision model: besides the two lemma-level flags it includes the
# lexicon and the two semantic-space parameters (window, freq_threshold).
ev.pr.lm2 <- lm(precision ~ frequent + polysemous + lexicon + window + freq_threshold, data = data)
summary(ev.pr.lm2)
##
## Call:
## lm(formula = precision ~ frequent + polysemous + lexicon + window +
## freq_threshold, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.16197 -0.05045 -0.03461 0.04837 0.35301
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.927e-02 8.919e-03 7.767 2.95e-14 ***
## frequentTRUE -1.558e-02 5.953e-03 -2.616 0.00909 **
## polysemousTRUE -1.756e-02 6.091e-03 -2.883 0.00406 **
## lexiconPOLLUX 1.102e-01 1.420e-02 7.763 3.03e-14 ***
## lexiconSCHMIDT -6.969e-04 6.267e-03 -0.111 0.91150
## window -3.849e-04 8.052e-04 -0.478 0.63281
## freq_threshold -8.705e-06 8.439e-05 -0.103 0.91787
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.07787 on 683 degrees of freedom
## Multiple R-squared: 0.1193, Adjusted R-squared: 0.1115
## F-statistic: 15.42 on 6 and 683 DF, p-value: < 2.2e-16
# Diagnostic plots of ev.pr.lm2 arranged in a 2 x 2 grid; the layout is reset
# to a single panel afterwards.
par(mfrow = c(2, 2))
plot(ev.pr.lm2)
par(mfrow = c(1, 1))
Not a good model.
Stepwise regression:
# Intercept-only model for precision: the starting point of the search.
ev.pr.m0 <- lm(precision ~ 1, data = data)
# AIC-based stepwise selection; `scope` gives the largest model considered.
# No `direction` is supplied, so step() uses its default ("both");
# direction = "forward" is left disabled.
ev.pr.lm.step <- step(
  ev.pr.m0,
  scope = ~ frequent + polysemous + lexicon + window + recall + freq_threshold
)
## Start: AIC=-3440.07
## precision ~ 1
##
## Df Sum of Sq RSS AIC
## + recall 1 2.49475 2.2082 -3959.7
## + lexicon 2 0.46813 4.2348 -3508.4
## + polysemous 1 0.13748 4.5655 -3458.5
## + frequent 1 0.02405 4.6789 -3441.6
## <none> 4.7030 -3440.1
## + window 1 0.00139 4.7016 -3438.3
## + freq_threshold 1 0.00135 4.7016 -3438.3
##
## Step: AIC=-3959.71
## precision ~ recall
##
## Df Sum of Sq RSS AIC
## + lexicon 2 0.41593 1.7923 -4099.7
## + polysemous 1 0.01585 2.1924 -3962.7
## <none> 2.2082 -3959.7
## + frequent 1 0.00393 2.2043 -3958.9
## + freq_threshold 1 0.00122 2.2070 -3958.1
## + window 1 0.00050 2.2077 -3957.9
## - recall 1 2.49475 4.7030 -3440.1
##
## Step: AIC=-4099.71
## precision ~ recall + lexicon
##
## Df Sum of Sq RSS AIC
## + frequent 1 0.01010 1.7822 -4101.6
## <none> 1.7923 -4099.7
## + window 1 0.00055 1.7917 -4097.9
## + freq_threshold 1 0.00014 1.7921 -4097.8
## + polysemous 1 0.00001 1.7923 -4097.7
## - lexicon 2 0.41593 2.2082 -3959.7
## - recall 1 2.44255 4.2348 -3508.4
##
## Step: AIC=-4101.61
## precision ~ recall + lexicon + frequent
##
## Df Sum of Sq RSS AIC
## <none> 1.7822 -4101.6
## + window 1 0.00053 1.7816 -4099.8
## - frequent 1 0.01010 1.7923 -4099.7
## + freq_threshold 1 0.00006 1.7821 -4099.6
## + polysemous 1 0.00000 1.7822 -4099.6
## - lexicon 2 0.42210 2.2043 -3958.9
## - recall 1 2.41178 4.1940 -3513.1
# Coefficient table of the precision model chosen by the stepwise search.
summary(ev.pr.lm.step)
##
## Call:
## lm(formula = precision ~ recall + lexicon + frequent, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.145057 -0.026309 -0.007445 0.004614 0.230953
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.034002 0.003739 9.094 < 2e-16 ***
## recall 0.675224 0.022177 30.447 < 2e-16 ***
## lexiconPOLLUX 0.080277 0.009142 8.781 < 2e-16 ***
## lexiconSCHMIDT -0.026557 0.004122 -6.442 2.21e-10 ***
## frequentTRUE -0.007693 0.003905 -1.970 0.0492 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.05101 on 685 degrees of freedom
## Multiple R-squared: 0.6211, Adjusted R-squared: 0.6188
## F-statistic: 280.7 on 4 and 685 DF, p-value: < 2.2e-16
The higher the recall, the higher the precision, as expected. More interestingly, Schmidt’s lexicon is associated with lower precision than the baseline, AGWN.
Model diagnostics:
# Diagnostic plots of the selected precision model in a 2 x 2 grid; the layout
# is reset to a single panel afterwards.
par(mfrow = c(2, 2))
plot(ev.pr.lm.step)
par(mfrow = c(1, 1))
Not a great model.
# Intercept-only model for recall: the starting point of the search.
ev.re.m0 <- lm(recall ~ 1, data = data)
# AIC-based stepwise selection with step()'s default direction; `scope` gives
# the largest model considered.
# NOTE(review): freq_threshold is not part of this scope, unlike the scope of
# the precision search -- confirm that the omission is intentional.
ev.re.lm.step <- step(
  ev.re.m0,
  scope = ~ frequent + polysemous + lexicon + window + precision
)
## Start: AIC=-3320.94
## recall ~ 1
##
## Df Sum of Sq RSS AIC
## + precision 1 2.96486 2.6243 -3840.6
## + lexicon 2 0.27679 5.3124 -3352.0
## + polysemous 1 0.13606 5.4531 -3335.9
## + frequent 1 0.01915 5.5700 -3321.3
## <none> 5.5892 -3320.9
## + window 1 0.00798 5.5812 -3319.9
##
## Step: AIC=-3840.59
## recall ~ precision
##
## Df Sum of Sq RSS AIC
## + lexicon 2 0.37600 2.2483 -3943.3
## <none> 2.6243 -3840.6
## + polysemous 1 0.00571 2.6186 -3840.1
## + window 1 0.00357 2.6208 -3839.5
## + frequent 1 0.00023 2.6241 -3838.6
## - precision 1 2.96486 5.5892 -3320.9
##
## Step: AIC=-3943.29
## recall ~ precision + lexicon
##
## Df Sum of Sq RSS AIC
## + polysemous 1 0.02066 2.2277 -3947.7
## <none> 2.2483 -3943.3
## + window 1 0.00332 2.2450 -3942.3
## + frequent 1 0.00048 2.2479 -3941.4
## - lexicon 2 0.37600 2.6243 -3840.6
## - precision 1 3.06407 5.3124 -3352.0
##
## Step: AIC=-3947.66
## recall ~ precision + lexicon + polysemous
##
## Df Sum of Sq RSS AIC
## <none> 2.2277 -3947.7
## + window 1 0.00336 2.2243 -3946.7
## + frequent 1 0.00036 2.2273 -3945.8
## - polysemous 1 0.02066 2.2483 -3943.3
## - lexicon 2 0.39095 2.6186 -3840.1
## - precision 1 2.97423 5.2019 -3364.5
# direction = "forward" (left disabled; step() ran with its default direction)
# Coefficient table of the recall model chosen by the stepwise search.
summary(ev.re.lm.step)
##
## Call:
## lm(formula = recall ~ precision + lexicon + polysemous, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.15445 -0.03198 -0.01482 0.01828 0.31073
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.006976 0.004405 -1.584 0.1137
## precision 0.843005 0.027876 30.242 < 2e-16 ***
## lexiconPOLLUX -0.048343 0.010777 -4.486 8.52e-06 ***
## lexiconSCHMIDT 0.038958 0.004509 8.641 < 2e-16 ***
## polysemousTRUE -0.011308 0.004486 -2.521 0.0119 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.05703 on 685 degrees of freedom
## Multiple R-squared: 0.6014, Adjusted R-squared: 0.5991
## F-statistic: 258.4 on 4 and 685 DF, p-value: < 2.2e-16
# Save the diagnostic plots of the selected recall model as a PNG file under
# <path>/Evaluation/plots.
png(file.path(path, "Evaluation", "plots", "model_diagnostics.png"))
par(mfrow = c(2, 2))
plot(ev.re.lm.step)
par(mfrow = c(1, 1))
# Close the PNG device so that the file is written out.
dev.off()
## quartz_off_screen
## 2
# Histogram of the residuals of the selected recall model.
# NOTE(review): resid() returns raw (unstandardised) residuals; use
# rstandard() instead if standardised residuals are intended.
hist(resid(ev.re.lm.step))
# Only the first diagnostic plot (residuals against fitted values).
plot(ev.re.lm.step, which = 1)
The histogram of the residuals suggests that the assumption of normality is reasonable. The plot of residuals against fitted values suggests that the assumption of homoscedasticity holds: there is no visible pattern in the scatterplot, and its spread stays roughly the same as the fitted values increase.
The higher the precision, the higher the recall, as expected. More interestingly, Schmidt’s lexicon is associated with higher recall than the baseline, AGWN.