Load packages

#Load packages
library(here)
## here() starts at /Users/davidgoldstein/Documents/LaTeX/HomerObliqueCase
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.2.1     ✓ purrr   0.3.3
## ✓ tibble  2.1.3     ✓ dplyr   0.8.3
## ✓ tidyr   1.0.0     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.4.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(R2jags)
## Loading required package: rjags
## Loading required package: coda
## Linked to JAGS 4.3.0
## Loaded modules: basemod,bugs
## 
## Attaching package: 'R2jags'
## The following object is masked from 'package:coda':
## 
##     traceplot
library(magrittr)
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
library(stringi)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(ggthemes)

Read in data

#This is where the dataset is
here::here("Data", "Homer_Phi.csv")
#Read in .csv file
phi.data <- read.csv(file = "Data/Homer_Phi.csv",
                     stringsAsFactors = FALSE, 
                     header = TRUE)
#Make factors---Do not make Greek words into factors
phi.data$Suffix <- factor(phi.data$Suffix)
phi.data$Text <- factor(phi.data$Text)
#Convert to Roman script
phi.data$Form.Latin <- stri_trans_general(phi.data$Form, "greek-latin")
#Remove prepositions (must be character strings and not factors)
phi.data <- phi.data %>% 
  filter(Form.Latin != "nósphi" & Form.Latin != "nósphin" & 
         Form.Latin != "aponósphi" & Form.Latin != "aponósphin")
#Unique forms
unique.phi.data <- phi.data %>% distinct(Entry, .keep_all = TRUE) #Don't use Form.Latin for constraint
## [1] "/Users/davidgoldstein/Documents/LaTeX/HomerObliqueCase/Data/Homer_Phi.csv"

Frequency distributions

Token and type frequency

#Token frequency
phi.token.freq <- count(phi.data, Text)
count1.bar <- ggplot(data = phi.token.freq, aes(x = Text, y = n, label = n)) +
  geom_bar(stat = "identity", color = "black", fill = "white", width = 0.5) +
  labs(y = "", x = "", size = 20) +
  geom_text(size = 11, position = position_stack(vjust = 0.5)) +
  theme_tufte() +
  ggtitle("Token frequency") +
  theme(text=element_text(size = 22)) +
    theme(axis.text.x = element_text(angle = 45, vjust = .5)) 

#Type frequency
phi.type.freq <- count(unique.phi.data, Text)
count2.bar <- ggplot(data = phi.type.freq, aes(x = Text, y = n, label = n)) +
  geom_bar(stat = "identity", color="black", fill="white", width = 0.5) +
  labs(y = "", x ="", size = 20) +
  geom_text(size = 11, position = position_stack(vjust = 0.5)) +
  theme_tufte() +
  ggtitle("Type frequency") +
  theme(text=element_text(size = 22)) +
    theme(axis.text.x = element_text(angle = 45, vjust = .5))
#Combine bar graphs in grid
grid.arrange(count1.bar, count2.bar, ncol = 2)
Frequency distribution of -φι(ν) in the _Iliad_ and _Odyssey_

Token frequency of each word form

order.table <- count(phi.data, Entry)
#NB reorder(Entry, n) and reorder(Entry, -n)
zipf.phi.bar <- ggplot(data = order.table, aes(x = reorder(Entry, n), y = n)) +
  geom_bar(stat = "identity", width = 0.35, color = "black", fill = "white") +
  coord_fixed(ratio = 0.85) +
  labs(y = "", x = "") +
  theme_tufte() +
  theme(text = element_text(size = 16)) +
  scale_x_discrete(expand = c(0.00000000000000000001, 0)) 
#coord_flip() supersedes the coord_fixed() set above
zipf.phi.bar + 
  coord_flip() +
  labs(y = "Frequency", x = "Word form")  
Frequency according to word form

Frequency distribution according to grammatical number

homeric.counts <- data.frame(
  Number = c("Singular", "Dual", "Plural", "Unclassified", 
             "Singular", "Dual", "Plural", "Unclassified"),
  Tokens = c(56976, 752, 26088, 16, 17, 0, 137, 10),
  Frequency = c(56976/83832, 752/83832, 26088/83832, 16/83832, 
                17/164, 0/164, 137/164, 10/164),
  Group = c("Non-φι(ν)", "Non-φι(ν)", "Non-φι(ν)", "Non-φι(ν)", 
            "-φι(ν)", "-φι(ν)", "-φι(ν)", "-φι(ν)"))

homeric.counts$Number <- factor(homeric.counts$Number, levels = c("Singular", "Dual", "Plural", "Unclassified"))
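#Sanity check (not part of the original script): the hard-coded token counts
#should sum to the denominators used for the relative frequencies,
#83832 non-φι(ν) tokens and 164 -φι(ν) tokens
homeric.counts %>% group_by(Group) %>% summarise(Total = sum(Tokens))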

ggplot(data = homeric.counts, mapping = aes(x = Number, y = Frequency)) +
  geom_bar(stat = "identity") +
  facet_wrap(~Group, ncol = 2) +
  theme_tufte() +
  theme(text=element_text(size = 16), axis.title.x = element_blank()) +
  ylab("Relative frequency") 
Relative frequency according to grammatical number

An excursus on dual φι(ν)-forms

The code for the Bayesian analysis below was adapted from Lee and Wagenmakers, _Bayesian Cognitive Modeling_ (Cambridge 2013), pp. 127–129.

Data

#s successes (dual tokens), n trials (total tokens)
s1 <- 0      #dual -φι(ν) tokens
n1 <- 164    #all -φι(ν) tokens
s2 <- 752    #dual non-φι(ν) tokens
n2 <- 83832  #all non-φι(ν) tokens

Two-sided test for equality of population proportions

# Two-sided p-value 
prop.test(c(s1, s2), c(n1, n2), alternative = c("two.sided")) #approximate
## Warning in prop.test(c(s1, s2), c(n1, n2), alternative = c("two.sided")): Chi-
## squared approximation may be incorrect
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(s1, s2) out of c(n1, n2)
## X-squared = 0.64556, df = 1, p-value = 0.4217
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  -0.012663316 -0.005277327
## sample estimates:
##      prop 1      prop 2 
## 0.000000000 0.008970322
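
The warning arises because one cell of the table (dual -φι(ν) tokens) is zero, so the chi-squared approximation is questionable. As a cross-check, not part of the original analysis, one could also run Fisher's exact test on the corresponding 2x2 table:

#Exact alternative (sketch, not in the original script): rows are dual vs.
#non-dual tokens, columns are -φι(ν) vs. non-φι(ν) forms
fisher.test(matrix(c(s1, n1 - s1, s2, n2 - s2), nrow = 2))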

Bayes Factor

# Analytical Bayes factor
log.BF01 <- lchoose(n1, s1) + lchoose(n2, s2) + log(n1 + 1) + log(n2 + 1) - 
  lchoose(n1 + n2, s1 + s2) - log(n1 + n2 + 1)
BF01 <- exp(log.BF01)
BF01 
## [1] 37.6254
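
Assuming, as in the Lee and Wagenmakers setup, uniform Beta(1,1) priors on the two rates under the alternative and a single shared uniform rate under the null, the beta-binomial marginal likelihoods give this closed form:

$$\mathrm{BF}_{01} \;=\; \frac{p(s_1, s_2 \mid H_0)}{p(s_1, s_2 \mid H_1)} \;=\; \frac{\binom{n_1}{s_1}\binom{n_2}{s_2}\,(n_1 + 1)(n_2 + 1)}{\binom{n_1 + n_2}{s_1 + s_2}\,(n_1 + n_2 + 1)}$$

The logarithm of this expression is exactly the quantity assigned to log.BF01 above.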

Model specification

data  <- list("s1", "s2", "n1", "n2") 

myinits <- list(
  list(theta1 = runif(1), theta2 = runif(1), theta1prior = runif(1), theta2prior = runif(1)),
  list(theta1 = runif(1), theta2 = runif(1), theta1prior = runif(1), theta2prior = runif(1)),
  list(theta1 = runif(1), theta2 = runif(1), theta1prior = runif(1), theta2prior = runif(1)))

parameters <- c("theta1", "theta2", "delta", "deltaprior")
## [1] "/Users/davidgoldstein/Documents/LaTeX/HomerObliqueCase/Code/Model.txt"
## module glm loaded
## Compiling model graph
##    Resolving undeclared variables
##    Allocating nodes
## Graph information:
##    Observed stochastic nodes: 2
##    Unobserved stochastic nodes: 4
##    Total graph size: 11
## 
## Initializing model
#This is where the model file is
here::here("Code", "Model.txt")
samples <- jags(data,
                inits = myinits, 
                        parameters,
                    model.file = "Code/Model.txt",
                    n.chains = 3, 
                    n.iter = 2000000, 
                    n.burnin = 100000, 
                    n.thin = 1,
                    DIC = TRUE)
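#Optional convergence check (not part of the original script): with three
#chains, R2jags reports Rhat and effective sample sizes; Rhat values near 1
#indicate that the chains have mixed for each monitored parameter
round(samples$BUGSoutput$summary[, c("Rhat", "n.eff")], 3)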
# Mean of delta
mean <- mean(samples$BUGSoutput$sims.list$delta)
# Median of delta
median <- median(samples$BUGSoutput$sims.list$delta)
# 95% credible interval for delta
ninefive.ci <- quantile(samples$BUGSoutput$sims.list$delta, c(0.025, 0.975))

Posterior distribution

#Collect posterior samples across all chains:
delta.posterior  <- as.data.frame(samples$BUGSoutput$sims.list$delta)
names(delta.posterior) <- "Value" 
delta.posterior$Category <- rep("Delta Posterior", nrow(delta.posterior))

delta.prior <- as.data.frame(samples$BUGSoutput$sims.list$deltaprior)
names(delta.prior) <- "Value" 
delta.prior$Category <- rep("Delta Prior", nrow(delta.prior))
 
delta <- rbind(delta.posterior, delta.prior)
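#Optional cross-check (not part of the original script): the Bayes factor can
#also be estimated from the samples via the Savage-Dickey density ratio, the
#posterior density of delta at 0 over its prior density at 0, as in
#Lee & Wagenmakers (2013); this assumes the polspline package is installed
library(polspline)
fit.posterior <- logspline(delta.posterior$Value)
fit.prior <- logspline(delta.prior$Value)
dlogspline(0, fit.posterior) / dlogspline(0, fit.prior)  #should be comparable to the analytical BF01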

Visualization of posterior distribution of delta

ggplot(delta.posterior) +
  geom_histogram(aes(x = Value), bins = 100) +
  coord_cartesian(xlim = c(-0.1, 0.1)) +
  xlab("Difference in rates") +
  ylab("Posterior density") +
  theme_tufte() +
  theme(text = element_text(size = 10)) +
  scale_y_continuous(labels = scales::comma)
Posterior distribution of the difference in rates