
# List whatever objects are still in the workspace from a previous session.
ls()
# Clear the workspace so the analysis starts from a clean state.
rm(list = ls())

# Show the current working directory.
getwd()
# Switch to the folder that holds the heatwave behaviour data files.
setwd("C:/Users/UEA/Documents/Dissertation and phd/Neat data/nature heatwave")

# Read in both full data sets.
# stringsAsFactors = TRUE is set explicitly: since R 4.0.0 read.csv() keeps
# text columns as character by default, but the cleaning steps further down
# call levels() on Status and relabel its levels, which only works on a factor.
frequency <- read.csv("heatwavebehaviourfrequency.csv", header = TRUE,
                      stringsAsFactors = TRUE) # full data set
duration <- read.csv("heatwavebehaviourduration.csv", header = TRUE,
                     stringsAsFactors = TRUE) # full data set


#### DATA CHECK, CLEAN, DESCRIPTION AND SUMMARY #########################################################################



##### frequency
# Print the whole data frame for a visual check.
# Recorded observation: no NAs, data to 2 or 3 d.p., no irregularities/anomalies.
frequency

### checking for outliers/errors
# Overall (unsplit) range, quartiles, median, mean and counts for each variable.
summary(frequency)

# Check the variable types.
str(frequency)
# Recorded output:
# 'data.frame':	215 obs. of  3 variables:
# $ Frequency     : int  35 40 28 35 19 4 24 18 14 9 ...    # the number of copulatory contacts between male and female in 60 mins of observation
# $ Temperature.oC: int  30 30 30 30 30 30 30 30 30 30 ...  # male 5 day heatwave treatment prior to mating
# $ Status        : Factor w/ 3 levels "mate","mount",..: 2 2 2 2 2 2 2 2 2 2 ...
# ! sexual behaviour split into mounting and mating dependent on time (35 seconds of continuous mating) (e.g. Dickinson et al. 2018)
# Status is the sorting variable: mounting is <36 sec of unbroken copulation; mating is >36 sec,
# where a spermatophore is likely to have been transferred. Total is the combined data set.

# Count missing values per column (a zero in every column means nothing is missing).
colSums(is.na(frequency))
# Recorded observation: nothing missing.

# Treat temperature as a categorical factor, because the coding is not
# proportional to the differences between treatments.
frequency$Temperature.oC <- as.factor(frequency$Temperature.oC)

levels(frequency$Temperature.oC)
# "30" "39" "40" "41" "42"
## To change the level order, follow this pattern (example from another data set):
# repfit10D$Temperature.oC <- factor(repfit10D$Temperature.oC, levels = c("42", "42both", "30", "42female", "42femalesperm"))
# levels(repfit10D$Temperature.oC)

# Capitalise the Status labels with an explicit old -> new mapping. This works
# whether read.csv() delivered Status as a factor or as character (the default
# since R 4.0.0) and does not depend on the alphabetical order of the levels.
frequency$Status <- factor(frequency$Status,
                           levels = c("mate", "mount", "total"),
                           labels = c("Mate", "Mount", "Total"))
# Any Status value outside the three expected codes would have become NA above;
# stop here rather than silently dropping those rows from the subsets below.
stopifnot(!anyNA(frequency$Status))
levels(frequency$Status) # "Mate" "Mount" "Total"

# One data frame per Status group, with the sorting column removed.
frequencymount <- subset(frequency, Status %in% "Mount", select = -Status)
str(frequencymount)
frequencymate <- subset(frequency, Status %in% "Mate", select = -Status)
str(frequencymate)
frequencytotal <- subset(frequency, Status %in% "Total", select = -Status)
str(frequencytotal)
# Mount and Mate rows together (everything except the "Total" rows).
frequencymountmate <- subset(frequency, !(Status %in% "Total"), select = -Status)
str(frequencymountmate)

##### duration
# Print the whole data frame for a visual check.
# Recorded observation: no NAs, data to 2 or 3 d.p., no irregularities/anomalies.
duration

### checking for outliers/errors
# Overall (unsplit) range, quartiles, median, mean and counts for each variable.
summary(duration)

# Check the variable types.
str(duration)
# Recorded output:
# 'data.frame':	2774 obs. of  3 variables:
# $ Duration.sec  : int  1 1 1 1 1 1 1 1 1 1 ...            # duration of copulatory contact in seconds
# $ Temperature.oC: int  30 30 30 30 30 30 30 30 30 30 ...  # heatwave temperature
# $ Status        : Factor w/ 3 levels "mate","mount",..: 2 2 2 2 2 2 2 2 2 2 ...
# ! sexual behaviour split into mounting and mating dependent on time (35 seconds of continuous mating) (e.g. Dickinson et al. 2018)
# Status is the sorting variable: mounting is <36 sec of unbroken copulation; mating is >36 sec,
# where a spermatophore is likely to have been transferred. Total is the combined data set.

# Count missing values per column (a zero in every column means nothing is missing).
colSums(is.na(duration))
# Recorded observation: nothing missing.

# Treat temperature as a categorical factor, because the coding is not
# proportional to the differences between treatments.
duration$Temperature.oC <- as.factor(duration$Temperature.oC)

levels(duration$Temperature.oC)
# "30" "39" "40" "41" "42"
## To change the level order, follow this pattern (example from another data set):
# repfit10D$Temperature.oC <- factor(repfit10D$Temperature.oC, levels = c("42", "42both", "30", "42female", "42femalesperm"))
# levels(repfit10D$Temperature.oC)

# Capitalise the Status labels with an explicit old -> new mapping. This works
# whether read.csv() delivered Status as a factor or as character (the default
# since R 4.0.0) and does not depend on the alphabetical order of the levels.
duration$Status <- factor(duration$Status,
                          levels = c("mate", "mount", "total"),
                          labels = c("Mate", "Mount", "Total"))
# Any Status value outside the three expected codes would have become NA above;
# stop here rather than silently dropping those rows from the subsets below.
stopifnot(!anyNA(duration$Status))
levels(duration$Status) # "Mate" "Mount" "Total"

# One data frame per Status group, with the sorting column removed.
durationmount <- subset(duration, Status %in% "Mount", select = -Status)
str(durationmount)
durationmate <- subset(duration, Status %in% "Mate", select = -Status)
str(durationmate)
durationtotal <- subset(duration, Status %in% "Total", select = -Status)
str(durationtotal)
# Mount and Mate rows together (everything except the "Total" rows).
durationmountmate <- subset(duration, !(Status %in% "Total"), select = -Status)
str(durationmountmate)




########################### NEATPLOT ###############################################################################

# 20D boxplot

library(ggplot2)

 # Axis label shared by the plots below: bold "Heatwave temperature (°C)",
 # with the degree sign supplied by plotmath's `degree` symbol.
 temp <- expression(
   bold(paste("Heatwave temperature (", degree, "C)"))
 )

 
 
 ############## ! NATURE COMM FIGURE S1 A PLOTS  ##############

 ##### frequency
 str(frequency)

 # Boxplot of mating frequency (the "Mate" subset, frequencymate) for each
 # heatwave temperature, with the group mean as a large dot and the raw
 # observations jittered on top.
 # To draw outlines only, map `colour` instead of `fill` and switch
 # scale_fill_manual() to scale_colour_manual().
 graphfrequency <- ggplot(frequencymate,
                          aes(x = Temperature.oC, y = Frequency, fill = Temperature.oC)) +
   geom_boxplot(notch = FALSE,       # TRUE draws notched boxes
                outlier.shape = NA,  # hide the boxplot's own outlier markers (raw points are drawn by geom_jitter below)
                width = 0.5,
                lwd = 0.5,
                fatten = 0.5,
                color = "black",
                position = position_dodge(0.5)) +
   # Group mean as a black dot.
   # NOTE(review): `fun.y` is the older argument name; ggplot2 >= 3.3.0 warns and
   # asks for `fun`. Kept so the script still runs on the older release.
   stat_summary(fun.y = "mean", geom = "point", size = 4, position = position_dodge(0.5), color = "black") +
   # Box fill colours, one per temperature level in order (30, 39, 40, 41, 42).
   scale_fill_manual(values = c("ghostwhite", "ghostwhite", "ghostwhite", "tomato", "tomato")) +
   # No colour aesthetic is mapped in this plot, so this scale has no visible effect.
   scale_colour_manual(values = c("black", "black", "black")) +
   # Raw observations as open circles.
   geom_jitter(shape = 1, size = 1.5, position = position_jitter(0.15)) +
   labs(x = temp, y = "Mating frequency") +                     # axis titles
   scale_x_discrete(breaks = c("30", "39", "40", "41", "42"),   # levels shown on the x axis
                    labels = c("30", "39", "40", "41", "42")) + # their tick labels
   coord_cartesian(ylim = c(-1, 21)) +                          # visible y range
   scale_y_continuous(breaks = seq(0, 20, 5),                   # ticks from 0 to 20 in steps of 5
                      expand = c(0, 0)) +                       # no extra padding beyond the limits
   theme_classic() +
   theme(
     panel.grid.major = element_blank(),  # no major grid lines
     panel.border = element_blank(),      # no panel border
     panel.grid.minor = element_blank(),  # no minor grid lines
     axis.line.x = element_line(color = "black", size = 1),
     axis.line.y = element_line(color = "black", size = 1),
     axis.text.x = element_text(color = "black", size = 12),
     axis.text.y = element_text(color = "black", size = 12),
     axis.title.x = element_text(face = "bold", size = 12, color = "black", margin = margin(t = 10, r = 0, b = 0, l = 0)),
     axis.title.y = element_text(face = "bold", size = 12, color = "black", margin = margin(t = 0, r = 10, b = 0, l = 0)),
     legend.position = "none",            # remove this line to show the legend
     panel.background = element_blank(),
     plot.background = element_rect(fill = "transparent", colour = NA))


 # Save to the Desktop, then move to the behaviour analysis folder.
 # The plot object is passed explicitly so the file never depends on whichever
 # plot ggplot2 happened to register last.
 # NOTE(review): the faceted frequency plot later in this script saves to the
 # same file name in the same folder and will overwrite this figure if the
 # whole script is run top to bottom.
 setwd("C:/Users/UEA/Desktop")
 ggsave("graphfrequency.png", plot = graphfrequency, width = 3, height = 4, dpi = 300, bg = "transparent")
 setwd("C:/Users/UEA/Documents/Dissertation and phd/d- data for phd/R analysis/largepaper/behaviour")
 
 
 ############## ! NATURE COMM FIGURE S1 B PLOTS  ##############

 ##### duration
 str(duration)

 # Boxplot of mating duration (the "Mate" subset, durationmate) for each
 # heatwave temperature, with the group mean as a large dot and the raw
 # observations jittered on top.
 # To draw outlines only, map `colour` instead of `fill` and switch
 # scale_fill_manual() to scale_colour_manual().
 graphduration <- ggplot(durationmate,
                         aes(x = Temperature.oC, y = Duration.sec, fill = Temperature.oC)) +
   geom_boxplot(notch = FALSE,       # TRUE draws notched boxes
                outlier.shape = NA,  # hide the boxplot's own outlier markers (raw points are drawn by geom_jitter below)
                width = 0.5,
                lwd = 0.5,
                fatten = 0.5,
                color = "black",
                position = position_dodge(0.5)) +
   # Group mean as a black dot.
   # NOTE(review): `fun.y` is the older argument name; ggplot2 >= 3.3.0 warns and
   # asks for `fun`. Kept so the script still runs on the older release.
   stat_summary(fun.y = "mean", geom = "point", size = 4, position = position_dodge(0.5), color = "black") +
   # Box fill colours, one per temperature level in order (30, 39, 40, 41, 42).
   scale_fill_manual(values = c("ghostwhite", "ghostwhite", "ghostwhite", "tomato", "tomato")) +
   # No colour aesthetic is mapped in this plot, so this scale has no visible effect.
   scale_colour_manual(values = c("black", "black", "black")) +
   # Raw observations as open circles.
   geom_jitter(shape = 1, size = 1.5, position = position_jitter(0.15)) +
   labs(x = temp, y = "Mating duration (sec)") +                # axis titles
   scale_x_discrete(breaks = c("30", "39", "40", "41", "42"),   # levels shown on the x axis
                    labels = c("30", "39", "40", "41", "42")) + # their tick labels
   coord_cartesian(ylim = c(-25, 825)) +                        # visible y range
   scale_y_continuous(breaks = seq(0, 800, 200),                # ticks from 0 to 800 in steps of 200
                      expand = c(0, 0)) +                       # no extra padding beyond the limits
   theme_classic() +
   theme(
     panel.grid.major = element_blank(),  # no major grid lines
     panel.border = element_blank(),      # no panel border
     panel.grid.minor = element_blank(),  # no minor grid lines
     axis.line.x = element_line(color = "black", size = 1),
     axis.line.y = element_line(color = "black", size = 1),
     axis.text.x = element_text(color = "black", size = 12),
     axis.text.y = element_text(color = "black", size = 12),
     axis.title.x = element_text(face = "bold", size = 12, color = "black", margin = margin(t = 10, r = 0, b = 0, l = 0)),
     axis.title.y = element_text(face = "bold", size = 12, color = "black", margin = margin(t = 0, r = 10, b = 0, l = 0)),
     legend.position = "none",            # remove this line to show the legend
     panel.background = element_blank(),
     plot.background = element_rect(fill = "transparent", colour = NA))


 # Save to the Desktop, then move to the behaviour analysis folder.
 # The plot object is passed explicitly so the file never depends on whichever
 # plot ggplot2 happened to register last.
 # NOTE(review): the faceted duration plot later in this script saves to the
 # same file name in the same folder and will overwrite this figure if the
 # whole script is run top to bottom.
 setwd("C:/Users/UEA/Desktop")
 ggsave("graphduration.png", plot = graphduration, width = 3, height = 4, dpi = 300, bg = "transparent")
 setwd("C:/Users/UEA/Documents/Dissertation and phd/d- data for phd/R analysis/largepaper/behaviour")
 
 
 
 


##### frequency (all Status groups, one facet per group)
str(frequency)

# Boxplots of copulation frequency per heatwave temperature, faceted by Status,
# with group means and the raw observations overlaid.
# To draw outlines only, map `colour` instead of `fill` and switch
# scale_fill_manual() to scale_colour_manual().
graphfrequency <- ggplot(frequency,
                         aes(x = Temperature.oC, y = Frequency, fill = Temperature.oC)) +
  geom_boxplot(notch = FALSE,       # TRUE draws notched boxes
               outlier.shape = NA,  # hide the boxplot's own outlier markers (raw points are drawn by geom_point below)
               width = 0.5,
               lwd = 1,
               position = position_dodge(0.5)) +
  # Group mean as a black dot.
  # NOTE(review): `fun.y` is the older argument name; ggplot2 >= 3.3.0 warns and
  # asks for `fun`. Kept so the script still runs on the older release.
  stat_summary(fun.y = "mean", geom = "point", size = 2.5, position = position_dodge(0.5), color = "black") +
  # Box fill colours, one per temperature level in order (30, 39, 40, 41, 42).
  # The former breaks/label arguments listed the Status levels
  # ("Mount", "Mate", "Total"), but fill is mapped to Temperature.oC. With
  # unnamed `values`, ggplot2 matches the colours to `breaks` when they are
  # given, so no temperature level received its colour. They are removed; the
  # legend is hidden below anyway.
  scale_fill_manual(values = c("ghostwhite", "orange1", "darkorange2", "orangered2", "firebrick1"),
                    name = temp) +  # legend title (only visible if the legend is switched on)
  # No colour aesthetic is mapped in this plot, so this scale has no visible effect.
  scale_colour_manual(values = c("black", "black", "black")) +
  # Raw observations as small open circles, jittered within each box position.
  geom_point(position = position_jitterdodge(dodge.width = 0.5, jitter.width = 0.5), shape = 1, size = 0.75) +
  labs(x = temp, y = "Frequency of copula formation") +        # axis titles
  scale_x_discrete(breaks = c("30", "39", "40", "41", "42"),   # levels shown on the x axis
                   labels = c("30", "39", "40", "41", "42")) + # their tick labels
  coord_cartesian(ylim = c(-3, 83)) +                          # visible y range
  scale_y_continuous(breaks = seq(0, 80, 20),                  # ticks from 0 to 80 in steps of 20
                     expand = c(0, 0)) +                       # no extra padding beyond the limits
  theme_classic() +
  facet_grid(. ~ Status) +                                     # one panel per Status level
  theme(text = element_text(size = 20),     # base text size
        panel.grid.major = element_blank(), # no major grid lines
        panel.border = element_blank(),     # no panel border
        panel.grid.minor = element_blank(), # no minor grid lines
        axis.line.x = element_line(color = "black", size = 1.5),
        axis.line.y = element_line(color = "black", size = 1.5),
        axis.text.x = element_text(face = "bold", color = "black"),
        axis.text.y = element_text(face = "bold", color = "black"),
        axis.title.x = element_text(face = "bold", color = "black"),
        axis.title.y = element_text(face = "bold", color = "black"),
        strip.background = element_blank(),
        strip.text.x = element_text(size = 20, face = "bold", color = "black"),
        legend.position = "none",           # remove this line to show the legend
        panel.background = element_blank(),
        plot.background = element_rect(fill = "transparent", colour = NA))


# Save to the Desktop, then move to the behaviour analysis folder.
# The plot object is passed explicitly so the file never depends on whichever
# plot ggplot2 happened to register last.
# NOTE(review): this reuses the object name and output file name of the
# Figure S1 A plot earlier in the script, so both are overwritten here.
setwd("C:/Users/UEA/Desktop")
ggsave("graphfrequency.png", plot = graphfrequency, width = 8, height = 7, dpi = 300, bg = "transparent")
setwd("C:/Users/UEA/Documents/Dissertation and phd/d- data for phd/R analysis/largepaper/behaviour")


##### duration (all Status groups, one facet per group)
str(duration)

# Boxplots of copulation duration per heatwave temperature, faceted by Status,
# with group means and the raw observations overlaid.
# To draw outlines only, map `colour` instead of `fill` and switch
# scale_fill_manual() to scale_colour_manual().
graphduration <- ggplot(duration,
                        aes(x = Temperature.oC, y = Duration.sec, fill = Temperature.oC)) +
  geom_boxplot(notch = FALSE,       # TRUE draws notched boxes
               outlier.shape = NA,  # hide the boxplot's own outlier markers (raw points are drawn by geom_point below)
               width = 0.5,
               lwd = 1,
               position = position_dodge(0.5)) +
  # Group mean as a black dot.
  # NOTE(review): `fun.y` is the older argument name; ggplot2 >= 3.3.0 warns and
  # asks for `fun`. Kept so the script still runs on the older release.
  stat_summary(fun.y = "mean", geom = "point", size = 2.5, position = position_dodge(0.5), color = "black") +
  # Box fill colours, one per temperature level in order (30, 39, 40, 41, 42).
  # The former breaks/label arguments listed the Status levels
  # ("Mount", "Mate", "Total"), but fill is mapped to Temperature.oC. With
  # unnamed `values`, ggplot2 matches the colours to `breaks` when they are
  # given, so no temperature level received its colour. They are removed; the
  # legend is hidden below anyway.
  scale_fill_manual(values = c("ghostwhite", "orange1", "darkorange2", "orangered2", "firebrick1"),
                    name = temp) +  # legend title (only visible if the legend is switched on)
  # No colour aesthetic is mapped in this plot, so this scale has no visible effect.
  scale_colour_manual(values = c("black", "black", "black")) +
  # Raw observations as small open circles, jittered within each box position.
  geom_point(position = position_jitterdodge(dodge.width = 0.5, jitter.width = 0.5), shape = 1, size = 0.75) +
  labs(x = temp, y = "Duration of copulation (sec)") +         # axis titles
  scale_x_discrete(breaks = c("30", "39", "40", "41", "42"),   # levels shown on the x axis
                   labels = c("30", "39", "40", "41", "42")) + # their tick labels
  coord_cartesian(ylim = c(-5, 405)) +                         # visible y range
  scale_y_continuous(breaks = seq(0, 400, 100),                # ticks from 0 to 400 in steps of 100
                     expand = c(0, 0)) +                       # no extra padding beyond the limits
  theme_classic() +
  facet_grid(. ~ Status) +                                     # one panel per Status level
  theme(text = element_text(size = 20),     # base text size
        panel.grid.major = element_blank(), # no major grid lines
        panel.border = element_blank(),     # no panel border
        panel.grid.minor = element_blank(), # no minor grid lines
        axis.line.x = element_line(color = "black", size = 1.5),
        axis.line.y = element_line(color = "black", size = 1.5),
        axis.text.x = element_text(face = "bold", color = "black"),
        axis.text.y = element_text(face = "bold", color = "black"),
        axis.title.x = element_text(face = "bold", color = "black"),
        axis.title.y = element_text(face = "bold", color = "black"),
        strip.background = element_blank(),
        strip.text.x = element_text(size = 20, face = "bold", color = "black"),
        legend.position = "none",           # remove this line to show the legend
        panel.background = element_blank(),
        plot.background = element_rect(fill = "transparent", colour = NA))


# Save to the Desktop, then move to the behaviour analysis folder.
# The plot object is passed explicitly so the file never depends on whichever
# plot ggplot2 happened to register last.
# NOTE(review): this reuses the object name and output file name of the
# Figure S1 B plot earlier in the script, so both are overwritten here.
setwd("C:/Users/UEA/Desktop")
ggsave("graphduration.png", plot = graphduration, width = 8, height = 7, dpi = 300, bg = "transparent")
setwd("C:/Users/UEA/Documents/Dissertation and phd/d- data for phd/R analysis/largepaper/behaviour")





##############################################################################################################################################################  FREQUENCY #### PLOTTING RAW DATA DISTRIBUTION AND TESTING NORMALITY AND HOMOGENEITY OF VARIANCES ###############################

str(frequency) # total core dataset n= (219 females, 262 males)

######## ! NATURE COMM SUPP TABLE 1 STATS ############

# Descriptive statistics of Frequency for every Temperature x Status group:
# vars, n, mean, sd, median, trimmed, mad, min, max, range, skew, kurtosis, se.
# describeBy() lives in the psych package, which this script never attaches,
# so it is called through its namespace.
psych::describeBy(frequency$Frequency, list(frequency$Temperature.oC, frequency$Status), mat = TRUE)
# Recorded output:
# item group1 group2 vars  n      mean        sd median   trimmed     mad min max range       skew    kurtosis        se
# X11     1     30   Mate    1 25  9.120000  3.395095    8.0  9.142857  4.4478   4  14    10  0.1649368 -1.36836602 0.6790189
# X12     2     39   Mate    1 24  6.958333  1.756458    8.0  7.350000  0.0000   1   8     7 -2.0155315  3.68274838 0.3585355
# X13     3     40   Mate    1 21  8.333333  3.245510    8.0  8.294118  2.9652   1  16    15  0.1247454  0.08539535 0.7082283
# X14     4     41   Mate    1 24  5.666667  4.135390    5.0  5.350000  4.4478   0  15    15  0.6199360 -0.69705425 0.8441329
# X15     5     42   Mate    1 14  4.785714  4.098378    4.5  4.250000  3.7065   0  16    16  1.2446288  1.31386956 1.0953376
# X16     6     30  Mount    1 25 20.840000 11.855660   19.0 20.952381 13.3434   0  40    40  0.0560568 -1.16955991 2.3711319
# X17     7     39  Mount    1 24 20.125000 12.202325   18.5 19.300000  9.6369   2  54    52  0.7524484  0.38150501 2.4907892
# X18     8     40  Mount    1 21 29.047619 19.883350   22.0 26.235294 13.3434   9  82    73  1.0501213  0.13601232 4.3389028
# X19     9     41  Mount    1 25 11.600000 10.372239    7.0 10.238095  5.9304   1  45    44  1.4470373  1.89507584 2.0744477
# X110   10     42  Mount    1 12  7.500000  6.459665    6.5  6.400000  5.1891   1  25    24  1.4728567  1.67564025 1.8647447
# X111   11     30  Total    1 50 14.980000 10.465668   12.5 13.700000  8.1543   0  40    40  0.9933994 -0.04683350 1.4800689
# X112   12     39  Total    1 48 13.541667 10.892040    8.0 12.050000  3.7065   1  54    53  1.5926325  2.43361871 1.5721305
# X113   13     40  Total    1 42 18.690476 17.546437   11.5 15.382353  6.6717   1  82    81  1.8183914  2.82410961 2.7074741
# X114   14     41  Total    1 49  8.693878  8.424182    6.0  7.365854  4.4478   0  45    45  2.0586694  5.28823368 1.2034545
# X115   15     42  Total    1 26  6.038462  5.385022    5.0  5.227273  4.4478   0  25    25  1.7648247  3.61371752 1.0560897




########### Normality

# Lay the histograms out in a 2 x 2 grid of panels per page.
par(mfrow = c(2, 2))

# Base-graphics histograms of copulation counts, one per heatwave temperature,
# drawn for the total, mate and mount data sets in turn. Everything is wrapped
# in local() so the helper and its loop variable stay out of the workspace.
invisible(local({
  heat_levels <- c("30", "39", "40", "41", "42")

  # Draw one histogram of `counts` for each temperature level, in level order.
  draw_histograms <- function(counts, temperature) {
    for (level in heat_levels) {
      hist(counts[temperature == level],
           main = list(level, cex = 2),
           xlab = "Copulation number",
           ylab = "Frequency",
           ylim = c(0, 20),
           nclass = 10)
    }
  }

  #### total
  draw_histograms(frequencytotal$Frequency, frequencytotal$Temperature.oC)
  # Recorded observation: positive skew

  #### mate
  draw_histograms(frequencymate$Frequency, frequencymate$Temperature.oC)
  # Recorded observation: platykurtic

  #### mount
  draw_histograms(frequencymount$Frequency, frequencymount$Temperature.oC)
  # Recorded observation: +ve skew, -ve kurtosis
}))


# Kolmogorov-Smirnov comparison of a sample with a normal distribution that has
# the sample's own mean and standard deviation.
# The previous calls, ks.test(x, pnorm), compared the raw counts with the
# STANDARD normal N(0, 1); for counts with means far from 0 that always gives
# D close to 1 and says nothing about the shape of the distribution.
# Caveat: because mean and sd are estimated from the same sample, the reported
# p-value is only approximate (it tends to be conservative), and tied counts
# trigger a warning. Treat Shapiro-Wilk as the primary check.
# The "old" D / p-values recorded beside each ks_normal() call came from the
# previous N(0, 1) comparison and should be refreshed on the next run.
ks_normal <- function(x) {
  ks.test(x, "pnorm", mean = mean(x), sd = sd(x))
}

### total
shapiro.test(frequencytotal$Frequency[frequencytotal$Temperature.oC == "30"]) # W = 0.88197, p-value = 0.0001285
ks_normal(frequencytotal$Frequency[frequencytotal$Temperature.oC == "30"])    # old (vs N(0,1)): D = 0.95997, p-value < 2.2e-16
shapiro.test(frequencytotal$Frequency[frequencytotal$Temperature.oC == "39"]) # W = 0.80663, p-value = 1.842e-06
ks_normal(frequencytotal$Frequency[frequencytotal$Temperature.oC == "39"])    # old (vs N(0,1)): D = 0.95698, p-value < 2.2e-16
shapiro.test(frequencytotal$Frequency[frequencytotal$Temperature.oC == "40"]) # W = 0.74778, p-value = 4.05e-07
ks_normal(frequencytotal$Frequency[frequencytotal$Temperature.oC == "40"])    # old (vs N(0,1)): D = 0.97619, p-value < 2.2e-16
shapiro.test(frequencytotal$Frequency[frequencytotal$Temperature.oC == "41"]) # W = 0.78669, p-value = 5.368e-07
ks_normal(frequencytotal$Frequency[frequencytotal$Temperature.oC == "41"])    # old (vs N(0,1)): D = 0.87521, p-value < 2.2e-16
shapiro.test(frequencytotal$Frequency[frequencytotal$Temperature.oC == "42"]) # W = 0.82368, p-value = 0.0004577
ks_normal(frequencytotal$Frequency[frequencytotal$Temperature.oC == "42"])    # old (vs N(0,1)): D = 0.8234, p-value = 9.992e-16

### mate
shapiro.test(frequencymate$Frequency[frequencymate$Temperature.oC == "30"]) # W = 0.91531, p-value = 0.04011
ks_normal(frequencymate$Frequency[frequencymate$Temperature.oC == "30"])    # old (vs N(0,1)): D = 0.99997, p-value < 2.2e-16
shapiro.test(frequencymate$Frequency[frequencymate$Temperature.oC == "39"]) # W = 0.64759, p-value = 2.2e-06
ks_normal(frequencymate$Frequency[frequencymate$Temperature.oC == "39"])    # old (vs N(0,1)): D = 0.95698, p-value < 2.2e-16
# The two recorded results for 40 were noted against the wrong lines (the W
# statistic beside ks.test, the D statistic beside shapiro.test); swapped here.
# NOTE(review): the D value is identical to the one recorded for 39 - confirm on re-run.
shapiro.test(frequencymate$Frequency[frequencymate$Temperature.oC == "40"]) # W = 0.96292, p-value = 0.5768
ks_normal(frequencymate$Frequency[frequencymate$Temperature.oC == "40"])    # old (vs N(0,1)): D = 0.95698, p-value < 2.2e-16
shapiro.test(frequencymate$Frequency[frequencymate$Temperature.oC == "41"]) # W = 0.93782, p-value = 0.1459
ks_normal(frequencymate$Frequency[frequencymate$Temperature.oC == "41"])    # old (vs N(0,1)): D = 0.81058, p-value = 4.019e-14
shapiro.test(frequencymate$Frequency[frequencymate$Temperature.oC == "42"]) # W = 0.86783, p-value = 0.03913
ks_normal(frequencymate$Frequency[frequencymate$Temperature.oC == "42"])    # old (vs N(0,1)): D = 0.76992, p-value = 1.238e-07

### mount
shapiro.test(frequencymount$Frequency[frequencymount$Temperature.oC == "30"]) # W = 0.95759, p-value = 0.3686
ks_normal(frequencymount$Frequency[frequencymount$Temperature.oC == "30"])    # old (vs N(0,1)): D = 0.93725, p-value < 2.2e-16
shapiro.test(frequencymount$Frequency[frequencymount$Temperature.oC == "39"]) # W = 0.95176, p-value = 0.2957
ks_normal(frequencymount$Frequency[frequencymount$Temperature.oC == "39"])    # old (vs N(0,1)): D = 0.97725, p-value < 2.2e-16
shapiro.test(frequencymount$Frequency[frequencymount$Temperature.oC == "40"]) # W = 0.85583, p-value = 0.005362
ks_normal(frequencymount$Frequency[frequencymount$Temperature.oC == "40"])    # old (vs N(0,1)): D = 1, p-value < 2.2e-16
shapiro.test(frequencymount$Frequency[frequencymount$Temperature.oC == "41"]) # W = 0.83058, p-value = 0.0007702
ks_normal(frequencymount$Frequency[frequencymount$Temperature.oC == "41"])    # old (vs N(0,1)): D = 0.93725, p-value < 2.2e-16
shapiro.test(frequencymount$Frequency[frequencymount$Temperature.oC == "42"]) # W = 0.8103, p-value = 0.0123
ks_normal(frequencymount$Frequency[frequencymount$Temperature.oC == "42"])    # old (vs N(0,1)): D = 0.89392, p-value = 9.378e-09


## Transformation unlikely to fix normality in one group without removing it in the other


###### plotting differences
# Base-graphics boxplots of copulation frequency, grouped by temperature,
# one plot per data set.

### total
boxplot(Frequency ~ Temperature.oC, data = frequencytotal,
        xlab = "Temperature", ylab = "Frequency")
# Recorded observation: groups look relatively similar

### mate
boxplot(Frequency ~ Temperature.oC, data = frequencymate,
        xlab = "Temperature", ylab = "Frequency")
# Recorded observation: 39 and 42 look less variable

### mount
boxplot(Frequency ~ Temperature.oC, data = frequencymount,
        xlab = "Temperature", ylab = "Frequency")
# Recorded observation: 40 looks more variable

# Recorded observation: variances look similar at lower temperatures, but males larger at higher temperatures



########### Homogeneity of Variances - Failed in all groups
# Three tests of equal variance across temperatures for each data set:
# Bartlett and Fligner-Killeen (both in stats) and Levene's test.
# leveneTest() lives in the car package, which this script never attaches,
# so it is called through its namespace.

### total
bartlett.test(frequencytotal$Frequency ~ frequencytotal$Temperature.oC) # Bartlett's K-squared = 46.469, df = 4, p-value = 1.967e-09
fligner.test(frequencytotal$Frequency ~ frequencytotal$Temperature.oC)  # Fligner-Killeen:med chi-squared = 9.9335, df = 4, p-value = 0.04156
car::leveneTest(frequencytotal$Frequency ~ frequencytotal$Temperature.oC) # Df F value Pr(>F)  4  2.7051 0.03144 *

### mate
bartlett.test(frequencymate$Frequency ~ frequencymate$Temperature.oC) # Bartlett's K-squared = 16.304, df = 4, p-value = 0.002637
fligner.test(frequencymate$Frequency ~ frequencymate$Temperature.oC)  # Fligner-Killeen:med chi-squared = 14.2, df = 4, p-value = 0.006682
car::leveneTest(frequencymate$Frequency ~ frequencymate$Temperature.oC) # Df F value Pr(>F) 4   3.484 0.01036 *

### mount
bartlett.test(frequencymount$Frequency ~ frequencymount$Temperature.oC) # Bartlett's K-squared = 19.115, df = 4, p-value = 0.0007461
fligner.test(frequencymount$Frequency ~ frequencymount$Temperature.oC)  # Fligner-Killeen:med chi-squared = 12.259, df = 4, p-value = 0.01553
car::leveneTest(frequencymount$Frequency ~ frequencymount$Temperature.oC) # Df F value Pr(>F) 4  3.0426 0.02051 *




#################################################################################################################################################################### FREQUENCYTOTAL   ####  OLD METHOD: USE NORMAL > TRY AND TRANSFORM TO NORMAL > NON PARAMETRIC #####################################

### total

# Data not normal or homogeneous in variances

# 1) Homogeneity of variance is not a strict assumption of Kruskal-Wallis
kruskal.test(frequencytotal$Frequency ~ frequencytotal$Temperature.oC)
# Kruskal-Wallis chi-squared =  38.959, df = 4, p-value = 7.104e-08 # there is a significant difference between groups
# Post hoc testing

# Multiple comparison test after Kruskal-Wallis.
# kruskalmc() is in pgirmess, which this script never attaches, so it is
# called through its namespace.
pgirmess::kruskalmc(frequencytotal$Frequency ~ frequencytotal$Temperature.oC, probs = 0.05, cont = NULL)
# Multiple comparison test after Kruskal-Wallis
# p.value: 0.05
# Comparisons
# obs.dif critical.dif difference
# 30-39 12.271250     35.28666      FALSE
# 30-40  6.224286     36.55000      FALSE
# 30-41 48.403265     35.10247       TRUE
# 30-42 67.682308     42.22195       TRUE
# 39-40 18.495536     36.89599      FALSE
# 39-41 36.132015     35.46258       TRUE
# 39-42 55.411058     42.52180       TRUE
# 40-41 54.627551     36.71987       TRUE
# 40-42 73.906593     43.57589       TRUE
# 41-42 19.279042     42.36908      FALSE


# Nemenyi post hoc test; the posthoc.kruskal.* functions are in PMCMR, which
# this script never attaches, so they are called through its namespace.
PMCMR::posthoc.kruskal.nemenyi.test(frequencytotal$Frequency ~ frequencytotal$Temperature.oC, dist = "Chisquare") # "Tukey" for no ties
#         30      39      40      41
#      39 0.91639 -       -       -
#      40 0.99391 0.73824 -       -
#      41 0.00464 0.08430 0.00155 -
#      42 0.00043 0.00938 0.00014 0.80217


# dunn.test.control(x=survivalsex$Prop.survivors,g=survivalsex$Treatment, p.adjust="bonferroni")
# dunn.test.control(x=survivalsex$Prop.survivors,g=survivalsex$Treatment)
PMCMR::posthoc.kruskal.dunn.test(x = frequencytotal$Frequency, g = frequencytotal$Temperature.oC, p.adjust.method = "bonferroni") # ULTRA CONSERVATIVE
PMCMR::posthoc.kruskal.dunn.test(x = frequencytotal$Frequency, g = frequencytotal$Temperature.oC) # THIS METHOD PREFERRED FOR NON PARAMETRIC UNEQUAL SAMPLE SIZES
#
#         30      39      40      41
#      39 0.65638 -       -       -
#      40 0.65638 0.63482 -       -
#      41 0.00074 0.02087 0.00023 -
#      42 5.9e-05 0.00149 1.9e-05 0.63482



# also see Games-Howell test as more robust to violation of assumptions

# 2) Convert data to ranks and submit to a Welch ANOVA and Tukey HSD, Ruxton (2006)
frequencytotal$rank <- rank(frequencytotal$Frequency) # ranking all data inter-group by ascending count
# NOTE(review): the rank column created above is not used by any call below;
# the Welch test and the post hoc tests all analyse the raw Frequency values
# (the group means in the recorded Games-Howell table are raw means). Confirm
# whether the rank-based analysis described in the heading was intended.
oneway.test(frequencytotal$Frequency ~ frequencytotal$Temperature.oC, var.equal = FALSE)
# F = 10.265, num df = 4.00, denom df = 101.13, p-value = 5.127e-07

# TukeyHSD() needs a fitted aov model: it cannot take a oneway.test() result,
# and the previous call passed the function `aov` itself, which stops with an
# error. Fit the one-way model on the same response and grouping used above.
# NOTE(review): Tukey's HSD assumes equal variances, which this section says do
# not hold; the Games-Howell comparison below is the more defensible result.
TukeyHSD(aov(Frequency ~ Temperature.oC, data = frequencytotal))

# Games-Howell post hoc comparisons. posthocTGH() is in userfriendlyscience,
# which this script never attaches, so it is called through its namespace.
userfriendlyscience::posthocTGH(y = frequencytotal$Frequency, x = frequencytotal$Temperature.oC, method = "games-howell") # use games-howell when different sample sizes # tukey for equal
# n means variances
# 30 50  15.0       110
# 39 48  13.5       119
# 40 42  18.7       308
# 41 49   8.7        71
# 42 26   6.0        29
#
#        diff ci.lo ci.hi    t df    p p.adjusted
# 39-30  -1.4  -7.4  4.57 0.67 95  .96       1.00
# 40-30   3.7  -4.9 12.37 1.20 64  .75       1.00
# 41-30  -6.3 -11.6 -0.98 3.30 93  .01        .08
# 42-30  -8.9 -14.0 -3.86 4.92 74 <.01       <.01
# 40-39   5.1  -3.6 13.93 1.64 67  .47       1.00
# 41-39  -4.8 -10.4  0.67 2.45 88  .11        .56
# 42-39  -7.5 -12.8 -2.20 3.96 72 <.01        .01
# 41-40 -10.0 -18.3 -1.65 3.37 57  .01        .08
# 42-40 -12.7 -20.9 -4.44 4.35 52 <.01        .01
# 42-41  -2.7  -7.1  1.83 1.66 70  .47       1.00



#################################################################################################################################################################### FREQUENCYMATE   ####  OLD METHOD: USE NORMAL > TRY AND TRANSFORM TO NORMAL > NON PARAMETRIC ##################################### 

### total

# Data are neither normal nor homogeneous in variance

# 1) Homogeneity of variance is not a strict assumption of Kruskal-Wallis
kruskal.test(frequencymate$Frequency ~ frequencymate$Temperature.oC)
# Kruskal-Wallis chi-squared =  20.498, df = 4, p-value = 0.0003981 # there is a significant difference between groups
# Post hoc testing

#! library(pgirmess) 
kruskalmc(frequencymate$Frequency ~ frequencymate$Temperature.oC, probs = 0.05, cont=NULL) # multiple comparison test after Kruskal-Wallis
# Multiple comparison test after Kruskal-Wallis 
# p.value: 0.05 
# Comparisons
#         obs.dif critical.dif difference
# 30-39 14.610833     25.12491      FALSE
# 30-40  5.792381     26.02444      FALSE
# 30-41 29.235833     25.12491       TRUE
# 30-42 38.340000     29.34813       TRUE
# 39-40  8.818452     26.27079      FALSE
# 39-41 14.625000     25.37999      FALSE
# 39-42 23.729167     29.56680      FALSE
# 40-41 23.443452     26.27079      FALSE
# 40-42 32.547619     30.33490       TRUE
# 41-42  9.104167     29.56680      FALSE


#! library(PMCMR) 
posthoc.kruskal.nemenyi.test(frequencymate$Frequency ~ frequencymate$Temperature.oC, dist="Chisquare") #"Tukey" for no ties 
#          30     39     40     41    
#      39 0.6087 -      -      -     
#      40 0.9828 0.9245 -      -     
#      41 0.0286 0.6173 0.1735 -     
#      42 0.0085 0.2724 0.0563 0.9440


# Unused template calls (they refer to a different data frame, survivalsex):
# dunn.test.control(x=survivalsex$Prop.survivors,g=survivalsex$Treatment, p.adjust="bonferroni")
# dunn.test.control(x=survivalsex$Prop.survivors,g=survivalsex$Treatment)
posthoc.kruskal.dunn.test(x=frequencymate$Frequency, g=frequencymate$Temperature.oC, p.adjust.method="bonferroni") # ULTRA CONSERVATIVE
posthoc.kruskal.dunn.test(x=frequencymate$Frequency, g=frequencymate$Temperature.oC) # THIS METHOD PREFERRED FOR NON PARAMETRIC UNEQUAL SAMPLE SIZES
# 
#          30     39     40     41    
#      39 0.5008 -      -      -     
#      40 1.0000 1.0000 -      -     
#      41 0.0090 0.5008 0.0815 -     
#      42 0.0022 0.1396 0.0193 1.0000



# Also see the Games-Howell test, which is more robust to violation of assumptions

# 2) Convert data to ranks and submit to a Welch ANOVA and Tukey HSD, Ruxton (2006)
frequencymate$rank <- rank(frequencymate$Frequency) # ranks pooled across all groups, ascending by count
# NOTE(review): the Welch test below is run on the raw counts, not on the rank
# column created above, although the heading says ranks. It is left as written so
# that the F value recorded underneath still describes this call - confirm which
# response was intended before reporting.
oneway.test(frequencymate$Frequency~ frequencymate$Temperature.oC, var.equal=FALSE)
# F = 4.5842, num df = 4.000, denom df = 44.663, p-value = 0.003459
# TukeyHSD() needs a fitted aov model (not a oneway.test result). Passing the bare
# function `aov` errors, so fit the one-way model on the ranks explicitly.
# factor() makes sure the grouping term is a factor, which TukeyHSD() requires.
TukeyHSD(aov(rank ~ factor(Temperature.oC), data = frequencymate))
#! library(userfriendlyscience)
posthocTGH(y=frequencymate$Frequency, x=frequencymate$Temperature.oC, method="games-howell") # use games-howell when different sample sizes # tukey for equal
#    n means variances
# 30 25   9.1      11.5
# 39 24   7.0       3.1
# 40 21   8.3      10.5
# 41 24   5.7      17.1
# 42 14   4.8      16.8
# 
#        diff ci.lo  ci.hi    t df   p p.adjusted
# 39-30 -2.16 -4.37  0.042 2.82 36 .06        .45
# 40-30 -0.79 -3.58  2.006 0.80 43 .93       1.00
# 41-30 -3.45 -6.53 -0.374 3.19 45 .02        .20
# 42-30 -4.33 -8.14 -0.526 3.36 23 .02        .20
# 40-39  1.38 -0.93  3.678 1.73 30 .43       1.00
# 41-39 -1.29 -3.95  1.363 1.41 31 .63       1.00
# 42-39 -2.17 -5.71  1.363 1.89 16 .36       1.00
# 41-40 -2.67 -5.81  0.472 2.42 43 .13        .78
# 42-40 -3.55 -7.40  0.302 2.72 23 .08        .57
# 42-41 -0.88 -4.91  3.153 0.64 28 .97       1.00




#################################################################################################################################################################### FREQUENCYMOUNT  ####  OLD METHOD: USE NORMAL > TRY AND TRANSFORM TO NORMAL > NON PARAMETRIC ##################################### 

### total

# Data are neither normal nor homogeneous in variance

# 1) Homogeneity of variance is not a strict assumption of Kruskal-Wallis
kruskal.test(frequencymount$Frequency ~ frequencymount$Temperature.oC)
# Kruskal-Wallis chi-squared =  27.662, df = 4, p-value = 1.46e-05 # there is a significant difference between groups
# Post hoc testing

#! library(pgirmess) 
kruskalmc(frequencymount$Frequency ~ frequencymount$Temperature.oC, probs = 0.05, cont=NULL) # multiple comparison test after Kruskal-Wallis
# Multiple comparison test after Kruskal-Wallis 
# p.value: 0.05 
# Comparisons
#           obs.dif critical.dif difference
# 30-39  1.835833     24.89334      FALSE
# 30-40  9.988571     25.78458      FALSE
# 30-41 24.980000     24.63802       TRUE
# 30-42 37.023333     30.59152       TRUE
# 39-40 11.824405     26.02866      FALSE
# 39-41 23.144167     24.89334      FALSE
# 39-42 35.187500     30.79752       TRUE
# 40-41 34.968571     25.78458       TRUE
# 40-42 47.011905     31.52227       TRUE
# 41-42 12.043333     30.59152      FALSE


#! library(PMCMR) 
posthoc.kruskal.nemenyi.test(frequencymount$Frequency ~ frequencymount$Temperature.oC, dist="Chisquare") #"Tukey" for no ties 
#         30     39     40     41    
#      39 0.9998 -      -      -     
#      40 0.8807 0.8037 -      -     
#      41 0.0876 0.1457 0.0058 -     
#      42 0.0210 0.0357 0.0015 0.8743


# Unused template calls (they refer to a different data frame, survivalsex):
# dunn.test.control(x=survivalsex$Prop.survivors,g=survivalsex$Treatment, p.adjust="bonferroni")
# dunn.test.control(x=survivalsex$Prop.survivors,g=survivalsex$Treatment)
posthoc.kruskal.dunn.test(x=frequencymount$Frequency, g=frequencymount$Temperature.oC, p.adjust.method="bonferroni") # ULTRA CONSERVATIVE
posthoc.kruskal.dunn.test(x=frequencymount$Frequency, g=frequencymount$Temperature.oC) # THIS METHOD PREFERRED FOR NON PARAMETRIC UNEQUAL SAMPLE SIZES
# 
#          30      39      40      41     
#      39 0.83589 -       -       -      
#      40 0.80776 0.80776 -       -      
#      41 0.02640 0.04507 0.00125 -      
#      42 0.00540 0.00931 0.00028 0.80776



# Also see the Games-Howell test, which is more robust to violation of assumptions

# 2) Convert data to ranks and submit to a Welch ANOVA and Tukey HSD, Ruxton (2006)
frequencymount$rank <- rank(frequencymount$Frequency) # ranks pooled across all groups, ascending by count
# NOTE(review): the Welch test below is run on the raw counts, not on the rank
# column created above, although the heading says ranks. It is left as written so
# that the F value recorded underneath still describes this call - confirm which
# response was intended before reporting.
oneway.test(frequencymount$Frequency~ frequencymount$Temperature.oC, var.equal=FALSE)
# F = 9.4852, num df = 4.000, denom df = 48.025, p-value = 9.831e-06
# TukeyHSD() needs a fitted aov model (not a oneway.test result). Passing the bare
# function `aov` errors, so fit the one-way model on the ranks explicitly.
# factor() makes sure the grouping term is a factor, which TukeyHSD() requires.
TukeyHSD(aov(rank ~ factor(Temperature.oC), data = frequencymount))
#! library(userfriendlyscience)
posthocTGH(y=frequencymount$Frequency, x=frequencymount$Temperature.oC, method="games-howell") # use games-howell when different sample sizes # tukey for equal
# n means variances
# 30 25  20.8       141
# 39 24  20.1       149
# 40 21  29.0       395
# 41 25  11.6       108
# 42 12   7.5        42
# 
#         diff ci.lo ci.hi    t df    p p.adjusted
# 39-30  -0.71 -10.5  9.04 0.21 47 1.00       1.00
# 40-30   8.21  -6.1 22.51 1.66 31  .47       1.00
# 41-30  -9.24 -18.2 -0.30 2.93 47  .04        .24
# 42-30 -13.34 -22.0 -4.66 4.42 34 <.01        .01
# 40-39   8.92  -5.5 23.37 1.78 32  .40       1.00
# 41-39  -8.53 -17.7  0.68 2.63 45  .08        .41
# 42-39 -12.62 -21.6 -3.66 4.06 34 <.01        .02
# 41-40 -17.45 -31.4 -3.47 3.63 29  .01        .06
# 42-40 -21.55 -35.4 -7.73 4.56 26 <.01        .01
# 42-41  -4.10 -12.2  3.95 1.47 32  .59       1.00











##############################################################################################################################################################  DURATION #### PLOTTING RAW DATA DISTRIBUTION AND TESTING NORMALITY AND HOMOGENIETY OF VARIANCES ###############################

# Column names and types of the full duration data set
str(duration) # total core dataset n= (219 females, 262 males)

######## ! NATURE COMM SUPP TABLE 1 STATS ############

#### ! library(psych)
# Descriptive statistics of copulation duration for every Temperature x Status
# combination, returned as one table (mat = TRUE) with the columns
# vars, n, mean, sd, median, trimmed, mad, min, max, range, skew, kurtosis, se
with(
  duration,
  describeBy(Duration.sec, group = list(Temperature.oC, Status), mat = TRUE)
)
#      item group1 group2  vars  n       mean         sd median    trimmed     mad min max range      skew   kurtosis         se
# X11     1     30   Mate    1 178  75.797753  67.172462   62.0  63.597222 23.7216  37 633   596 5.5140732 36.5018501  5.0347889
# X12     2     39   Mate    1 183  72.693989  39.720926   62.0  65.170068 22.2390  37 301   264 2.5756477  8.7935251  2.9362554
# X13     3     40   Mate    1 174  80.580460  81.478831   57.0  63.907143 22.2390  36 694   658 5.0152508 30.6677714  6.1768942
# X14     4     41   Mate    1 119  78.949580  49.485120   62.0  69.597938 22.2390  37 348   311 2.9239840 10.1410007  4.5362935
# X15     5     42   Mate    1  67 125.656716 107.596124   91.0 106.781818 62.2692  37 645   608 2.3156378  6.9156117 13.1449586
# X16     6     30  Mount    1 485  11.717526   9.414941    8.0  10.655527  7.4130   1  35    34 0.7855433 -0.5984272  0.4275105
# X17     7     39  Mount    1 584  10.445205   9.113375    7.0   9.136752  5.9304   1  35    34 1.0201137 -0.1386174  0.3771141
# X18     8     40  Mount    1 610   9.831148   8.506504    6.0   8.532787  5.9304   1  35    34 1.1396994  0.4574904  0.3444183
# X19     9     41  Mount    1 263  12.003802   9.466493    9.0  10.952607  8.8956   1  35    34 0.7940098 -0.5442860  0.5837290
# X110   10     42  Mount    1 111  13.711712   9.822412   10.0  12.887640 10.3782   1  35    34 0.5766030 -0.8979326  0.9323021
# X111   11     30  Total    1 663  28.921569  45.594991   15.0  21.233522 17.7912   1 633   632 6.7431489 70.4383461  1.7707627
# X112   12     39  Total    1 767  25.297262  33.807399   11.0  18.508943 13.3434   1 301   300 2.9097004 12.8131254  1.2207144
# X113   13     40  Total    1 784  25.533163  48.872774   11.0  16.490446 11.8608   1 694   693 7.5034708 83.0067406  1.7454562
# X114   14     41  Total    1 382  32.858639  42.234826   16.5  24.901961 20.0151   1 348   347 3.1305360 14.6658227  2.1609217
# X115   15     42  Total    1 178  55.848315  85.643952   24.0  36.701389 26.6868   1 645   644 3.2774959 14.4792554  6.4192856

# durationtotal30 <- durationtotal[durationtotal$Temperature.oC== "30",]
# frequencytotal39 <- frequencytotal[frequencytotal$Temperature.oC== "39",]
# frequencytotal40 <- frequencytotal[frequencytotal$Temperature.oC== "40",]
# frequencytotal41 <- frequencytotal[frequencytotal$Temperature.oC== "41",]
# frequencytotal42 <- frequencytotal[frequencytotal$Temperature.oC== "42",]
# 
# frequencymate30 <- frequencymate[frequencymate$Temperature.oC== "30",]
# frequencymate39 <- frequencymate[frequencymate$Temperature.oC== "39",]
# frequencymate40 <- frequencymate[frequencymate$Temperature.oC== "40",]
# frequencymate41 <- frequencymate[frequencymate$Temperature.oC== "41",]
# frequencymate42 <- frequencymate[frequencymate$Temperature.oC== "42",]
# 
# frequencymount30 <- frequencymount[frequencymount$Temperature.oC== "30",]
# frequencymount39 <- frequencymount[frequencymount$Temperature.oC== "39",]
# frequencymount40 <- frequencymount[frequencymount$Temperature.oC== "40",]
# frequencymount41 <- frequencymount[frequencymount$Temperature.oC== "41",]
# frequencymount42 <- frequencymount[frequencymount$Temperature.oC== "42",]



########### Normality 

# Lay the histograms out on a 2 x 2 grid of panels per page
par(mfrow = c(2, 2))

# One histogram of copulation duration per temperature treatment, drawn first
# for the total data, then for matings, then for mounts. Every panel uses the
# same axis labels, y-axis limits and requested number of classes; the panel
# title is the temperature. local() keeps the loop helpers out of the workspace.
local({
  temperature_levels <- c("30", "39", "40", "41", "42")

  draw_duration_hist <- function(dat, temp) {
    hist(dat$Duration.sec[dat$Temperature.oC == temp],
         main = list(temp, cex = 2), xlab = "Copulation duration",
         ylab = "Frequency", ylim = c(0, 100),
         nclass = 10)
  }

  #### total, mate, mount (in that order)
  for (dat in list(durationtotal, durationmate, durationmount)) {
    for (temp in temperature_levels) {
      draw_duration_hist(dat, temp)
    }
  }
})

# large positive skew


# Shapiro-Wilk and Kolmogorov-Smirnov normality checks of copulation duration,
# one pair of tests per temperature group, for the total, mate and mount data.
#
# ks.test(x, pnorm) with no further arguments compares x with the STANDARD normal
# N(0, 1), so any variable that is not centred on 0 with unit sd is rejected whatever
# its shape (hence the earlier D values of 0.92 - 1). The helper below compares with
# a normal distribution that has the sample's own mean and sd instead.
# ?ks.test notes that the parameters should be pre-specified rather than estimated
# from the data, so treat this KS p-value as approximate and Shapiro-Wilk as primary.
# The D values kept in the comments were recorded from the old N(0, 1) call - re-run to update.
ks_fitted_normal <- function(x) {
  ks.test(x, "pnorm", mean = mean(x), sd = sd(x))
}

### total
shapiro.test (durationtotal$Duration.sec[durationtotal$Temperature.oC == "30"]) #  W = 0.51431, p-value < 2.2e-16
ks_fitted_normal(durationtotal$Duration.sec[durationtotal$Temperature.oC == "30"])  # old call vs N(0,1): D = 0.93803, p-value < 2.2e-16
shapiro.test (durationtotal$Duration.sec[durationtotal$Temperature.oC == "39"]) # W = 0.68926, p-value < 2.2e-16
ks_fitted_normal(durationtotal$Duration.sec[durationtotal$Temperature.oC == "39"]) # old call vs N(0,1): D = 0.94074, p-value < 2.2e-16
shapiro.test (durationtotal$Duration.sec[durationtotal$Temperature.oC == "40"]) #  W = 0.44075, p-value < 2.2e-16
ks_fitted_normal(durationtotal$Duration.sec[durationtotal$Temperature.oC == "40"])  # old call vs N(0,1): D = 0.93516, p-value < 2.2e-16
shapiro.test (durationtotal$Duration.sec[durationtotal$Temperature.oC == "41"]) # W = 0.68896, p-value < 2.2e-16
ks_fitted_normal(durationtotal$Duration.sec[durationtotal$Temperature.oC == "41"]) # old call vs N(0,1): D = 0.94845, p-value < 2.2e-16
shapiro.test (durationtotal$Duration.sec[durationtotal$Temperature.oC == "42"]) #  W = 0.62131, p-value < 2.2e-16
ks_fitted_normal(durationtotal$Duration.sec[durationtotal$Temperature.oC == "42"])  # old call vs N(0,1): D = 0.95478, p-value < 2.2e-16

### mate
shapiro.test (durationmate$Duration.sec[durationmate$Temperature.oC == "30"]) #  W = 0.45323, p-value < 2.2e-16
ks_fitted_normal(durationmate$Duration.sec[durationmate$Temperature.oC == "30"])  # old call vs N(0,1): D = 1, p-value < 2.2e-16
shapiro.test (durationmate$Duration.sec[durationmate$Temperature.oC == "39"]) # W = 0.7368, p-value < 2.2e-16
ks_fitted_normal(durationmate$Duration.sec[durationmate$Temperature.oC == "39"]) # old call vs N(0,1): D = 1, p-value < 2.2e-16
shapiro.test (durationmate$Duration.sec[durationmate$Temperature.oC == "40"]) #  W = 0.47257, p-value < 2.2e-16
ks_fitted_normal(durationmate$Duration.sec[durationmate$Temperature.oC == "40"])  # old call vs N(0,1): D = 1, p-value < 2.2e-16
shapiro.test (durationmate$Duration.sec[durationmate$Temperature.oC == "41"]) # W = 0.67181, p-value = 6.011e-15
ks_fitted_normal(durationmate$Duration.sec[durationmate$Temperature.oC == "41"]) # old call vs N(0,1): D = 1, p-value < 2.2e-16
shapiro.test (durationmate$Duration.sec[durationmate$Temperature.oC == "42"]) #  W = 0.74598, p-value = 1.963e-09
ks_fitted_normal(durationmate$Duration.sec[durationmate$Temperature.oC == "42"])  # old call vs N(0,1): D = 1, p-value < 2.2e-16

### mount
shapiro.test (durationmount$Duration.sec[durationmount$Temperature.oC == "30"]) #  W = 0.8875, p-value < 2.2e-16
ks_fitted_normal(durationmount$Duration.sec[durationmount$Temperature.oC == "30"])  # old call vs N(0,1): D = 0.92364, p-value < 2.2e-16
shapiro.test (durationmount$Duration.sec[durationmount$Temperature.oC == "39"]) # W = 0.85012, p-value < 2.2e-16
ks_fitted_normal(durationmount$Duration.sec[durationmount$Temperature.oC == "39"]) # old call vs N(0,1): D = 0.9293, p-value < 2.2e-16
shapiro.test (durationmount$Duration.sec[durationmount$Temperature.oC == "40"]) #  W = 0.8585, p-value < 2.2e-16
ks_fitted_normal(durationmount$Duration.sec[durationmount$Temperature.oC == "40"])  # old call vs N(0,1): D = 0.92315, p-value < 2.2e-16
shapiro.test (durationmount$Duration.sec[durationmount$Temperature.oC == "41"]) # W = 0.89164, p-value = 8.918e-13
ks_fitted_normal(durationmount$Duration.sec[durationmount$Temperature.oC == "41"]) # old call vs N(0,1): D = 0.93542, p-value < 2.2e-16
shapiro.test (durationmount$Duration.sec[durationmount$Temperature.oC == "42"]) #  W = 0.91694, p-value = 3.55e-06
ks_fitted_normal(durationmount$Duration.sec[durationmount$Temperature.oC == "42"])  # old call vs N(0,1): D = 0.94121, p-value < 2.2e-16

## No group anywhere near normal


###### plotting differences
# Base boxplots of copulation duration grouped by temperature treatment.
# The response plotted is Duration.sec, so the y axis is labelled as a duration
# (it was previously labelled "Frequency", carried over from the frequency plots).

### total
boxplot(durationtotal$Duration.sec ~ durationtotal$Temperature.oC, ylab="Copulation duration (sec)", xlab="Temperature")
# look relatively similar, 42 getting more variable

### mate
boxplot(durationmate$Duration.sec ~ durationmate$Temperature.oC, ylab="Copulation duration (sec)", xlab="Temperature")
# look relatively similar, 42 getting more variable

### mount
boxplot(durationmount$Duration.sec ~ durationmount$Temperature.oC, ylab="Copulation duration (sec)", xlab="Temperature")
# look relatively similar

# variances look similar at lower temperatures, but males larger at higher temperatures



########### Homogeneity of Variances
# Three tests of equal variance in copulation duration across temperature groups
# (Bartlett, Fligner-Killeen, Levene), run separately on the total, mate and mount data.
# Recorded results: all three tests reject homogeneity for total and mate; for mount,
# Fligner-Killeen and Levene reject it but Bartlett does not (p = 0.07117).
#! leveneTest() needs library(car)

### total
bartlett.test(durationtotal$Duration.sec ~ durationtotal$Temperature.oC) # Bartlett's K-squared = 344.79, df = 4, p-value < 2.2e-16
fligner.test(durationtotal$Duration.sec ~ durationtotal$Temperature.oC) # Fligner-Killeen:med chi-squared = 102.52, df = 4, p-value < 2.2e-16
leveneTest(durationtotal$Duration.sec ~ durationtotal$Temperature.oC)   #Df F value Pr(>F)   4  13.831 3.578e-11 ***

### mate
bartlett.test(durationmate$Duration.sec ~ durationmate$Temperature.oC) # Bartlett's K-squared = 146.34, df = 4, p-value < 2.2e-16
fligner.test(durationmate$Duration.sec ~ durationmate$Temperature.oC) # Fligner-Killeen:med chi-squared = 43.155, df = 4, p-value = 9.608e-09
leveneTest(durationmate$Duration.sec ~ durationmate$Temperature.oC)   #Df F value Pr(>F) 4  6.7449 2.485e-05 ***

### mount
bartlett.test(durationmount$Duration.sec ~ durationmount$Temperature.oC) # Bartlett's K-squared = 8.6256, df = 4, p-value = 0.07117 (not significant at 0.05)
fligner.test(durationmount$Duration.sec ~ durationmount$Temperature.oC) # Fligner-Killeen:med chi-squared = 19.537, df = 4, p-value = 0.0006163
leveneTest(durationmount$Duration.sec ~ durationmount$Temperature.oC)   #Df F value Pr(>F) 4  3.1505 0.01358 *







#################################################################################################################################################################### DURATIONTOTAL   ####  OLD METHOD: USE NORMAL > TRY AND TRANSFORM TO NORMAL > NON PARAMETRIC ##################################### 



# Data are neither normal nor homogeneous in variance

# 1) Homogeneity of variance is not a strict assumption of Kruskal-Wallis
kruskal.test(durationtotal$Duration.sec ~ durationtotal$Temperature.oC)
# Kruskal-Wallis chi-squared =  60.268, df = 4, p-value = 2.548e-12 # there is a significant difference between groups
# Post hoc testing

#! library(pgirmess) 
kruskalmc(durationtotal$Duration.sec ~ durationtotal$Temperature.oC, probs = 0.05, cont=NULL) # multiple comparison test after Kruskal-Wallis
# Multiple comparison test after Kruskal-Wallis 
# p.value: 0.05 
# Comparisons
#         obs.dif critical.dif difference
# 30-39 105.12942     119.2217      FALSE
# 30-40 157.25765     118.6209       TRUE
# 30-41  85.14896     144.4147      FALSE
# 30-42 274.23692     189.7898       TRUE
# 39-40  52.12823     114.1805      FALSE
# 39-41 190.27838     140.7902       TRUE
# 39-42 379.36634     187.0467       TRUE
# 40-41 242.40661     140.2818       TRUE
# 40-42 431.49457     186.6643       TRUE
# 41-42 189.08796     204.0300      FALSE


#! library(PMCMR) 
posthoc.kruskal.nemenyi.test(durationtotal$Duration.sec ~ durationtotal$Temperature.oC, dist="Chisquare") #"Tukey" for no ties 
#         30     39      40      41    
#      39 0.1892 -       -       -     
#      40 0.0077 0.8007  -       -     
#      41 0.6017 0.0061  9.8e-05 -     
#      42 0.0024 1.5e-06 1.5e-08 0.1481


# Unused template calls (they refer to a different data frame, survivalsex):
# dunn.test.control(x=survivalsex$Prop.survivors,g=survivalsex$Treatment, p.adjust="bonferroni")
# dunn.test.control(x=survivalsex$Prop.survivors,g=survivalsex$Treatment)
posthoc.kruskal.dunn.test(x=durationtotal$Duration.sec, g=durationtotal$Temperature.oC, p.adjust.method="bonferroni") # ULTRA CONSERVATIVE
posthoc.kruskal.dunn.test(x=durationtotal$Duration.sec, g=durationtotal$Temperature.oC) # THIS METHOD PREFERRED FOR NON PARAMETRIC UNEQUAL SAMPLE SIZES
# 
#           30      39      40      41     
#      39 0.03974 -       -       -      
#      40 0.00098 0.19968 -       -      
#      41 0.19533 0.00088 9.7e-06 -      
#      42 0.00035 1.1e-07 8.4e-10 0.03693



# Also see the Games-Howell test, which is more robust to violation of assumptions

# 2) Convert data to ranks and submit to a Welch ANOVA and Tukey HSD, Ruxton (2006)
durationtotal$rank <- rank(durationtotal$Duration.sec) # ranks pooled across all groups, ascending by duration
# NOTE(review): the Welch test below is run on the raw durations, not on the rank
# column created above, although the heading says ranks. It is left as written so
# that the F value recorded underneath still describes this call - confirm which
# response was intended before reporting.
oneway.test(durationtotal$Duration.sec~ durationtotal$Temperature.oC, var.equal=FALSE)
# F = 7.6826, num df = 4.00, denom df = 833.34, p-value = 4.428e-06
# TukeyHSD() needs a fitted aov model (not a oneway.test result). Passing the bare
# function `aov` errors, so fit the one-way model on the ranks explicitly.
# factor() makes sure the grouping term is a factor, which TukeyHSD() requires.
TukeyHSD(aov(rank ~ factor(Temperature.oC), data = durationtotal))
#! library(userfriendlyscience)
posthocTGH(y=durationtotal$Duration.sec, x=durationtotal$Temperature.oC, method="games-howell") # use games-howell when different sample sizes # tukey for equal
# NOTE(review): the group header first pasted here (n = 50, 48, 42, 49, 26; means 15.0 ... 6.0)
# was the frequencytotal one. The values below are re-derived from the "Total" rows of the
# describeBy() summary recorded earlier in this script (n, mean, sd^2) and agree with the
# pairwise differences listed underneath; they are not pasted posthocTGH output - re-run to confirm.
#      n means variances
# 30 663  28.9      2079
# 39 767  25.3      1143
# 40 784  25.5      2389
# 41 382  32.9      1784
# 42 178  55.8      7335
# 
# diff  ci.lo ci.hi    t   df    p p.adjusted
# 39-30 -3.62  -9.50   2.3 1.69 1205  .44       1.00
# 40-30 -3.39 -10.18   3.4 1.36 1431  .65       1.00
# 41-30  3.94  -3.70  11.6 1.41  845  .62       1.00
# 42-30 26.93   8.60  45.3 4.04  205 <.01        .01
# 40-39  0.24  -5.58   6.1 0.11 1395 1.00       1.00
# 41-39  7.56   0.77  14.4 3.05  631  .02        .12
# 42-39 30.55  12.55  48.5 4.68  190 <.01       <.01
# 41-40  7.33  -0.27  14.9 2.64  862  .06        .32
# 42-40 30.32  12.01  48.6 4.56  204 <.01       <.01
# 42-41 22.99   4.36  41.6 3.39  218  .01        .05





#################################################################################################################################################################### DURATIONMATE  ####  OLD METHOD: USE NORMAL > TRY AND TRANSFORM TO NORMAL > NON PARAMETRIC ##################################### 



# Data are neither normal nor homogeneous in variance

# 1) Homogeneity of variance is not a strict assumption of Kruskal-Wallis
kruskal.test(durationmate$Duration.sec ~ durationmate$Temperature.oC)
# Kruskal-Wallis chi-squared =  24, df = 4, p-value = 8e-05 # there is a significant difference between groups
# Post hoc testing

#! library(pgirmess) 
kruskalmc(durationmate$Duration.sec ~ durationmate$Temperature.oC, probs = 0.05, cont=NULL) # multiple comparison test after Kruskal-Wallis
# Multiple comparison test after Kruskal-Wallis 
# p.value: 0.05 
# Comparisons
#         obs.dif critical.dif difference
# 30-39   9.921        61.55      FALSE
# 30-40  13.015        62.33      FALSE
# 30-41  39.522        69.23      FALSE
# 30-42 123.891        83.80       TRUE
# 39-40  22.936        61.91      FALSE
# 39-41  29.601        68.85      FALSE
# 39-42 113.969        83.48       TRUE
# 40-41  52.537        69.55      FALSE
# 40-42 136.906        84.06       TRUE
# 41-42  84.368        89.30      FALSE


#! library(PMCMR) 
posthoc.kruskal.nemenyi.test(durationmate$Duration.sec ~ durationmate$Temperature.oC, dist="Chisquare") #"Tukey" for no ties 
#            30      39      40      41     
#      39 0.99510 -       -       -      
#      40 0.98682 0.89712 -       -      
#      41 0.63236 0.83424 0.34281 -      
#      42 0.00175 0.00539 0.00033 0.13402






# Unused template calls (they refer to a different data frame, survivalsex):
# dunn.test.control(x=survivalsex$Prop.survivors,g=survivalsex$Treatment, p.adjust="bonferroni")
# dunn.test.control(x=survivalsex$Prop.survivors,g=survivalsex$Treatment)
posthoc.kruskal.dunn.test(x=durationmate$Duration.sec, g=durationmate$Temperature.oC, p.adjust.method="bonferroni") # ULTRA CONSERVATIVE
posthoc.kruskal.dunn.test(x=durationmate$Duration.sec, g=durationmate$Temperature.oC) # THIS METHOD PREFERRED FOR NON PARAMETRIC UNEQUAL SAMPLE SIZES
# 
#         30     39     40      41    
#      39 1.0000 -      -       -     
#      40 1.0000 0.9097 -       -     
#      41 0.5450 0.9097 0.2036  -     
#      42 0.0003 0.0010 4.8e-05 0.0559



# Also see the Games-Howell test, which is more robust to violation of assumptions

# 2) Convert data to ranks and submit to a Welch ANOVA and Tukey HSD, Ruxton (2006)
durationmate$rank <- rank(durationmate$Duration.sec) # ranks pooled across all groups, ascending by duration
# NOTE(review): the Welch test below is run on the raw durations, not on the rank
# column created above, although the heading says ranks. It is left as written so
# that the F value recorded underneath still describes this call - confirm which
# response was intended before reporting.
oneway.test(durationmate$Duration.sec~ durationmate$Temperature.oC, var.equal=FALSE)
# F = 4.1, num df = 4, denom df = 270, p-value = 0.003
# TukeyHSD() needs a fitted aov model (not a oneway.test result). Passing the bare
# function `aov` errors, so fit the one-way model on the ranks explicitly.
# factor() makes sure the grouping term is a factor, which TukeyHSD() requires.
TukeyHSD(aov(rank ~ factor(Temperature.oC), data = durationmate))
#! library(userfriendlyscience)
posthocTGH(y=durationmate$Duration.sec, x=durationmate$Temperature.oC, method="games-howell") # use games-howell when different sample sizes # tukey for equal
#      n means variances
# 30 178    76      4512
# 39 183    73      1578
# 40 174    81      6639
# 41 119    79      2449
# 42  67   126     11577
# 
#        diff ci.lo ci.hi    t  df    p p.adjusted
# 39-30 -3.1 -19.1    13 0.53 286  .98       1.00
# 40-30  4.8 -17.1    27 0.60 335  .97       1.00
# 41-30  3.2 -15.5    22 0.47 292  .99       1.00
# 42-30 49.9  10.6    89 3.54  86  .01        .05
# 40-39  7.9 -10.9    27 1.15 248  .78       1.00
# 41-39  6.3  -8.6    21 1.16 213  .78       1.00
# 42-39 53.0  15.3    91 3.93  73 <.01        .02
# 41-40 -1.6 -22.7    19 0.21 287 1.00       1.00
# 42-40 45.1   4.7    85 3.10  97  .02        .14
# 42-41 46.7   7.9    85 3.36  82  .01        .08






#################################################################################################################################################################### DURATIONMOUNT  ####  OLD METHOD: USE NORMAL > TRY AND TRANSFORM TO NORMAL > NON PARAMETRIC ##################################### 



# Data not normal or homogenous in variances 

#1) Homogeneity of variance not strict assumpation of Kurskall wallace 
kruskal.test(durationmount$Duration.sec ~ durationmount$Temperature.oC)
# Kruskal-Wallis chi-squared =  30, df = 4, p-value = 6e-06 # there is a significant difference between groups
# Posthoc testing 

#! library(pgirmess) 
kruskalmc(durationmount$Duration.sec ~ durationmount$Temperature.oC, probs = 0.05, cont=NULL) # Multiple comparison test after 
# Multiple comparison test after Kruskal-Wallis 
# p.value: 0.05 
# Comparisons
#       obs.dif critical.dif difference
# 30-39   89.41       102.23      FALSE
# 30-40  119.51       101.23       TRUE
# 30-41   23.33       127.42      FALSE
# 30-42  136.70       175.08      FALSE
# 39-40   30.10        96.33      FALSE
# 39-41  112.74       123.57      FALSE
# 39-42  226.11       172.30       TRUE
# 40-41  142.84       122.75       TRUE
# 40-42  256.21       171.71       TRUE
# 41-42  113.37       188.34      FALSE


#! library(PMCMR) 
posthoc.kruskal.nemenyi.test(durationmount$Duration.sec ~ durationmount$Temperature.oC, dist="Chisquare") #"Tukey" for no ties 
#        30     39     40     41    
#      39 0.1955 -      -      -     
#      40 0.0263 0.9422 -      -     
#      41 0.9920 0.1596 0.0300 -     
#      42 0.3062 0.0086 0.0015 0.5807






# dunn.test.control(x=survivalsex$Prop.survivors,g=survivalsex$Treatment, p.adjust="bonferroni")
# dunn.test.control(x=survivalsex$Prop.survivors,g=survivalsex$Treatment)
posthoc.kruskal.dunn.test(x=durationmount$Duration.sec, g=durationmount$Temperature.oC, p.adjust.method="bonferroni") # ULTRA CONSERVATIVE
posthoc.kruskal.dunn.test(x=durationmount$Duration.sec, g=durationmount$Temperature.oC) # THIS METHOD PREFERRED FOR NON PARAMETRIC UNEQUAL SAMPLE SIZES
# 
#         30      39      40      41     
#      39 0.06956 -       -       -      
#      40 0.00721 0.75927 -       -      
#      41 0.75927 0.06178 0.00747 -      
#      42 0.11249 0.00202 0.00027 0.27158



#also see Games-Howell test as more robust to violation of assumptions 

#2) Convert data to ranks and submit to a Welch ANOVA and Tukey HSD (Ruxton, 2006)
durationmount$rank <- rank(durationmount$Duration.sec) # rank all observations across groups, ascending
# NOTE(review): the tests below are run on Duration.sec, not on the rank column
# created above, although the heading describes a rank-based Welch ANOVA -
# confirm which response is intended before reporting.
oneway.test(durationmount$Duration.sec ~ durationmount$Temperature.oC, var.equal = FALSE) # Welch ANOVA
# F = 6.8, num df = 4, denom df = 570, p-value = 2e-05
# TukeyHSD() needs a fitted aov model (it does not accept oneway.test output).
# The bare function `aov` was passed here before, which errors; fit the model explicitly.
# factor() is a no-op if Temperature.oC is already a factor.
TukeyHSD(aov(Duration.sec ~ factor(Temperature.oC), data = durationmount))
#! library(userfriendlyscience)
# Games-Howell for unequal sample sizes / variances; use Tukey when they are equal
posthocTGH(y = durationmount$Duration.sec, x = durationmount$Temperature.oC, method = "games-howell")
# n means variances
# 30 485  11.7        89
# 39 584  10.4        83
# 40 610   9.8        72
# 41 263  12.0        90
# 42 111  13.7        96
# 
# diff ci.lo ci.hi   t   df    p p.adjusted
# 39-30 -1.27 -2.83  0.29 2.2 1018  .17       1.00
# 40-30 -1.89 -3.39 -0.39 3.4  986  .01        .05
# 41-30  0.29 -1.69  2.27 0.4  535  .99       1.00
# 42-30  1.99 -0.84  4.82 1.9  160  .30       1.00
# 40-39 -0.61 -2.01  0.78 1.2 1177  .75       1.00
# 41-39  1.56 -0.34  3.46 2.2  488  .17       1.00
# 42-39  3.27  0.49  6.04 3.2  148  .01        .10
# 41-40  2.17  0.32  4.03 3.2  453  .01        .10
# 42-40  3.88  1.13  6.63 3.9  142 <.01        .01
# 42-41  1.71 -1.32  4.74 1.6  200  .53       1.00



#################################################################################################################################################################### FREQUENCY TOTAL NEW METHOD: USE GLM WITH NON-GAUSSIAN ERROR STRUCTURE######################################################

#### Poisson family error structures
# As the data are very right-skewed counts, fitting a normal distribution does not give normality and homogeneity of variance in the residuals

str(frequencytotal)

### Temperature as factor

# Global Poisson models for total copulation frequency: same predictor,
# three candidate link functions
globalmodposs <- glm(Frequency ~ Temperature.oC, poisson(link = "log"), data = frequencytotal)
globalmodpossID <- glm(Frequency ~ Temperature.oC, poisson(link = "identity"), data = frequencytotal)
globalmodpossRT <- glm(Frequency ~ Temperature.oC, poisson(link = "sqrt"), data = frequencytotal)


summary(globalmodposs); summary(globalmodpossID); summary(globalmodpossRT) # No R^2, AIC given
# AIC: 2507  # link change seems to do little

# Deviance-based pseudo R^2 = (null deviance - residual deviance) / null deviance (Thomas et al., 2015)
pseudoR <- (globalmodposs$null.deviance - globalmodposs$deviance) / globalmodposs$null.deviance
pseudoR # 0.1584
pseudoR <- (globalmodpossID$null.deviance - globalmodpossID$deviance) / globalmodpossID$null.deviance
pseudoR # 0.1584
pseudoR <- (globalmodpossRT$null.deviance - globalmodpossRT$deviance) / globalmodpossRT$null.deviance
pseudoR # 0.1584
# changing the link does nothing to R^2 or AIC


# Small-sample corrected AIC: AICc = -2*logLik + 2k + 2k(k+1)/(n-k-1).
# The earlier version left out the 2k penalty (so "AICc" came out below AIC)
# and hard-coded n = 215; k and n are now read from the fitted model.
AICc <- local({
  ll <- logLik(globalmodposs)
  k <- attr(ll, "df") # number of estimated parameters
  n <- nobs(globalmodposs)
  -2 * as.numeric(ll) + 2 * k + (2 * k * (k + 1)) / (n - k - 1)
}); AICc # previously recorded 2498 (without the 2k term); re-run for the corrected value
# Quasi-AICc: the log-likelihood is scaled by the dispersion estimate c-hat, and
# the parameter count is increased by one because c-hat is itself estimated.
qAICc <- local({
  ll <- logLik(globalmodposs)
  chat <- 7.783 # dispersion value recorded in this section's overdispersion check
  k <- attr(ll, "df") + 1
  n <- nobs(globalmodposs)
  -2 * as.numeric(ll) / chat + 2 * k + (2 * k * (k + 1)) / (n - k - 1)
}); qAICc # previously recorded 74.84 (old formula); re-run for the corrected value


## Overdispersion check
# Standard 2x2 glm diagnostic panel, then restore a single-plot layout
par(mfrow = c(2, 2))
plot(globalmodposs)
par(mfrow = c(1, 1))
# Dispersion parameter (Thomas et al., 2015): residual deviance / residual df,
# i.e. how much variation is left unexplained after fitting the distribution.
# Values > 1 indicate overdispersion.
theta <- deviance(globalmodposs) / df.residual(globalmodposs)
theta # theta = 7.783, some overdispersion
#! library(AER) alternative test
var(frequencytotal$Frequency)   # 146.9
mean(frequencytotal$Frequency)  # 12.87
dispersiontest(globalmodposs)   # dispersion  9.104
table(frequencytotal$Frequency) # 3/215 0s

# quasi poisson best as <15


### assumption checks, recommendation of residual dev (contribution of each obs to resid dev) rather than pearson (Thomas et al., 2015)
# Fit the Poisson model and its quasi-Poisson counterpart for side-by-side checks
globalmodposs <- glm(
  Frequency ~ Temperature.oC,
  family = poisson(link = "log"),
  data = frequencytotal
)
globalmodquasiposs <- glm(
  Frequency ~ Temperature.oC,
  family = quasipoisson(link = "log"),
  data = frequencytotal
)
summary(globalmodposs)
summary(globalmodquasiposs)

# 1) Errors normally distributed? - NOT NECESSARY BUT NOT IMPROVED

# poisson: histogram, normality tests, Q-Q plot and diagnostic panel of deviance residuals
devresid <- residuals(globalmodposs, type = "deviance")
hist(devresid)
shapiro.test(devresid)
ks.test(devresid, pnorm)
qqnorm(devresid, cex = 1.8, pch = 20)
qqline(devresid, lty = 2, lwd = 2)
par(mfrow = c(2, 2))
plot(globalmodposs)
par(mfrow = c(1, 1))

# quasi: same checks on the quasi-Poisson fit
devresid <- residuals(globalmodquasiposs, type = "deviance")
hist(devresid)
shapiro.test(devresid)
ks.test(devresid, pnorm)
qqnorm(devresid, cex = 1.8, pch = 20)
qqline(devresid, lty = 2, lwd = 2)
par(mfrow = c(2, 2))
plot(globalmodquasiposs)
par(mfrow = c(1, 1))
# some positive skew and pull down up right of qq

# 2) Homogenous/homoscedasticity variance of residuals - YES

# poisson
# Deviance residuals of the Poisson fit against its fitted values
devresid <- resid(globalmodposs, type = "deviance")
plot(devresid ~ globalmodposs$fitted.values, pch = 20, cex = 1, cex.lab = 1.5)
# NOTE(review): residual variance is compared across levels of the response
# (Frequency); homogeneity is usually assessed across the predictor groups
# (Temperature.oC) - confirm the intended grouping.
fligner.test(devresid ~ frequencytotal$Frequency)
par(mfrow = c(2, 2)); plot(globalmodposs); par(mfrow = c(1, 1))

# quasi
devresid <- resid(globalmodquasiposs, type = "deviance")
# Plot against the quasi-Poisson model's own fitted values (the Poisson object
# was referenced here before; the axis label now names the model being checked).
plot(devresid ~ globalmodquasiposs$fitted.values, pch = 20, cex = 1, cex.lab = 1.5)
fligner.test(devresid ~ frequencytotal$Frequency)
par(mfrow = c(2, 2)); plot(globalmodquasiposs); par(mfrow = c(1, 1))
#  little trend or wedging, test passed

# 3) Independences of independent variables - NO PATTERN
# Only one independent variable


# 4) No serial auto-correlation with time/space - #ALL DATA COLLECTED AT ONE TIMEPOINT
#! need library(car)
# durbinWatsonTest(globalmodquasiposs)  # passed

# 5) No bias by unduly influential datapoints - YES

# poisson
# Diagnostic panel plus influence measures for the Poisson fit
par(mfrow = c(2, 2))
plot(globalmodposs)
par(mfrow = c(1, 1))
influence <- influence.measures(globalmodposs)
summary(influence)

# quasi: same for the quasi-Poisson fit
par(mfrow = c(2, 2))
plot(globalmodquasiposs)
par(mfrow = c(1, 1))
influence <- influence.measures(globalmodquasiposs)
summary(influence)
# 5 cases in poisson with larger Cook's distance, improved with quasi-Poisson

# 6) Independent variables measured without error - BEST OF ABILITY

## Overdispersion re-check
# residual deviance / residual df of the quasi-Poisson fit
theta <- deviance(globalmodquasiposs) / df.residual(globalmodquasiposs)
theta # 7.783
#AICc<-(-2*logLik(globalmodquasiposs))+((2*5*(5+1)/(215-5-1))); AICc # 688
# model still overdispersed


###  MODEL REFINEMENT

#! Quasi-Poisson improves the model assumptions, in particular Cook's distance, and it also produces more reliable significance values
# models with levels look better because their coefficient estimates are closer, they have less overdispersion and explain a greater proportion of the variation 
# there are only 5 group points unevenly spaced on regression line  (Field book )


# Final quasi-Poisson model and its deviance-based pseudo R^2
globalmodquasiposs <- glm(
  Frequency ~ Temperature.oC,
  family = quasipoisson(link = "log"),
  data = frequencytotal
)
pseudoR <- with(globalmodquasiposs, (null.deviance - deviance) / null.deviance)
pseudoR # 0.1584

## Null model
# Intercept-only model (in a GLMM this would also keep the random effects)
nullmod <- glm(
  Frequency ~ 1,
  family = quasipoisson(link = "log"),
  data = frequencytotal
)
pseudoR <- with(nullmod, (null.deviance - deviance) / null.deviance)
pseudoR # (Thomas et al., 2015) #  -1.171e-16


# liklihood ratio test
library(lmtest)
# lrtest() needs a log-likelihood, which quasi-likelihood fits do not have
# (logLik() is NA for quasipoisson), so the call is disabled and the deviance
# comparison from anova() below is used instead.
# lrtest(globalmodquasiposs, nullmod)

# NOTE(review): ?anova.glm recommends test = "F" when the dispersion is
# estimated (quasi families); the chi-squared test is kept to match the
# recorded output below.
anova(globalmodquasiposs, nullmod, test = "Chisq")
# Model 1: Frequency ~ Temperature.oC
# Model 2: Frequency ~ 1
#   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
# 1       210       1634                         
# 2       214       1942 -4     -308  1.2e-06 ***


# Sequential and single-term-deletion deviance tests for the temperature effect
anova(globalmodquasiposs, test = "Chisq")
drop1(globalmodquasiposs, test = "Chisq")
# Frequency ~ Temperature.oC
#                Df Deviance scaled dev. Pr(>Chi)    
# <none>                1634                         
# Temperature.oC  4     1942          33  1.2e-06 ***


summary(globalmodquasiposs)
# Call:
#      glm(formula = Frequency ~ Temperature.oC, family = quasipoisson(link = "log"), 
#          data = frequencytotal)
# 
# Deviance Residuals: 
#    Min      1Q  Median      3Q     Max  
# -5.474  -2.207  -1.136   0.997  10.765  
# 
# Coefficients:
#                        Estimate Std. Error t value Pr(>|t|)    
#      (Intercept)         2.707      0.112   24.26  < 2e-16 ***
#      Temperature.oC39   -0.101      0.164   -0.62  0.53802    
#      Temperature.oC40    0.221      0.156    1.42  0.15734    
#      Temperature.oC41   -0.544      0.185   -2.94  0.00369 ** 
#      Temperature.oC42   -0.909      0.268   -3.39  0.00083 ***
#      ---
#      Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# (Dispersion parameter for quasipoisson family taken to be 9.32)
# 
# Null deviance: 1941.9  on 214  degrees of freedom
# Residual deviance: 1634.3  on 210  degrees of freedom
# AIC: NA
# 
# Number of Fisher Scoring iterations: 5

# estimates looks ok
# Back-transformed group means on the count scale, computed from the fitted
# coefficients rather than hand-copied rounded estimates (intercept = first
# temperature level; the other levels are intercept + their offset).
local({
  b <- coef(globalmodquasiposs)
  exp(c(b[1], b[1] + b[-1]))
})
# recorded: 14.98 (30), 13.54 (39), 18.69 (40), 8.697 (41), 6.038 (42)

# Descriptive statistics per temperature group, to compare with the values above
describeBy(frequencytotal$Frequency, frequencytotal$Temperature.oC, mat = TRUE)
# item group1 vars  n   mean     sd median trimmed   mad min max range   skew kurtosis    se
# X11    1     30    1 50 14.980 10.466   12.5  13.700 8.154   0  40    40 0.9934 -0.04683 1.480
# X12    2     39    1 48 13.542 10.892    8.0  12.050 3.707   1  54    53 1.5926  2.43362 1.572
# X13    3     40    1 42 18.690 17.546   11.5  15.382 6.672   1  82    81 1.8184  2.82411 2.707
# X14    4     41    1 49  8.694  8.424    6.0   7.366 4.448   0  45    45 2.0587  5.28823 1.203
# X15    5     42    1 26  6.038  5.385    5.0   5.227 4.448   0  25    25 1.7648  3.61372 1.056

tapply(frequencytotal$Frequency, frequencytotal$Temperature.oC, mean)
# 30     39     40     41     42 
# 14.980 13.542 18.690  8.694  6.038 


# Tukey-adjusted pairwise comparisons of the temperature levels
library(lsmeans)
lsmeans(globalmodquasiposs, pairwise ~ Temperature.oC, adjust = "tukey")
# Temperature.oC lsmean     SE df asymp.LCL asymp.UCL
# 30              2.707 0.1116 NA     2.488     2.925
# 39              2.606 0.1197 NA     2.371     2.840
# 40              2.928 0.1090 NA     2.714     3.142
# 41              2.163 0.1479 NA     1.873     2.453
# 42              1.798 0.2437 NA     1.321     2.276
# 
# Results are given on the log (not the response) scale. 
# Confidence level used: 0.95 
# 
# $contrasts
# contrast estimate     SE df z.ratio p.value
# 30 - 39    0.1009 0.1637 NA   0.617  0.9725
# 30 - 40   -0.2213 0.1559 NA  -1.419  0.6153
# 30 - 41    0.5441 0.1853 NA   2.937  0.0274 <--
# 30 - 42    0.9086 0.2680 NA   3.391  0.0063 <--
# 39 - 40   -0.3222 0.1619 NA  -1.990  0.2706
# 39 - 41    0.4432 0.1903 NA   2.329  0.1358
# 39 - 42    0.8076 0.2715 NA   2.975  0.0245 <--
# 40 - 41    0.7654 0.1837 NA   4.166  0.0003 <--
# 40 - 42    1.1299 0.2669 NA   4.233  0.0002 <--
# 41 - 42    0.3645 0.2850 NA   1.279  0.7044 





library(multcomp)
# Tukey all-pairwise contrasts on the model fitted in this section. The call
# previously referenced globalmodgammalog, which is not the frequency-total model.
summary(glht(globalmodquasiposs, mcp(Temperature.oC = "Tukey")))




#################################################################################################################################################################### FREQUENCY MATE NEW METHOD: USE GLM WITH NON-GAUSSIAN ERROR STRUCTURE######################################################


############### ! NATURE COMM PAPER MATING FREQUENCY MODEL REFINEMENT ##############

#### Poisson family error structures
# As the data are very right-skewed counts, fitting a normal distribution does not give normality and homogeneity of variance in the residuals

str(frequencymate)

### Temperature as factor

# Global Poisson models for mating frequency: same predictor,
# three candidate link functions
globalmodposs <- glm(Frequency ~ Temperature.oC, poisson(link = "log"), data = frequencymate)
globalmodpossID <- glm(Frequency ~ Temperature.oC, poisson(link = "identity"), data = frequencymate)
globalmodpossRT <- glm(Frequency ~ Temperature.oC, poisson(link = "sqrt"), data = frequencymate)


summary(globalmodposs); summary(globalmodpossID); summary(globalmodpossRT) # No R^2, AIC given
# AIC: 590.  # link change seems to do little

# Deviance-based pseudo R^2 = (null deviance - residual deviance) / null deviance (Thomas et al., 2015)
pseudoR <- (globalmodposs$null.deviance - globalmodposs$deviance) / globalmodposs$null.deviance
pseudoR # 0.1634
pseudoR <- (globalmodpossID$null.deviance - globalmodpossID$deviance) / globalmodpossID$null.deviance
pseudoR # 0.1634
pseudoR <- (globalmodpossRT$null.deviance - globalmodpossRT$deviance) / globalmodpossRT$null.deviance
pseudoR # 0.1634
# changing the link does nothing to R^2 or AIC


# Small-sample corrected AIC: AICc = -2*logLik + 2k + 2k(k+1)/(n-k-1).
# The earlier version left out the 2k penalty (so "AICc" came out below AIC)
# and hard-coded n = 108; k and n are now read from the fitted model.
AICc <- local({
  ll <- logLik(globalmodposs)
  k <- attr(ll, "df") # number of estimated parameters
  n <- nobs(globalmodposs)
  -2 * as.numeric(ll) + 2 * k + (2 * k * (k + 1)) / (n - k - 1)
}); AICc # previously recorded 580.9 (without the 2k term); re-run for the corrected value
# Quasi-AICc: the log-likelihood is scaled by the dispersion estimate c-hat, and
# the parameter count is increased by one because c-hat is itself estimated.
qAICc <- local({
  ll <- logLik(globalmodposs)
  chat <- 1.675 # dispersion value recorded from dispersiontest() in this section
  k <- attr(ll, "df") + 1
  n <- nobs(globalmodposs)
  -2 * as.numeric(ll) / chat + 2 * k + (2 * k * (k + 1)) / (n - k - 1)
}); qAICc # previously recorded 347 (old formula); re-run for the corrected value


## Overdispersion check
# Standard 2x2 glm diagnostic panel, then restore a single-plot layout
par(mfrow = c(2, 2))
plot(globalmodposs)
par(mfrow = c(1, 1))
# Dispersion parameter (Thomas et al., 2015): residual deviance / residual df,
# i.e. how much variation is left unexplained after fitting the distribution.
# Values > 1 indicate overdispersion.
theta <- deviance(globalmodposs) / df.residual(globalmodposs)
theta # theta = 1.832, judged acceptable for a Poisson model in the note below
#! library(AER) alternative test
var(frequencymate$Frequency)   # 13.
mean(frequencymate$Frequency)  # 7.157
dispersiontest(globalmodposs)  # dispersion  1.675
table(frequencymate$Frequency) # 2/108 0s

# poisson fine


### assumption checks, recommendation of residual dev (contribution of each obs to resid dev) rather than pearson (Thomas et al., 2015)
# Refit the Poisson model used for the assumption checks
globalmodposs <- glm(
  Frequency ~ Temperature.oC,
  family = poisson(link = "log"),
  data = frequencymate
)


# 1) Errors normally distributed? - YES

# poisson: histogram, normality tests, Q-Q plot and diagnostic panel of deviance residuals
devresid <- residuals(globalmodposs, type = "deviance")
hist(devresid)
shapiro.test(devresid)
ks.test(devresid, pnorm)
qqnorm(devresid, cex = 1.8, pch = 20)
qqline(devresid, lty = 2, lwd = 2)
par(mfrow = c(2, 2))
plot(globalmodposs)
par(mfrow = c(1, 1))
# pretty symetrical, test passed

# 2) Homogenous/homoscedasticity variance of residuals - YES

# poisson
# Deviance residuals against fitted values, Fligner-Killeen test, diagnostic panel
devresid <- residuals(globalmodposs, type = "deviance")
plot(devresid ~ globalmodposs$fitted.values, pch = 20, cex = 1, cex.lab = 1.5)
# NOTE(review): residuals are grouped by the response (Frequency) here - confirm
# that this, rather than Temperature.oC, is the intended grouping.
fligner.test(devresid ~ frequencymate$Frequency)
par(mfrow = c(2, 2))
plot(globalmodposs)
par(mfrow = c(1, 1))
#  little trend or wedging, test passed

# 3) Independences of independent variables - NO PATTERN
# Only one independent variable


# 4) No serial auto-correlation with time/space - #ALL DATA COLLECTED AT ONE TIMEPOINT
#! need library(car)
# durbinWatsonTest(globalmodquasiposs)  # passed

# 5) No bias by unduly influential datapoints - YES

# poisson
# Diagnostic panel plus influence measures for the Poisson fit
par(mfrow = c(2, 2))
plot(globalmodposs)
par(mfrow = c(1, 1))
influence <- influence.measures(globalmodposs)
summary(influence)
# 5 minor cases 

# 6) Independent variables measured without error - BEST OF ABILITY


###  MODEL REFINEMENT
# there are only 5 group points unevenly spaced on regression line  (Field book )

# Final Poisson model and its deviance-based pseudo R^2
globalmodposs <- glm(
  Frequency ~ Temperature.oC,
  family = poisson(link = "log"),
  data = frequencymate
)
pseudoR <- with(globalmodposs, (null.deviance - deviance) / null.deviance)
pseudoR # 0.1634

## Null model
# Intercept-only model (in a GLMM this would also keep the random effects)
nullmod <- glm(
  Frequency ~ 1,
  family = poisson(link = "log"),
  data = frequencymate
)
pseudoR <- with(nullmod, (null.deviance - deviance) / null.deviance)
pseudoR # (Thomas et al., 2015) #  2.52e-16




############### ! NATURE COMM PAPER MATING FREQUENCY MODEL SIGNIFICANCE ##############

# liklihood ratio test
library(lmtest)
# Likelihood ratio test of the temperature model against the intercept-only model
lrtest(
  globalmodposs,
  nullmod
)
# Model 1: Frequency ~ Temperature.oC
# Model 2: Frequency ~ 1
#    Df LogLik Df Chisq Pr(>Chisq)    
# 1   5   -290                        
# 2   1   -309 -4  36.9    1.9e-07 ***


# Same comparison as an analysis of deviance
anova(
  globalmodposs,
  nullmod,
  test = "Chisq"
)
# Model 1: Frequency ~ Temperature.oC
# Model 2: Frequency ~ 1
# Resid. Df Resid. Dev Df Deviance Pr(>Chi)    
# 1       103        189                         
# 2       107        226 -4    -36.9  1.9e-07 ***

#!!!! The deviance difference here equals the Chisq statistic from lrtest

# Sequential and single-term-deletion tests for the temperature effect
anova(globalmodposs, test = "Chisq")
drop1(globalmodposs, test = "Chisq")
# Frequency ~ Temperature.oC
#                Df Deviance AIC  LRT Pr(>Chi)    
# <none>                 189 590                  
# Temperature.oC  4      226 619 36.9  1.9e-07 ***


############### ! NATURE COMM PAPER MATING FREQUENCY MODEL POST HOC TESTS ##############

summary(globalmodposs)
# Call:
#      glm(formula = Frequency ~ Temperature.oC, family = poisson(link = "log"), 
#          data = frequencymate)
# 
# Deviance Residuals: 
#      Min      1Q  Median      3Q     Max  
# -3.367  -0.858  -0.040   0.560   4.024  
# 
# Coefficients:
#                       Estimate Std. Error z value Pr(>|z|)    
#      (Intercept)        2.2105     0.0662   33.38  < 2e-16 ***
#      Temperature.oC39  -0.2705     0.1019   -2.66   0.0079 ** 
#      Temperature.oC40  -0.0902     0.1005   -0.90   0.3694    
#      Temperature.oC41  -0.4759     0.1083   -4.39  1.1e-05 ***
#      Temperature.oC42  -0.6448     0.1390   -4.64  3.5e-06 ***
#      ---
#      Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# (Dispersion parameter for poisson family taken to be 1)
# 
# Null deviance: 225.57  on 107  degrees of freedom
# Residual deviance: 188.71  on 103  degrees of freedom
# AIC: 590.3
# 
# Number of Fisher Scoring iterations: 5

# estimates looks ok
# Back-transformed group means on the count scale, computed from the fitted
# coefficients rather than hand-copied rounded estimates (intercept = first
# temperature level; the other levels are intercept + their offset).
local({
  b <- coef(globalmodposs)
  exp(c(b[1], b[1] + b[-1]))
})
# recorded: 9.12 (30), 6.959 (39), 8.334 (40), 5.667 (41), 4.786 (42)

# Descriptive statistics per temperature group, to compare with the values above
describeBy(frequencymate$Frequency, frequencymate$Temperature.oC, mat = TRUE)
#     item group1 vars  n  mean    sd median trimmed   mad min max range    skew kurtosis     se
# X11    1     30    1 25 9.120 3.395    8.0   9.143 4.448   4  14    10  0.1649  -1.3684 0.6790
# X12    2     39    1 24 6.958 1.756    8.0   7.350 0.000   1   8     7 -2.0155   3.6827 0.3585
# X13    3     40    1 21 8.333 3.246    8.0   8.294 2.965   1  16    15  0.1247   0.0854 0.7082
# X14    4     41    1 24 5.667 4.135    5.0   5.350 4.448   0  15    15  0.6199  -0.6971 0.8441
# X15    5     42    1 14 4.786 4.098    4.5   4.250 3.707   0  16    16  1.2446   1.3139 1.0953

tapply(frequencymate$Frequency, frequencymate$Temperature.oC, mean)
# 30    39    40    41    42 
# 9.120 6.958 8.333 5.667 4.786 




# Tukey-adjusted pairwise comparisons of the temperature levels
library(lsmeans)
lsmeans(globalmodposs, pairwise ~ Temperature.oC, adjust = "tukey")
# Temperature.oC lsmean      SE df asymp.LCL asymp.UCL
# 30              2.210 0.06623 NA     2.081     2.340
# 39              1.940 0.07738 NA     1.788     2.092
# 40              2.120 0.07559 NA     1.972     2.268
# 41              1.735 0.08575 NA     1.567     1.903
# 42              1.566 0.12217 NA     1.326     1.805
# 
# Results are given on the log (not the response) scale. 
# Confidence level used: 0.95 
# 
# $contrasts
# contrast estimate     SE df z.ratio p.value
# 30 - 39   0.27053 0.1019 NA   2.656  0.0607
# 30 - 40   0.09021 0.1005 NA   0.898  0.8979
# 30 - 41   0.47587 0.1083 NA   4.392  0.0001 <--
# 30 - 42   0.64483 0.1390 NA   4.640  <.0001 <--
# 39 - 40  -0.18032 0.1082 NA  -1.667  0.4546
# 39 - 41   0.20534 0.1155 NA   1.778  0.3866
# 39 - 42   0.37430 0.1446 NA   2.588  0.0725
# 40 - 41   0.38566 0.1143 NA   3.374  0.0067 <--
# 40 - 42   0.55463 0.1437 NA   3.861  0.0011 <--
# 41 - 42   0.16897 0.1493 NA   1.132  0.7896


library(multcomp)
# Tukey all-pairwise contrasts on the model fitted in this section. The call
# previously referenced globalmodgammalog, which is not the mating-frequency model.
summary(glht(globalmodposs, mcp(Temperature.oC = "Tukey")))




#################################################################################################################################################################### FREQUENCY MOUNT NEW METHOD: USE GLM WITH NON-GAUSSIAN ERROR STRUCTURE######################################################

#### Poisson family error structures
# As the data are very right-skewed counts, fitting a normal distribution does not give normality and homogeneity of variance in the residuals

str(frequencymount)

### Temperature as factor

# Global Poisson models for mounting frequency: same predictor,
# three candidate link functions
globalmodposs <- glm(Frequency ~ Temperature.oC, poisson(link = "log"), data = frequencymount)
globalmodpossID <- glm(Frequency ~ Temperature.oC, poisson(link = "identity"), data = frequencymount)
globalmodpossRT <- glm(Frequency ~ Temperature.oC, poisson(link = "sqrt"), data = frequencymount)


summary(globalmodposs); summary(globalmodpossID); summary(globalmodpossRT) # No R^2, AIC given
# AIC: 1335  # link change seems to do little

# Deviance-based pseudo R^2 = (null deviance - residual deviance) / null deviance (Thomas et al., 2015)
pseudoR <- (globalmodposs$null.deviance - globalmodposs$deviance) / globalmodposs$null.deviance
pseudoR # 0.2559
pseudoR <- (globalmodpossID$null.deviance - globalmodpossID$deviance) / globalmodpossID$null.deviance
pseudoR # 0.2559
pseudoR <- (globalmodpossRT$null.deviance - globalmodpossRT$deviance) / globalmodpossRT$null.deviance
pseudoR # 0.2559
# changing the link does nothing to R^2 or AIC


# Small-sample corrected AIC: AICc = -2*logLik + 2k + 2k(k+1)/(n-k-1).
# The earlier version left out the 2k penalty (so "AICc" came out below AIC)
# and hard-coded n = 107; k and n are now read from the fitted model.
AICc <- local({
  ll <- logLik(globalmodposs)
  k <- attr(ll, "df") # number of estimated parameters
  n <- nobs(globalmodposs)
  -2 * as.numeric(ll) + 2 * k + (2 * k * (k + 1)) / (n - k - 1)
}); AICc # previously recorded 1326 (without the 2k term); re-run for the corrected value
# Quasi-AICc: the log-likelihood is scaled by the dispersion estimate c-hat, and
# the parameter count is increased by one because c-hat is itself estimated.
qAICc <- local({
  ll <- logLik(globalmodposs)
  chat <- 8.299 # dispersion value recorded from dispersiontest() in this section
  k <- attr(ll, "df") + 1
  n <- nobs(globalmodposs)
  -2 * as.numeric(ll) / chat + 2 * k + (2 * k * (k + 1)) / (n - k - 1)
}); qAICc # previously recorded 160.3 (old formula); re-run for the corrected value


## Overdispersion check
# Standard 2x2 glm diagnostic panel, then restore a single-plot layout
par(mfrow = c(2, 2))
plot(globalmodposs)
par(mfrow = c(1, 1))
# Dispersion parameter (Thomas et al., 2015): residual deviance / residual df,
# i.e. how much variation is left unexplained after fitting the distribution.
# Values > 1 indicate overdispersion.
theta <- deviance(globalmodposs) / df.residual(globalmodposs)
theta # theta = 8.369, some overdispersion
#! library(AER) alternative test
var(frequencymount$Frequency)   # 216.3
mean(frequencymount$Frequency)  # 18.
dispersiontest(globalmodposs)   # dispersion  8.299
table(frequencymount$Frequency) # 1/107 0s

# quasi poisson best as <15


### assumption checks, recommendation of residual dev (contribution of each obs to resid dev) rather than pearson (Thomas et al., 2015)
# Fit the Poisson model and its quasi-Poisson counterpart for side-by-side checks
globalmodposs <- glm(
  Frequency ~ Temperature.oC,
  family = poisson(link = "log"),
  data = frequencymount
)
globalmodquasiposs <- glm(
  Frequency ~ Temperature.oC,
  family = quasipoisson(link = "log"),
  data = frequencymount
)
summary(globalmodposs)
summary(globalmodquasiposs)

# 1) Errors normally distributed? - NOT NECESSARY BUT NOT IMPROVED

# poisson: histogram, normality tests, Q-Q plot and diagnostic panel of deviance residuals
devresid <- residuals(globalmodposs, type = "deviance")
hist(devresid)
shapiro.test(devresid)
ks.test(devresid, pnorm)
qqnorm(devresid, cex = 1.8, pch = 20)
qqline(devresid, lty = 2, lwd = 2)
par(mfrow = c(2, 2))
plot(globalmodposs)
par(mfrow = c(1, 1))

# quasi: same checks on the quasi-Poisson fit
devresid <- residuals(globalmodquasiposs, type = "deviance")
hist(devresid)
shapiro.test(devresid)
ks.test(devresid, pnorm)
qqnorm(devresid, cex = 1.8, pch = 20)
qqline(devresid, lty = 2, lwd = 2)
par(mfrow = c(2, 2))
plot(globalmodquasiposs)
par(mfrow = c(1, 1))
# minor positive skew and positive kurtosis, test failed

# 2) Homogenous/homoscedasticity variance of residuals - YES

# poisson
# Deviance residuals of the Poisson fit against its fitted values
devresid <- resid(globalmodposs, type = "deviance")
plot(devresid ~ globalmodposs$fitted.values, pch = 20, cex = 1, cex.lab = 1.5)
# NOTE(review): residual variance is compared across levels of the response
# (Frequency); homogeneity is usually assessed across the predictor groups
# (Temperature.oC) - confirm the intended grouping.
fligner.test(devresid ~ frequencymount$Frequency)
par(mfrow = c(2, 2)); plot(globalmodposs); par(mfrow = c(1, 1))

# quasi
devresid <- resid(globalmodquasiposs, type = "deviance")
# Plot against the quasi-Poisson model's own fitted values (the Poisson object
# was referenced here before; the axis label now names the model being checked).
plot(devresid ~ globalmodquasiposs$fitted.values, pch = 20, cex = 1, cex.lab = 1.5)
fligner.test(devresid ~ frequencymount$Frequency)
par(mfrow = c(2, 2)); plot(globalmodquasiposs); par(mfrow = c(1, 1))
#  little trend or wedging, test passed

# 3) Independences of independent variables - NO PATTERN
# Only one independent variable


# 4) No serial auto-correlation with time/space - #ALL DATA COLLECTED AT ONE TIMEPOINT
#! need library(car)
# durbinWatsonTest(globalmodquasiposs)  # passed

# 5) No bias by unduly influential datapoints - YES

# poisson
# Diagnostic panel plus influence measures for the Poisson fit
par(mfrow = c(2, 2))
plot(globalmodposs)
par(mfrow = c(1, 1))
influence <- influence.measures(globalmodposs)
summary(influence)

# quasi: same for the quasi-Poisson fit
par(mfrow = c(2, 2))
plot(globalmodquasiposs)
par(mfrow = c(1, 1))
influence <- influence.measures(globalmodquasiposs)
summary(influence)
# 3 cases in poisson with larger Cook's distance, improved with quasi-Poisson

# 6) Independent variables measured without error - BEST OF ABILITY

## Overdispersion re-check
# residual deviance / residual df of the quasi-Poisson fit
theta <- deviance(globalmodquasiposs) / df.residual(globalmodquasiposs)
theta # recorded as 7.783, which matches the frequency-total section; the Poisson check in this section recorded 8.369 - re-run to confirm
#AICc<-(-2*logLik(globalmodquasiposs))+((2*5*(5+1)/(215-5-1))); AICc # 688
# model still overdispersed


###  MODEL REFINEMENT

#! Quasi-Poisson improves the model assumptions, in particular Cook's distance, and it also produces more reliable significance values
# models with levels look better because their coefficient estimates are closer, they have less overdispersion and explain a greater proportion of the variation 
# there are only 5 group points unevenly spaced on regression line  (Field book )


# Final quasi-Poisson model and its deviance-based pseudo R^2
globalmodquasiposs <- glm(
  Frequency ~ Temperature.oC,
  family = quasipoisson(link = "log"),
  data = frequencymount
)
pseudoR <- with(globalmodquasiposs, (null.deviance - deviance) / null.deviance)
pseudoR # recorded as 0.1584, which matches the frequency-total section; the Poisson fit in this section recorded 0.2559 - re-run to confirm

## Null model
# Intercept-only model (in a GLMM this would also keep the random effects)
nullmod <- glm(
  Frequency ~ 1,
  family = quasipoisson(link = "log"),
  data = frequencymount
)
pseudoR <- with(nullmod, (null.deviance - deviance) / null.deviance)
pseudoR # (Thomas et al., 2015) #  -1.171e-16


# liklihood ratio test
library(lmtest)
# lrtest() needs a log-likelihood, which quasi-likelihood fits do not have
# (logLik() is NA for quasipoisson), so the call is disabled and the deviance
# comparison from anova() below is used instead.
# lrtest(globalmodquasiposs, nullmod)

# NOTE(review): ?anova.glm recommends test = "F" when the dispersion is
# estimated (quasi families); the chi-squared test is kept to match the
# recorded output below.
anova(globalmodquasiposs, nullmod, test = "Chisq")
# Model 1: Frequency ~ Temperature.oC
# Model 2: Frequency ~ 1
#        Resid. Df Resid. Dev Df Deviance Pr(>Chi)    
# 1       102        854                         
# 2       106       1147 -4     -294  8.5e-07 ***


# Sequential and single-term-deletion deviance tests for the temperature effect
anova(globalmodquasiposs, test = "Chisq")
drop1(globalmodquasiposs, test = "Chisq")
# Frequency ~ Temperature.oC
#                Df Deviance scaled dev. Pr(>Chi)    
# <none>                 854                         
# Temperature.oC  4     1147        33.7  8.5e-07 ***


summary(globalmodquasiposs)
# Call:
#      glm(formula = Frequency ~ Temperature.oC, family = quasipoisson(link = "log"), 
#          data = frequencymount)
# 
# Deviance Residuals: 
#      Min      1Q  Median      3Q     Max  
# -6.456  -2.381  -0.576   1.573   8.018  
# 
# Coefficients:
#                      Estimate Std. Error t value Pr(>|t|)    
#      (Intercept)        3.0369     0.1293   23.49   <2e-16 ***
#      Temperature.oC39  -0.0349     0.1864   -0.19   0.8518    
#      Temperature.oC40   0.3321     0.1760    1.89   0.0621 .  
#      Temperature.oC41  -0.5859     0.2162   -2.71   0.0079 ** 
#      Temperature.oC42  -1.0220     0.3368   -3.03   0.0031 ** 
#      ---
#      Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# (Dispersion parameter for quasipoisson family taken to be 8.706)
# 
# Null deviance: 1147.16  on 106  degrees of freedom
# Residual deviance:  853.65  on 102  degrees of freedom
# AIC: NA
# 
# Number of Fisher Scoring iterations: 5


# estimates looks ok
# Back-transformed group means on the count scale, computed from the fitted
# coefficients rather than hand-copied rounded estimates (intercept = first
# temperature level; the other levels are intercept + their offset).
local({
  b <- coef(globalmodquasiposs)
  exp(c(b[1], b[1] + b[-1]))
})
# recorded: 20.84 (30), 20.13 (39), 29.05 (40), 11.6 (41), 7.5 (42)

# Descriptive statistics per temperature group, to compare with the values above
describeBy(frequencymount$Frequency, frequencymount$Temperature.oC, mat = TRUE)
#     item group1 vars  n  mean    sd median trimmed    mad min max range    skew kurtosis    se
# X11    1     30    1 25 20.84 11.86   19.0   20.95 13.343   0  40    40 0.05606  -1.1696 2.371
# X12    2     39    1 24 20.12 12.20   18.5   19.30  9.637   2  54    52 0.75245   0.3815 2.491
# X13    3     40    1 21 29.05 19.88   22.0   26.24 13.343   9  82    73 1.05012   0.1360 4.339
# X14    4     41    1 25 11.60 10.37    7.0   10.24  5.930   1  45    44 1.44704   1.8951 2.074
# X15    5     42    1 12  7.50  6.46    6.5    6.40  5.189   1  25    24 1.47286   1.6756 1.865

tapply(frequencymount$Frequency, frequencymount$Temperature.oC, mean)
# 30    39    40    41    42 
# 20.84 20.12 29.05 11.60  7.50 


# Tukey-adjusted pairwise comparisons of the temperature levels
library(lsmeans)
lsmeans(globalmodquasiposs, pairwise ~ Temperature.oC, adjust = "tukey")
# Temperature.oC lsmean     SE df asymp.LCL asymp.UCL
# 30              3.037 0.1293 NA     2.784     3.290
# 39              3.002 0.1343 NA     2.739     3.265
# 40              3.369 0.1195 NA     3.135     3.603
# 41              2.451 0.1733 NA     2.111     2.791
# 42              2.015 0.3110 NA     1.405     2.624
# 
# Results are given on the log (not the response) scale. 
# Confidence level used: 0.95 
# 
# $contrasts
# contrast estimate     SE df z.ratio p.value
# 30 - 39   0.03491 0.1864 NA   0.187  0.9997
# 30 - 40  -0.33206 0.1760 NA  -1.887  0.3245
# 30 - 41   0.58587 0.2162 NA   2.710  0.0525
# 30 - 42   1.02197 0.3368 NA   3.034  0.0204 <--
# 39 - 40  -0.36697 0.1797 NA  -2.042  0.2459
# 39 - 41   0.55096 0.2192 NA   2.514  0.0876
# 39 - 42   0.98706 0.3388 NA   2.914  0.0294
# 40 - 41   0.91793 0.2105 NA   4.362  0.0001 <--
# 40 - 42   1.35403 0.3332 NA   4.064  0.0005 <--
# 41 - 42   0.43610 0.3560 NA   1.225  0.7368




library(multcomp)
# Tukey all-pairwise contrasts on the model fitted in this section. The call
# previously referenced globalmodgammalog, which is not the mounting-frequency model.
summary(glht(globalmodquasiposs, mcp(Temperature.oC = "Tukey")))



#################################################################################################################################################################### DURATIONTOTAL ### NEW METHOD: USE GLM WITH NON-GAUSSIAN ERROR STRUCTURE######################################################

str(durationtotal)

#### Gaussian error 
# Data are a right-skewed continuous variable; fitting a normal distribution does not give normality and homogeneity of variance in the residuals


#### Creating a global model gaussian identity
globalmodgauss <- glm(Duration.sec ~ Temperature.oC, gaussian(link = "identity"), data = durationtotal)

summary(globalmodgauss) # No R^2, AIC 29232
# Small-sample corrected AIC: AICc = -2*logLik + 2k + 2k(k+1)/(n-k-1).
# The earlier version left out the 2k penalty and hard-coded p = 5, n = 2774;
# k (taken from logLik's df attribute) and n are now read from the fitted model.
AICc <- local({
  ll <- logLik(globalmodgauss)
  k <- attr(ll, "df")
  n <- nobs(globalmodgauss)
  -2 * as.numeric(ll) + 2 * k + (2 * k * (k + 1)) / (n - k - 1)
}); AICc # previously recorded 29220.25 (without the 2k term); re-run for the corrected value
# Deviance-based pseudo R^2 (Thomas et al., 2015)
pseudoR <- (globalmodgauss$null.deviance - globalmodgauss$deviance) / globalmodgauss$null.deviance
pseudoR # 0.02459197
# Squared correlation between observed and fitted durations
R2 <- cor(durationtotal$Duration.sec, predict(globalmodgauss))^2; R2 # 0.02459197



# 1) Errors normally distributed? - NO
# Standardised residuals: centred on their mean and scaled by their SD
sresid <- (globalmodgauss$residuals - mean(globalmodgauss$residuals)) / sd(globalmodgauss$residuals)
hist(sresid)
shapiro.test(sresid); ks.test(sresid, pnorm)
par(mfrow = c(2, 2)); plot(globalmodgauss); par(mfrow = c(1, 1))
# P2 Q-Q points pull up on right side, sresid histogram positively skewed, normality tests p<0.01. 

# 2) Homogenous/homoscedasticity variance of residuals - NO
par(mfrow = c(2, 2)); plot(globalmodgauss); par(mfrow = c(1, 1))
plot(sresid ~ globalmodgauss$fitted.values, pch = 20, cex = 2, cex.lab = 1.5)
# Residual spread is compared across the temperature groups (the model's only
# predictor). These calls previously grouped by the numeric response Duration.sec,
# which car::leveneTest rejects (quantitative explanatory variable).
# factor() is a no-op if Temperature.oC is already a factor.
plot(sresid ~ factor(durationtotal$Temperature.oC))
fligner.test(sresid ~ factor(durationtotal$Temperature.oC))
leveneTest(sresid ~ factor(durationtotal$Temperature.oC))
# Recorded with the earlier grouping: P1 Resids~Fitted and P3 SQRT(Resid~Fitted) show a wedge, 30oC greater variance, test <0.01 - re-run to confirm

# 3) Independences of independent variables - YES
# Only one independent variable # would use VIFs as cannot use cor.plots with categories

# 4) No serial auto-correlation with time/space - NO
#! need library(car)
durbinWatsonTest(globalmodgauss) # Test failed

# 5) No bias by unduly influential datapoints - NOT STRONGLY
influence <- influence.measures(globalmodgauss); summary(influence)
par(mfrow = c(2, 2)); plot(globalmodgauss); par(mfrow = c(1, 1))
# C4,14,22 cooks distances 0.19-0.41; confirmed on P4 stdzd resids~leverage,
# still below cooks threshold >1 or samplesize (49)/4 (Fox, 1991)

# 6) Independent variables measured without error - BEST OF ABILITY



#### Creating a global model gaussian log
globalmodgausslog <- glm(Duration.sec ~ Temperature.oC, gaussian(link = "log"), data = durationtotal)

summary(globalmodgausslog) # No R^2
# NOTE(review): the values recorded in this block before (AIC 18.075, R2 0.2370825)
# look carried over from another analysis - re-run and record fresh values.
# Small-sample corrected AIC: AICc = -2*logLik + 2k + 2k(k+1)/(n-k-1).
# The earlier version left out the 2k penalty and hard-coded p and n;
# k (from logLik's df attribute) and n are now read from the fitted model.
AICc <- local({
  ll <- logLik(globalmodgausslog)
  k <- attr(ll, "df")
  n <- nobs(globalmodgausslog)
  -2 * as.numeric(ll) + 2 * k + (2 * k * (k + 1)) / (n - k - 1)
}); AICc
# Deviance-based pseudo R^2 (Thomas et al., 2015)
pseudoR <- (globalmodgausslog$null.deviance - globalmodgausslog$deviance) / globalmodgausslog$null.deviance
pseudoR # 0.02459197
# Squared correlation between observed and fitted durations. This previously used
# testessize$Testes.size, which is not the data this model was fitted to;
# predictions are taken on the response scale because the link is log.
R2 <- cor(durationtotal$Duration.sec, predict(globalmodgausslog, type = "response"))^2; R2

# 1) Errors normally distributed? - IMPROVED
# Standardised residuals: centred on their mean and scaled by their SD
# NOTE(review): for a non-identity link, `$residuals` holds the working residuals
# rather than response-scale residuals - confirm this is what is wanted.
sresid <- (globalmodgausslog$residuals - mean(globalmodgausslog$residuals)) / sd(globalmodgausslog$residuals)
hist(sresid)
shapiro.test(sresid); ks.test(sresid, pnorm)
par(mfrow = c(2, 2)); plot(globalmodgausslog); par(mfrow = c(1, 1))
# P2 Q-Q points pull up on right side, sresid histogram slightly positively skewed, normality tests p<0.01. 

# 2) Homogenous/homoscedasticity variance of residuals - IMPROVED
par(mfrow = c(2, 2)); plot(globalmodgausslog); par(mfrow = c(1, 1))
plot(sresid ~ globalmodgausslog$fitted.values, pch = 20, cex = 2, cex.lab = 1.5)
# Group by the temperature treatment of the data this model was fitted to.
# These calls previously referenced testessize$Temperature.oC, a different data set.
# factor() is a no-op if Temperature.oC is already a factor.
plot(sresid ~ factor(durationtotal$Temperature.oC))
fligner.test(sresid ~ factor(durationtotal$Temperature.oC))
leveneTest(sresid ~ factor(durationtotal$Temperature.oC))
# Recorded before the fix: P1 Resids~Fitted and P3 SQRT(Resid~Fitted) show a wedge, 30oC greater variance, test >0.05 - re-run to confirm

# 3) Independences of independent variables - YES
# Only one independent variable # would use VIFs as cannot use cor.plots with categories

# 4) No serial auto-correlation with time/space - NO
#! need library(car)
durbinWatsonTest(globalmodgausslog) # Test failed

# 5) No bias by unduly influential datapoints - NOT STRONGLY
influence <- influence.measures(globalmodgausslog); summary(influence)
par(mfrow = c(2, 2)); plot(globalmodgausslog); par(mfrow = c(1, 1))
# C4,14,22 cooks distances 0.19-0.41; confirmed on P4 stdzd resids~leverage,
# still below cooks threshold >1 or samplesize (49)/4 (Fox, 1991)

# 6) Independent variables measured without error - BEST OF ABILITY



#### Creating a global model gaussian sqrt
# Gaussian GLM with a square-root link for total copulation duration.
globalmodgaussSR <- glm(Duration.sec ~ Temperature.oC, gaussian(link = "sqrt"), data = durationtotal)
summary(globalmodgaussSR) # No R^2, AIC 29232
# Small-sample corrected AIC: AICc = AIC + 2k(k+1)/(n-k-1).
# k and n now come from the fitted model; the earlier expression hard-coded
# p = 1 and n = 49 from a different data set and omitted the 2k penalty.
# qAICc <- (-2 * logLik(model1) / Theta) + 2 * p + (2 * p * (p + 1)) / (n - p - 1)
AICc <- local({
  k <- attr(logLik(globalmodgaussSR), "df")
  n <- nobs(globalmodgaussSR)
  AIC(globalmodgaussSR) + (2 * k * (k + 1)) / (n - k - 1)
})
AICc # value noted here before: 29232 - rerun to refresh
pseudoR <- (globalmodgaussSR$null.deviance - globalmodgaussSR$deviance) / globalmodgaussSR$null.deviance # (thomas et al., 2015)
pseudoR # 0.02459197






#### Creating a global model gamma 
# Gamma GLM (identity link) for total copulation duration by temperature treatment.
globalmodgamma <- glm(Duration.sec ~ Temperature.oC, Gamma(link = "identity"), data = durationtotal)

summary(globalmodgamma) # No R^2, AIC 24069
# Small-sample corrected AIC: AICc = AIC + 2k(k+1)/(n-k-1), with k (estimated
# parameters, as counted by logLik) and n read from the fitted model.
# The earlier expression omitted the 2k penalty term.
# qAICc <- (-2 * logLik(model1) / Theta) + 2 * p + (2 * p * (p + 1)) / (n - p - 1)
AICc <- local({
  k <- attr(logLik(globalmodgamma), "df")
  n <- nobs(globalmodgamma)
  AIC(globalmodgamma) + (2 * k * (k + 1)) / (n - k - 1)
})
AICc # recorded under the old formula (no 2k penalty): 24056.87
pseudoR <- (globalmodgamma$null.deviance - globalmodgamma$deviance) / globalmodgamma$null.deviance # (thomas et al., 2015)
pseudoR # 0.02875861
R2 <- cor(durationtotal$Duration.sec, predict(globalmodgamma))^2
R2 # 0.02459197

# 1) Errors normally distributed? - NO BUT IMPROVED
# NOTE(review): glm()$residuals are working residuals; consider rstandard() for Gamma fits.
sresid <- (globalmodgamma$residuals - mean(globalmodgamma$residuals)) / sd(globalmodgamma$residuals)
hist(sresid)
shapiro.test(sresid)
ks.test(sresid, pnorm)
par(mfrow = c(2, 2))
plot(globalmodgamma)
par(mfrow = c(1, 1))
# P2 Q-Q points pull up on right side, sresid histogram positively skewed, normality tests p<0.01.

# 2) Homogenous/homoscedasticity variance of residuals - NO BUT IMPROVED
par(mfrow = c(2, 2))
plot(globalmodgamma)
par(mfrow = c(1, 1))
plot(sresid ~ globalmodgamma$fitted.values, pch = 20, cex = 2, cex.lab = 1.5)
# Variance homogeneity is compared across treatment levels of durationtotal.
# Previously fligner.test grouped by the response and leveneTest used
# testessize$Temperature.oC, a column from a different data set.
plot(sresid ~ durationtotal$Temperature.oC)
fligner.test(sresid ~ durationtotal$Temperature.oC)
leveneTest(sresid ~ durationtotal$Temperature.oC)
# P1 Resids~Fitted and P3 SQRT(Resid~Fitted) show no trend, no real wedge, test <0.01
# NOTE(review): recorded result predates the grouping correction - rerun to confirm.

# 3) Independences of independent variables - YES
# Only one independent variable # would use VIFs as cannot use cor.plots with categories

# 4) No serial auto-correlation with time/space - NO BUT EVERYTHING COLLECTED AT ONCE
#! need library(car)
durbinWatsonTest(globalmodgamma) # Test failed

# 5) No bias by unduly influential datapoints - NOT STRONGLY
influence <- influence.measures(globalmodgamma)
summary(influence)
par(mfrow = c(2, 2))
plot(globalmodgamma)
par(mfrow = c(1, 1))
# C4,14,22 cooks distances 0.29-0.33; confirmed on P4 stdzd resids~leverage,
# still below the Cook's distance thresholds of 1 or 4/n (Fox, 1991)

# 6) Independent variables measured without error - BEST OF ABILITY

## gamma log didnt improve further




###  MODEL REFINEMENT
# Compare the Gamma model with the temperature effect against an
# intercept-only model fitted with the same family and link.

## Global model
globalmodgamma <- glm(Duration.sec ~ Temperature.oC, Gamma(link = "identity"), data = durationtotal)
pseudoR <- (globalmodgamma$null.deviance - globalmodgamma$deviance) / globalmodgamma$null.deviance # (thomas et al., 2015)
pseudoR # 0.02875861 

# AICc = AIC + 2k(k+1)/(n-k-1); k and n are read from each fitted model.
# The earlier expression omitted the 2k penalty, so the two models were
# compared on -2*logLik alone.
AICc <- local({
  k <- attr(logLik(globalmodgamma), "df")
  n <- nobs(globalmodgamma)
  AIC(globalmodgamma) + (2 * k * (k + 1)) / (n - k - 1)
})
AICc # recorded under the old formula (no 2k penalty): 24056.

## Null model
# Same link as the global model so the two fits differ only in the predictor
# (the intercept-only fitted mean is the same under either link).
nullmodgamma <- glm(Duration.sec ~ 1, Gamma(link = "identity"), data = durationtotal) # creating null of just intercept (and random in glmms)

pseudoR <- (nullmodgamma$null.deviance - nullmodgamma$deviance) / nullmodgamma$null.deviance
pseudoR # (thomas et al., 2015) # 0
# The null model has fewer parameters than the global model, so its own k is used.
AICc <- local({
  k <- attr(logLik(nullmodgamma), "df")
  n <- nobs(nullmodgamma)
  AIC(nullmodgamma) + (2 * k * (k + 1)) / (n - k - 1)
})
AICc # recorded under the old formula (no 2k penalty, p = 5): 24156.16

### Global model explains Duration.sec significantly more than null model because
# 1) pseudo R is higher and AIC more than 2 lower 

# 2) anova comparison
anova(globalmodgamma, nullmodgamma, test = "F") #! Use "F" for continuous dependent variables
# Analysis of Deviance Table
# Model 1: Duration.sec ~ Temperature.oC
# Model 2: Duration.sec ~ 1
# Resid. Df Resid. Dev Df Deviance     F   Pr(>F)    
# 1      2769     4329.5                               
# 2      2773     4457.7 -4   -128.2 12.81 2.47e-10 ***

# Single-term deletion F test for the temperature effect.
drop1(
  object = globalmodgamma,
  test = "F"
)
#                Df Deviance   AIC F value    Pr(>F)    
# <none>              4329.5 24069                      
# Temperature.oC  4   4457.7 24112  20.498 < 2.2e-16 ***

# 3) likelihood ratio test (both argument orders give the same test)
library(lmtest)
lrtest(globalmodgamma, nullmodgamma)
lrtest(nullmodgamma, globalmodgamma)
# Likelihood ratio test
# Model 1: Duration.sec ~ 1
# Model 2: Duration.sec ~ Temperature.oC
# #Df LogLik Df  Chisq Pr(>Chisq)    
# 1   2 -12078                         
# 2   6 -12028  4 99.285  < 2.2e-16 ***

## Complex model is therefore kept (the simpler one would be preferred had the difference been NS)

globalmodgamma # short print of the fit; redundant with summary() below

summary(globalmodgamma)
# Call:
#      glm(formula = Duration.sec ~ Temperature.oC, family = Gamma(link = "identity"), 
#          data = durationtotal)
# 
# Deviance Residuals: 
#      Min       1Q   Median       3Q      Max  
# -2.4660  -1.4160  -0.6709   0.2849   6.7643  
# 
# Coefficients:
#      Estimate Std. Error t value Pr(>|t|)    
# (Intercept)        28.922      1.777  16.279  < 2e-16 ***
# Temperature.oC39   -3.624      2.290  -1.583    0.114    
# Temperature.oC40   -3.388      2.288  -1.481    0.139    
# Temperature.oC41    3.937      3.198   1.231    0.218    
# Temperature.oC42   26.927      6.855   3.928 8.78e-05 ***
#      ---
#      Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# (Dispersion parameter for Gamma family taken to be 2.501908)
# 
# Null deviance: 4457.7  on 2773  degrees of freedom
# Residual deviance: 4329.5  on 2769  degrees of freedom
# AIC: 24069
# 
# Number of Fisher Scoring iterations: 3





# Descriptive statistics of total duration per temperature treatment.
describeBy(
  x = durationtotal$Duration.sec,
  group = durationtotal$Temperature.oC,
  mat = TRUE
)
# item group1 vars   n     mean       sd median  trimmed     mad min max range     skew kurtosis       se
# X11    1     30    1 663 28.92157 45.59499   15.0 21.23352 17.7912   1 633   632 6.743149 70.43835 1.770763
# X12    2     39    1 767 25.29726 33.80740   11.0 18.50894 13.3434   1 301   300 2.909700 12.81313 1.220714
# X13    3     40    1 784 25.53316 48.87277   11.0 16.49045 11.8608   1 694   693 7.503471 83.00674 1.745456
# X14    4     41    1 382 32.85864 42.23483   16.5 24.90196 20.0151   1 348   347 3.130536 14.66582 2.160922
# X15    5     42    1 178 55.84831 85.64395   24.0 36.70139 26.6868   1 645   644 3.277496 14.47926 6.419286
# Group means only.
tapply(
  X = durationtotal$Duration.sec,
  INDEX = durationtotal$Temperature.oC,
  FUN = mean
)
# 30       39       40       41       42 
# 28.92157 25.29726 25.53316 32.85864 55.84831 




# Pairwise comparison: all-pairs (Tukey) contrasts between temperature levels.
library(multcomp)
summary(
  glht(
    model = globalmodgamma,
    linfct = mcp(Temperature.oC = "Tukey")
  )
)
# Simultaneous Tests for General Linear Hypotheses
# 
# Multiple Comparisons of Means: Tukey Contrasts
# 
# 
# Fit: glm(formula = Duration.sec ~ Temperature.oC, family = Gamma(link = "identity"), 
#          data = durationtotal)
# 
# Linear Hypotheses:
#      Estimate Std. Error z value Pr(>|z|)    
# 39 - 30 == 0  -3.6243     2.2900  -1.583  0.47811    
# 40 - 30 == 0  -3.3884     2.2884  -1.481  0.54559    
# 41 - 30 == 0   3.9371     3.1981   1.231  0.70953    
# 42 - 30 == 0  26.9267     6.8554   3.928  < 0.001 ***
# 40 - 39 == 0   0.2359     2.0416   0.116  0.99995    
# 41 - 39 == 0   7.5614     3.0264   2.498  0.07894 .  
# 42 - 39 == 0  30.5511     6.7770   4.508  < 0.001 ***
# 41 - 40 == 0   7.3255     3.0252   2.421  0.09560 .  
# 42 - 40 == 0  30.3152     6.7765   4.474  < 0.001 ***
# 42 - 41 == 0  22.9897     7.1352   3.222  0.00942 ** 


# Least-squares means per temperature with Tukey-adjusted pairwise contrasts.
library(lsmeans)
lsmeans(
  object = globalmodgamma,
  specs = pairwise ~ Temperature.oC,
  adjust = "tukey"
)
# Temperature.oC   lsmean       SE df asymp.LCL asymp.UCL
# 30             28.92157 1.776645 NA  25.43941  32.40373
# 39             25.29726 1.444813 NA  22.46548  28.12904
# 40             25.53316 1.442389 NA  22.70613  28.36019
# 41             32.85864 2.659216 NA  27.64667  38.07061
# 42             55.84831 6.621186 NA  42.87103  68.82560
# 
# Results are given on the identity (not the response) scale. 
# Confidence level used: 0.95 
# 
# $contrasts
# contrast    estimate       SE df z.ratio p.value
# 30 - 39    3.6243066 2.289968 NA   1.583  0.5085
# 30 - 40    3.3884054 2.288439 NA   1.481  0.5751
# 30 - 41   -3.9370701 3.198108 NA  -1.231  0.7332
# 30 - 42  -26.9267460 6.855404 NA  -3.928  0.0008 <--
# 39 - 40   -0.2359012 2.041560 NA  -0.116  1.0000
# 39 - 41   -7.5613767 3.026369 NA  -2.498  0.0909
# 39 - 42  -30.5510525 6.776989 NA  -4.508  0.0001 <--
# 40 - 41   -7.3254755 3.025213 NA  -2.421  0.1095
# 40 - 42  -30.3151513 6.776473 NA  -4.474  0.0001 <--
# 41 - 42  -22.9896759 7.135231 NA  -3.222  0.0112 <--




#################################################################################################################################################################### DURATIONMATE ### NEW METHOD: USE GLM WITH NON-GAUSSIAN ERROR STRUCTURE######################################################

############### ! NATURE COMM PAPER MATING DURATION MODEL REFINEMENT ##############

str(durationmate)

#### Gaussian error 
# Data is a right skewed continuous variable; fitting a normal distribution does not give normality and homogeneity of variance in residuals 

describeBy(durationmate$Duration.sec, durationmate$Temperature.oC)

#### Creating a global model gaussian identity
# Gaussian GLM (identity link) for mating duration by temperature treatment.
globalmodgauss <- glm(Duration.sec ~ Temperature.oC, gaussian(link = "identity"), data = durationmate)

summary(globalmodgauss) # No R^2, AIC 8132.5
# Small-sample corrected AIC: AICc = AIC + 2k(k+1)/(n-k-1), with k (estimated
# parameters, as counted by logLik) and n read from the fitted model.
# The earlier expression omitted the 2k penalty term.
# qAICc <- (-2 * logLik(model1) / Theta) + 2 * p + (2 * p * (p + 1)) / (n - p - 1)
AICc <- local({
  k <- attr(logLik(globalmodgauss), "df")
  n <- nobs(globalmodgauss)
  AIC(globalmodgauss) + (2 * k * (k + 1)) / (n - k - 1)
})
AICc # recorded under the old formula (no 2k penalty): 8120.583
pseudoR <- (globalmodgauss$null.deviance - globalmodgauss$deviance) / globalmodgauss$null.deviance # (thomas et al., 2015)
pseudoR # 0.04405925
R2 <- cor(durationmate$Duration.sec, predict(globalmodgauss))^2
R2 # 0.04405925



# 1) Errors normally distributed? - NO
sresid <- (globalmodgauss$residuals - mean(globalmodgauss$residuals)) / sd(globalmodgauss$residuals)
hist(sresid)
shapiro.test(sresid)
ks.test(sresid, pnorm)
par(mfrow = c(2, 2))
plot(globalmodgauss)
par(mfrow = c(1, 1))
# P2 Q-Q points pull up on right side, sresid histogram positively skewed, normality tests p<0.01. 

# 2) Homogenous/homoscedasticity variance of residuals - YES
par(mfrow = c(2, 2))
plot(globalmodgauss)
par(mfrow = c(1, 1))
plot(sresid ~ globalmodgauss$fitted.values, pch = 20, cex = 2, cex.lab = 1.5)
# Variance homogeneity is compared across treatment levels. These calls
# previously grouped by the response (Duration.sec), which is not a grouping
# factor; leveneTest() needs a factor on the right-hand side.
plot(sresid ~ durationmate$Temperature.oC)
fligner.test(sresid ~ durationmate$Temperature.oC)
leveneTest(sresid ~ durationmate$Temperature.oC)
# P1 Resids~Fitted and P3 SQRT(Resid~Fitted) show some slope, test passed
# NOTE(review): recorded result predates the grouping correction - rerun to confirm.

# 3) Independences of independent variables - YES
# Only one independent variable # would use VIFs as cannot use cor.plots with categories

# 4) No serial auto-correlation with time/space - NO
#! need library(car)
durbinWatsonTest(globalmodgauss) # Test failed

# 5) No bias by unduly influential datapoints - NOT STRONGLY
influence <- influence.measures(globalmodgauss)
summary(influence)
par(mfrow = c(2, 2))
plot(globalmodgauss)
par(mfrow = c(1, 1))
# some low outliers listed
# still below the Cook's distance thresholds of 1 or 4/n (Fox, 1991)

# 6) Independent variables measured without error - BEST OF ABILITY



#### Creating a global model gamma 
# Gamma GLM (identity link) for mating duration by temperature treatment.
globalmodgamma <- glm(Duration.sec ~ Temperature.oC, Gamma(link = "identity"), data = durationmate)

summary(globalmodgamma) # No R^2, AIC 7355.2
# Small-sample corrected AIC: AICc = AIC + 2k(k+1)/(n-k-1), with k (estimated
# parameters, as counted by logLik) and n read from the fitted model.
# The earlier expression omitted the 2k penalty term.
# qAICc <- (-2 * logLik(model1) / Theta) + 2 * p + (2 * p * (p + 1)) / (n - p - 1)
AICc <- local({
  k <- attr(logLik(globalmodgamma), "df")
  n <- nobs(globalmodgamma)
  AIC(globalmodgamma) + (2 * k * (k + 1)) / (n - k - 1)
})
AICc # recorded under the old formula (no 2k penalty): 7343.318
pseudoR <- (globalmodgamma$null.deviance - globalmodgamma$deviance) / globalmodgamma$null.deviance # (thomas et al., 2015)
pseudoR # 0.07386796
R2 <- cor(durationmate$Duration.sec, predict(globalmodgamma))^2
R2 # 0.04405925

# 1) Errors normally distributed? - NO BUT IMPROVED
# NOTE(review): glm()$residuals are working residuals; consider rstandard() for Gamma fits.
sresid <- (globalmodgamma$residuals - mean(globalmodgamma$residuals)) / sd(globalmodgamma$residuals)
hist(sresid)
shapiro.test(sresid)
ks.test(sresid, pnorm)
par(mfrow = c(2, 2))
plot(globalmodgamma)
par(mfrow = c(1, 1))
# P2 Q-Q points pull up on right side, sresid histogram positively skewed, normality tests p<0.01. 

# 2) Homogenous/homoscedasticity variance of residuals - NO BUT IMPROVED
par(mfrow = c(2, 2))
plot(globalmodgamma)
par(mfrow = c(1, 1))
plot(sresid ~ globalmodgamma$fitted.values, pch = 20, cex = 2, cex.lab = 1.5)
# Both variance tests now group by the treatment factor; fligner.test and the
# plot previously used the response (Duration.sec) as the grouping variable.
plot(sresid ~ durationmate$Temperature.oC)
fligner.test(sresid ~ durationmate$Temperature.oC)
leveneTest(sresid ~ durationmate$Temperature.oC)
# no real wedge, test passed
# NOTE(review): the Fligner result predates the grouping correction - rerun to confirm.

# 3) Independences of independent variables - YES
# Only one independent variable # would use VIFs as cannot use cor.plots with categories

# 4) No serial auto-correlation with time/space - NO BUT EVERYTHING COLLECTED AT ONCE
#! need library(car)
durbinWatsonTest(globalmodgamma) # Test failed

# 5) No bias by unduly influential datapoints - NOT STRONGLY
influence <- influence.measures(globalmodgamma)
summary(influence)
par(mfrow = c(2, 2))
plot(globalmodgamma)
par(mfrow = c(1, 1))
# some low outliers
# still below the Cook's distance thresholds of 1 or 4/n (Fox, 1991)

# 6) Independent variables measured without error - BEST OF ABILITY

## gamma log didnt improve further




###  MODEL REFINEMENT
# Compare the Gamma model with the temperature effect against an
# intercept-only model fitted with the same family and link.

## Global model
globalmodgamma <- glm(Duration.sec ~ Temperature.oC, Gamma(link = "identity"), data = durationmate)
pseudoR <- (globalmodgamma$null.deviance - globalmodgamma$deviance) / globalmodgamma$null.deviance # (thomas et al., 2015)
pseudoR # 0.07386796 

# AICc = AIC + 2k(k+1)/(n-k-1); k and n are read from each fitted model.
# The earlier expression omitted the 2k penalty, so the two models were
# compared on -2*logLik alone.
AICc <- local({
  k <- attr(logLik(globalmodgamma), "df")
  n <- nobs(globalmodgamma)
  AIC(globalmodgamma) + (2 * k * (k + 1)) / (n - k - 1)
})
AICc # recorded under the old formula (no 2k penalty): 7343.318

## Null model
nullmodgamma <- glm(Duration.sec ~ 1, Gamma(link = "identity"), data = durationmate) # creating null of just intercept (and random in glmms)

pseudoR <- (nullmodgamma$null.deviance - nullmodgamma$deviance) / nullmodgamma$null.deviance
pseudoR # (thomas et al., 2015) # 0
# The null model has fewer parameters than the global model, so its own k is used.
AICc <- local({
  k <- attr(logLik(nullmodgamma), "df")
  n <- nobs(nullmodgamma)
  AIC(nullmodgamma) + (2 * k * (k + 1)) / (n - k - 1)
})
AICc # recorded under the old formula (no 2k penalty, p = 5): 7401.607

### Global model explains Duration.sec significantly more than null model because
# 1) pseudo R is higher and AIC more than 2 lower 

# 2) anova comparison
anova(globalmodgamma, nullmodgamma, test = "F") #! Use "F" for continuous dependent variables
# Analysis of Deviance Table
# Model 1: Duration.sec ~ Temperature.oC
# Model 2: Duration.sec ~ 1
# Resid. Df Resid. Dev Df Deviance     F    Pr(>F)    
# 1       716     224.95                                
# 2       720     242.89 -4  -17.942 6.907 1.857e-05 ***


############### ! NATURE COMM PAPER MATING DURATION MODEL SIGNIFICANCE ##############


# Single-term deletion F test for the temperature effect.
drop1(
  object = globalmodgamma,
  test = "F"
)
#                Df Deviance   AIC F value    Pr(>F)    
# <none>              224.95 7355.2                      
# Temperature.oC  4   242.89 7374.9  14.277 3.216e-11 ***

# 3) likelihood ratio test (both argument orders give the same test)
library(lmtest)
lrtest(globalmodgamma, nullmodgamma)
lrtest(nullmodgamma, globalmodgamma)
# Likelihood ratio test
# Model 1: Duration.sec ~ 1
# Model 2: Duration.sec ~ Temperature.oC
# #Df  LogLik Df  Chisq Pr(>Chisq)    
# 1   2 -3700.8                         
# 2   6 -3671.6  4 58.289  6.637e-12 ***

## Complex model is therefore kept (the simpler one would be preferred had the difference been NS)

globalmodgamma # short print of the fit; redundant with summary() below


############### ! NATURE COMM PAPER MATING DURATION MODEL POST HOC ##############


summary(globalmodgamma)
# Call:
#      glm(formula = Duration.sec ~ Temperature.oC, family = Gamma(link = "identity"), 
#          data = durationmate)
# 
# Deviance Residuals: 
#      Min       1Q   Median       3Q      Max  
# -1.0169  -0.4633  -0.2364   0.0658   3.3043  
# 
# Coefficients:
#                    Estimate Std. Error t value Pr(>|t|)    
# (Intercept)        75.798      4.578  16.556  < 2e-16 ***
# Temperature.oC39   -3.104      6.302  -0.493  0.62251    
# Temperature.oC40    4.783      6.723   0.711  0.47705    
# Temperature.oC41    3.152      7.415   0.425  0.67090    
# Temperature.oC42   49.859     13.191   3.780  0.00017 ***
#      ---
#      Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# (Dispersion parameter for Gamma family taken to be 0.6494093)
# 
# Null deviance: 242.89  on 720  degrees of freedom
# Residual deviance: 224.95  on 716  degrees of freedom
# AIC: 7355.2
# 
# Number of Fisher Scoring iterations: 3





# Descriptive statistics of mating duration per temperature treatment.
describeBy(
  x = durationmate$Duration.sec,
  group = durationmate$Temperature.oC,
  mat = TRUE
)
#     item group1 vars   n     mean       sd median  trimmed     mad min max range     skew kurtosis       se
# X11    1     30    1 178  75.79775  67.17246     62  63.59722 23.7216  37 633   596 5.514073 36.501850  5.034789
# X12    2     39    1 183  72.69399  39.72093     62  65.17007 22.2390  37 301   264 2.575648  8.793525  2.936255
# X13    3     40    1 174  80.58046  81.47883     57  63.90714 22.2390  36 694   658 5.015251 30.667771  6.176894
# X14    4     41    1 119  78.94958  49.48512     62  69.59794 22.2390  37 348   311 2.923984 10.141001  4.536294
# X15    5     42    1  67 125.65672 107.59612     91 106.78182 62.2692  37 645   608 2.315638  6.915612 13.144959
# Group means only.
tapply(
  X = durationmate$Duration.sec,
  INDEX = durationmate$Temperature.oC,
  FUN = mean
)
# 30        39        40        41        42 
# 75.79775  72.69399  80.58046  78.94958 125.65672 




# Pairwise comparison: all-pairs (Tukey) contrasts between temperature levels.
library(multcomp)
summary(
  glht(
    model = globalmodgamma,
    linfct = mcp(Temperature.oC = "Tukey")
  )
)
# Simultaneous Tests for General Linear Hypotheses
# 
# Multiple Comparisons of Means: Tukey Contrasts
# 
# 
# Fit: glm(formula = Duration.sec ~ Temperature.oC, family = Gamma(link = "identity"), 
#          data = durationmate)
# 
# Linear Hypotheses:
#      Estimate Std. Error z value Pr(>|z|)    
# 39 - 30 == 0   -3.104      6.302  -0.493  0.98719    
# 40 - 30 == 0    4.783      6.723   0.711  0.95079    
# 41 - 30 == 0    3.152      7.415   0.425  0.99268    
# 42 - 30 == 0   49.859     13.191   3.780  0.00132 ** 
# 40 - 39 == 0    7.886      6.556   1.203  0.73706    
# 41 - 39 == 0    6.256      7.264   0.861  0.90532    
# 42 - 39 == 0   52.963     13.107   4.041  < 0.001 ***
# 41 - 40 == 0   -1.631      7.632  -0.214  0.99950    
# 42 - 40 == 0   45.076     13.315   3.385  0.00598 ** 
# 42 - 41 == 0   46.707     13.677   3.415  0.00524 ** 


# Least-squares means per temperature with Tukey-adjusted pairwise contrasts.
library(lsmeans)
lsmeans(
  object = globalmodgamma,
  specs = pairwise ~ Temperature.oC,
  adjust = "tukey"
)
# Temperature.oC    lsmean        SE df asymp.LCL asymp.UCL
# 30              75.79775  4.578314 NA  66.82442  84.77108
# 39              72.69399  4.330442 NA  64.20648  81.18150
# 40              80.58046  4.922825 NA  70.93190  90.22902
# 41              78.94958  5.832243 NA  67.51859  90.38057
# 42             125.65672 12.371079 NA 101.40985 149.90359
# 
# Results are given on the identity (not the response) scale. 
# Confidence level used: 0.95 
# 
# $contrasts
# contrast   estimate        SE df z.ratio p.value
# 30 - 39    3.103764  6.301880 NA   0.493  0.9881
# 30 - 40   -4.782707  6.722735 NA  -0.711  0.9540
# 30 - 41   -3.151827  7.414582 NA  -0.425  0.9932
# 30 - 42  -49.858964 13.191079 NA  -3.780  0.0015 <--
# 39 - 40   -7.886471  6.556442 NA  -1.203  0.7498
# 39 - 41   -6.255591  7.264144 NA  -0.861  0.9110
# 39 - 42  -52.962727 13.107109 NA  -4.041  0.0005 <--
# 40 - 41    1.630880  7.632120 NA   0.214  0.9995
# 40 - 42  -45.076257 13.314571 NA  -3.385  0.0064 <--
# 41 - 42  -46.707137 13.676939 NA  -3.415  0.0058 <--





#################################################################################################################################################################### DURATIONMOUNT ### NEW METHOD: USE GLM WITH NON-GAUSSIAN ERROR STRUCTURE######################################################

str(durationmount)

#### Gaussian error 
# Data is a right skewed continuous variable; fitting a normal distribution does not give normality and homogeneity of variance in residuals 


#### Creating a global model gaussian identity
# Gaussian GLM (identity link) for mounting duration by temperature treatment.
globalmodgauss <- glm(Duration.sec ~ Temperature.oC, gaussian(link = "identity"), data = durationmount)

summary(globalmodgauss) # No R^2, AIC 14899
# Small-sample corrected AIC: AICc = AIC + 2k(k+1)/(n-k-1), with k (estimated
# parameters, as counted by logLik) and n read from the fitted model.
# The earlier expression omitted the 2k penalty term.
# qAICc <- (-2 * logLik(model1) / Theta) + 2 * p + (2 * p * (p + 1)) / (n - p - 1)
AICc <- local({
  k <- attr(logLik(globalmodgauss), "df")
  n <- nobs(globalmodgauss)
  AIC(globalmodgauss) + (2 * k * (k + 1)) / (n - k - 1)
})
AICc # recorded under the old formula (no 2k penalty): 14886.99
pseudoR <- (globalmodgauss$null.deviance - globalmodgauss$deviance) / globalmodgauss$null.deviance # (thomas et al., 2015)
pseudoR # 0.01359914
R2 <- cor(durationmount$Duration.sec, predict(globalmodgauss))^2
R2 # 0.01359914



# 1) Errors normally distributed? - NO
sresid <- (globalmodgauss$residuals - mean(globalmodgauss$residuals)) / sd(globalmodgauss$residuals)
hist(sresid)
shapiro.test(sresid)
ks.test(sresid, pnorm)
par(mfrow = c(2, 2))
plot(globalmodgauss)
par(mfrow = c(1, 1))
# P2 Q-Q points pull up on right side, sresid histogram positively skewed, normality tests p<0.01. 

# 2) Homogenous/homoscedasticity variance of residuals - NO
par(mfrow = c(2, 2))
plot(globalmodgauss)
par(mfrow = c(1, 1))
plot(sresid ~ globalmodgauss$fitted.values, pch = 20, cex = 2, cex.lab = 1.5)
# Variance homogeneity is compared across treatment levels. These calls
# previously grouped by the response (Duration.sec), which is not a grouping
# factor; leveneTest() needs a factor on the right-hand side.
plot(sresid ~ durationmount$Temperature.oC)
fligner.test(sresid ~ durationmount$Temperature.oC)
leveneTest(sresid ~ durationmount$Temperature.oC)
# P1 Resids~Fitted and P3 SQRT(Resid~Fitted) no wedge some trend, test <0.01
# NOTE(review): recorded result predates the grouping correction - rerun to confirm.

# 3) Independences of independent variables - YES
# Only one independent variable # would use VIFs as cannot use cor.plots with categories

# 4) No serial auto-correlation with time/space - NO
#! need library(car)
durbinWatsonTest(globalmodgauss) # Test failed

# 5) No bias by unduly influential datapoints - NOT STRONGLY
influence <- influence.measures(globalmodgauss)
summary(influence)
par(mfrow = c(2, 2))
plot(globalmodgauss)
par(mfrow = c(1, 1))
# many low outliers listed
# still below the Cook's distance thresholds of 1 or 4/n (Fox, 1991)

# 6) Independent variables measured without error - BEST OF ABILITY




#### Creating a global model gamma 
# Gamma GLM (identity link) for mounting duration by temperature treatment.
globalmodgamma <- glm(Duration.sec ~ Temperature.oC, Gamma(link = "identity"), data = durationmount)

summary(globalmodgamma) # No R^2, AIC 13808
# Small-sample corrected AIC: AICc = AIC + 2k(k+1)/(n-k-1).
# k and n are read from the fitted model; the earlier expression hard-coded
# n = 2774 (the durationtotal sample size) and omitted the 2k penalty.
# qAICc <- (-2 * logLik(model1) / Theta) + 2 * p + (2 * p * (p + 1)) / (n - p - 1)
AICc <- local({
  k <- attr(logLik(globalmodgamma), "df")
  n <- nobs(globalmodgamma)
  AIC(globalmodgamma) + (2 * k * (k + 1)) / (n - k - 1)
})
AICc # NOTE(review): the value noted here before (24056.87) is the durationtotal one - rerun
pseudoR <- (globalmodgamma$null.deviance - globalmodgamma$deviance) / globalmodgamma$null.deviance # (thomas et al., 2015)
pseudoR # 0.01133
R2 <- cor(durationmount$Duration.sec, predict(globalmodgamma))^2
R2 # NOTE(review): the value noted here before (0.02459197) matches the durationtotal fit - rerun

# 1) Errors normally distributed? - NO BUT IMPROVED
# NOTE(review): glm()$residuals are working residuals; consider rstandard() for Gamma fits.
sresid <- (globalmodgamma$residuals - mean(globalmodgamma$residuals)) / sd(globalmodgamma$residuals)
hist(sresid)
shapiro.test(sresid)
ks.test(sresid, pnorm)
par(mfrow = c(2, 2))
plot(globalmodgamma)
par(mfrow = c(1, 1))
# P2 Q-Q points pull up on right side, sresid histogram positively skewed, normality tests p<0.01. 

# 2) Homogenous/homoscedasticity variance of residuals - NO BUT IMPROVED
par(mfrow = c(2, 2))
plot(globalmodgamma)
par(mfrow = c(1, 1))
plot(sresid ~ globalmodgamma$fitted.values, pch = 20, cex = 2, cex.lab = 1.5)
# Variance homogeneity is compared across treatment levels of durationmount.
# Previously fligner.test grouped by the response and leveneTest used
# testessize$Temperature.oC, a column from a different data set.
plot(sresid ~ durationmount$Temperature.oC)
fligner.test(sresid ~ durationmount$Temperature.oC)
leveneTest(sresid ~ durationmount$Temperature.oC)
# P1 Resids~Fitted and P3 SQRT(Resid~Fitted) show no trend, no real wedge, test <0.01
# NOTE(review): recorded result predates the grouping correction - rerun to confirm.

# 3) Independences of independent variables - YES
# Only one independent variable # would use VIFs as cannot use cor.plots with categories

# 4) No serial auto-correlation with time/space - NO BUT EVERYTHING COLLECTED AT ONCE
#! need library(car)
durbinWatsonTest(globalmodgamma) # Test failed

# 5) No bias by unduly influential datapoints - NOT STRONGLY
influence <- influence.measures(globalmodgamma)
summary(influence)
par(mfrow = c(2, 2))
plot(globalmodgamma)
par(mfrow = c(1, 1))
# C4,14,22 cooks distances 0.29-0.33; confirmed on P4 stdzd resids~leverage,
# still below the Cook's distance thresholds of 1 or 4/n (Fox, 1991)
# NOTE(review): the point labels above repeat the durationtotal section verbatim - confirm for this fit.

# 6) Independent variables measured without error - BEST OF ABILITY

## gamma log didnt improve further




###  MODEL REFINEMENT
# Compare the Gamma model with the temperature effect against an
# intercept-only model fitted with the same family and link.

## Global model
globalmodgamma <- glm(Duration.sec ~ Temperature.oC, Gamma(link = "identity"), data = durationmount)
pseudoR <- (globalmodgamma$null.deviance - globalmodgamma$deviance) / globalmodgamma$null.deviance # (thomas et al., 2015)
pseudoR # 0.01133 

# AICc = AIC + 2k(k+1)/(n-k-1); k and n are read from each fitted model.
# The earlier expression omitted the 2k penalty, so the two models were
# compared on -2*logLik alone.
AICc <- local({
  k <- attr(logLik(globalmodgamma), "df")
  n <- nobs(globalmodgamma)
  AIC(globalmodgamma) + (2 * k * (k + 1)) / (n - k - 1)
})
AICc # recorded under the old formula (no 2k penalty): 13796.01

## Null model
nullmodgamma <- glm(Duration.sec ~ 1, Gamma(link = "identity"), data = durationmount) # creating null of just intercept (and random in glmms)

pseudoR <- (nullmodgamma$null.deviance - nullmodgamma$deviance) / nullmodgamma$null.deviance
pseudoR # (thomas et al., 2015) # 0
# The null model has fewer parameters than the global model, so its own k is used.
AICc <- local({
  k <- attr(logLik(nullmodgamma), "df")
  n <- nobs(nullmodgamma)
  AIC(nullmodgamma) + (2 * k * (k + 1)) / (n - k - 1)
})
AICc # recorded under the old formula (no 2k penalty, p = 5): 13822.36

### Global model explains Duration.sec significantly more than null model because
# 1) pseudo R is higher and AIC more than 2 lower 

# 2) anova comparison
anova(globalmodgamma, nullmodgamma, test = "F") #! Use "F" for continuous dependent variables
# Analysis of Deviance Table
# Model 1: Duration.sec ~ Temperature.oC
# Model 2: Duration.sec ~ 1
# Resid. Df Resid. Dev Df Deviance     F    Pr(>F)    
# 1      2048     1635.7                                
# 2      2052     1654.5 -4  -18.745 6.704 2.334e-05 ***

# Single-term deletion F test for the temperature effect.
drop1(
  object = globalmodgamma,
  test = "F"
)
#                 Df Deviance   AIC F value    Pr(>F)    
# <none>              1635.7 13808                      
# Temperature.oC  4   1654.5 13827  5.8674 0.0001079 ***

# 3) likelihood ratio test (both argument orders give the same test)
library(lmtest)
lrtest(globalmodgamma, nullmodgamma)
lrtest(nullmodgamma, globalmodgamma)
# Likelihood ratio test
# Model 1: Duration.sec ~ 1
# Model 2: Duration.sec ~ Temperature.oC
# #Df  LogLik Df  Chisq Pr(>Chisq)    
# 1   2 -6911.2                         
# 2   6 -6898.0  4 26.357  2.681e-05 ***

## Complex model is therefore kept (the simpler one would be preferred had the difference been NS)

globalmodgamma # short print of the fit; redundant with summary() below

summary(globalmodgamma)
# Call:
#      glm(formula = Duration.sec ~ Temperature.oC, family = Gamma(link = "identity"), 
#          data = durationmount)
# 
# Deviance Residuals: 
#      Min       1Q   Median       3Q      Max  
# -1.8391  -0.9921  -0.3752   0.4590   1.6064  
# 
# Coefficients:
#                        Estimate Std. Error t value Pr(>|t|)    
#      (Intercept)       11.7175     0.4448  26.340  < 2e-16 ***
#      Temperature.oC39  -1.2723     0.5731  -2.220 0.026532 *  
#      Temperature.oC40  -1.8864     0.5556  -3.395 0.000698 ***
#      Temperature.oC41   0.2863     0.7621   0.376 0.707240    
#      Temperature.oC42   1.9942     1.1755   1.696 0.089963 .  
# ---
#      Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# (Dispersion parameter for Gamma family taken to be 0.6990296)
# 
# Null deviance: 1654.5  on 2052  degrees of freedom
# Residual deviance: 1635.7  on 2048  degrees of freedom
# AIC: 13808
# 
# Number of Fisher Scoring iterations: 3





# Descriptive statistics of mounting duration per temperature treatment.
describeBy(
  x = durationmount$Duration.sec,
  group = durationmount$Temperature.oC,
  mat = TRUE
)
# item group1 vars   n      mean       sd median   trimmed     mad min max range      skew   kurtosis
# X11    1     30    1 485 11.717526 9.414941      8 10.655527  7.4130   1  35    34 0.7855433 -0.5984272
# X12    2     39    1 584 10.445205 9.113375      7  9.136752  5.9304   1  35    34 1.0201137 -0.1386174
# X13    3     40    1 610  9.831148 8.506504      6  8.532787  5.9304   1  35    34 1.1396994  0.4574904
# X14    4     41    1 263 12.003802 9.466493      9 10.952607  8.8956   1  35    34 0.7940098 -0.5442860
# X15    5     42    1 111 13.711712 9.822412     10 12.887640 10.3782   1  35    34 0.5766030 -0.8979326
# se
# X11 0.4275105
# X12 0.3771141
# X13 0.3444183
# X14 0.5837290
# X15 0.9323021
# Group means only.
tapply(
  X = durationmount$Duration.sec,
  INDEX = durationmount$Temperature.oC,
  FUN = mean
)
# 30        39        40        41        42 
# 11.717526 10.445205  9.831148 12.003802 13.711712 




# Pairwise comparison: all-pairs (Tukey) contrasts between temperature levels.
library(multcomp)
summary(
  glht(
    model = globalmodgamma,
    linfct = mcp(Temperature.oC = "Tukey")
  )
)
# Simultaneous Tests for General Linear Hypotheses
# 
# Multiple Comparisons of Means: Tukey Contrasts
# 
# 
# Fit: glm(formula = Duration.sec ~ Temperature.oC, family = Gamma(link = "identity"), 
#          data = durationmount)
# 
# Linear Hypotheses:
#      Estimate Std. Error z value Pr(>|z|)   
# 39 - 30 == 0  -1.2723     0.5731  -2.220  0.15855   
# 40 - 30 == 0  -1.8864     0.5556  -3.395  0.00551 **
# 41 - 30 == 0   0.2863     0.7621   0.376  0.99536   
# 42 - 30 == 0   1.9942     1.1755   1.696  0.41465   
# 40 - 39 == 0  -0.6141     0.4913  -1.250  0.70499   
# 41 - 39 == 0   1.5586     0.7166   2.175  0.17468   
# 42 - 39 == 0   3.2665     1.1466   2.849  0.03174 * 
# 41 - 40 == 0   2.1727     0.7027   3.092  0.01516 * 
# 42 - 40 == 0   3.8806     1.1379   3.410  0.00538 **
# 42 - 41 == 0   1.7079     1.2518   1.364  0.63142  


# Least-squares means per temperature with Tukey-adjusted pairwise contrasts.
library(lsmeans)
lsmeans(
  object = globalmodgamma,
  specs = pairwise ~ Temperature.oC,
  adjust = "tukey"
)
# Temperature.oC    lsmean        SE df asymp.LCL asymp.UCL
# 30             11.717526 0.4448492 NA 10.845637  12.58941
# 39             10.445205 0.3613752 NA  9.736923  11.15349
# 40              9.831148 0.3328029 NA  9.178866  10.48343
# 41             12.003802 0.6188548 NA 10.790869  13.21674
# 42             13.711712 1.0881229 NA 11.579030  15.84439
# 
# Results are given on the identity (not the response) scale. 
# Confidence level used: 0.95 
# 
# $contrasts
# contrast   estimate        SE df z.ratio p.value
# 30 - 39   1.2723203 0.5731342 NA   2.220  0.1722
# 30 - 40   1.8863782 0.5555615 NA   3.395  0.0062 <--
# 30 - 41  -0.2862765 0.7621496 NA  -0.376  0.9958
# 30 - 42  -1.9941859 1.1755434 NA  -1.696  0.4361
# 39 - 40   0.6140579 0.4912736 NA   1.250  0.7219
# 39 - 41  -1.5585968 0.7166403 NA  -2.175  0.1892
# 39 - 42  -3.2665062 1.1465616 NA  -2.849  0.0355 <--
# 40 - 41  -2.1726547 0.7026656 NA  -3.092  0.0170 <--
# 40 - 42  -3.8805642 1.1378793 NA  -3.410  0.0059 <--
# 41 - 42  -1.7079094 1.2517958 NA  -1.364  0.6506

