

# Start from a clean session: list whatever is left in the workspace from the
# previous run, then remove all of it.
ls()
rm(list = ls())

# Report the current working directory, then switch to the folder that holds
# the input data files.
getwd()
setwd("C:/Users/UEA/Documents/Neat data/nature heatwave")

# Load the two input tables.
# survivalave: per-line averages used in the survival analysis. The 4 replicates
# of each line are averaged to avoid pseudoreplication, because coding a random
# factor in survival analysis is difficult.
survivalave <- read.csv(file = "transgenlifeave.csv", header = TRUE)
# survivalplot: table laid out for drawing the survival curve in ggplot2.
survivalplot <- read.csv(file = "transgenlifeplot.csv", header = TRUE)



#### DATA CHECK, CLEAN, DESCRIPTION AND SUMMARY #########################################################################



### Checking structure, types and missing values

# Structure of the table used for the survival analysis.
str(survivalave)
# Recorded output:
# 'data.frame':	57 obs. of  4 variables:
# $ Line                     : int  1 2 3 4 5 7 8 9 10 11 ...     # individual line
# $ Paternal.temperature.oC  : int  30 30 30 30 30 30 30 30 30 30 ... # paternal heatwave temperature
# $ Mean.line.round.longevity: num  66 71 76 72 70.7 ...           # mean longevity of the line (average of 4 replicates)
# $ Censor                   : int  1 1 1 1 1 1 1 1 1 1 ...        # whether death is known


# Structure of the table arranged for plotting the survival curve in ggplot2.
str(survivalplot)
# Recorded output:
# 'data.frame':	62 obs. of  6 variables:
# $ weeks    : int  0 4 8 12 16 20 24 28 32 36 ...      # time passed
# $ linesize : int  28 28 28 28 28 28 28 28 28 27 ...   # how many lines are extant at a time point
# $ lineprop : num  1 1 1 1 1 1 1 1 1 0.964 ...         # proportion of lines still extant
# $ temptreat: int  30 30 30 30 30 30 30 30 30 30 ...   # paternal heatwave temperature



# Treatment is a grouping variable, so store it as a factor in both tables.
survivalave$Paternal.temperature.oC <- as.factor(survivalave$Paternal.temperature.oC)
survivalplot$temptreat <- as.factor(survivalplot$temptreat)
# Survival time must be numeric for the survival analysis.
survivalave$Mean.line.round.longevity <- as.numeric(survivalave$Mean.line.round.longevity)


# Count missing values per column. This replaces printing the full logical
# matrix from is.na(), which had to be scanned by eye. All zeros = no NAs.
colSums(is.na(survivalave)) # recorded result of the original check: no NAs

levels(survivalave$Paternal.temperature.oC)
# "30" "40"

############################# DATA EXPLORATION #######################################################################################


########### ! NAT COMMS DESCRIPTIVE STATS ##################

# describeBy() lives in the psych package, which was only mentioned in a
# comment and never attached; attach it so this call runs in a fresh session.
library(psych)

# Per-treatment summary of mean line longevity.
# Columns: vars, n, mean, sd, median, trimmed, mad, min, max, range, skew, kurtosis, se
describeBy(survivalave$Mean.line.round.longevity, survivalave$Paternal.temperature.oC)
# Recorded output:
# $`30`
#    vars   n mean sd    median  trimmed mad min  max range skew   kurtosis  se
# X1    1 28 66.08 13.22  68.83   66.68 8.4 34.67  96 61.33 -0.49     0.27 2.5
#
# $`40`
#    vars    n  mean sd     median trimmed mad   min max  range skew kurtosis  se
# X1    1   29 57.79 15.38     53   57.16 14.83  31  93    62  0.45    -0.79 2.86



###################### ! NAT COMMS FIGURE 4 A PLOT #########################################

# Quick look at the plotting table before drawing.
names(survivalplot)
str(survivalplot)
str(survivalplot$temptreat)

library(ggplot2)

# Scale title "Temperature (<degree sign>C)", built as a plotmath expression.
temp <- expression(paste("Temperature (", degree, "C)"))

# Figure 4A: step-line survivorship curve, one line per paternal treatment
# (black and tomato, both solid). The legend is switched off in theme().
graphsurv <- ggplot(
  data = survivalplot,
  mapping = aes(x = weeks, y = lineprop, group = temptreat, colour = temptreat)
) +
  geom_step(mapping = aes(linetype = temptreat), lwd = 1.5) +
  scale_color_manual(values = c("black", "tomato"), name = temp) +
  scale_linetype_manual(values = c(1, 1), name = temp) +
  # Base theme first, then the overrides below.
  theme_classic() +
  theme(
    legend.position = "none",
    axis.line.x = element_line(color = "black", size = 1),
    axis.line.y = element_line(color = "black", size = 1),
    axis.text.x = element_text(color = "black", size = 12),
    axis.text.y = element_text(color = "black", size = 12),
    axis.title.x = element_text(
      face = "bold", size = 12, color = "black",
      margin = margin(t = 10, r = 0, b = 0, l = 0)
    ),
    axis.title.y = element_text(
      face = "bold", size = 12, color = "black",
      margin = margin(t = 0, r = 10, b = 0, l = 0)
    ),
    panel.background = element_blank(),
    plot.background = element_rect(fill = "transparent", colour = NA)
  ) +
  # Axis titles.
  labs(
    x = "Time passed (weeks)",
    y = "Proportion of families surviving"
  ) +
  # Visible axis window, with tick positions and no extra padding.
  coord_cartesian(ylim = c(-0.02, 1.02), xlim = c(0, 130)) +
  scale_y_continuous(breaks = seq(0, 1, 0.2), expand = c(0, 0)) +
  scale_x_continuous(breaks = seq(0, 120, 30), expand = c(0, 0))

# Save Figure 4A to the Desktop.
# The plot object is passed explicitly instead of relying on ggsave()'s default
# of last_plot(), and the destination is given as a full path. This removes
# the setwd() round trip, which left the session in an unrelated
# "transgenfert" project folder; the working directory is now left untouched.
ggsave(
  filename = file.path("C:/Users/UEA/Desktop", "graphsurv.png"),
  plot = graphsurv,
  width = 6, height = 4.5, dpi = 300, bg = "transparent"
)


###################### ! NAT COMMS FIGURE 4B PLOT #########################################

# Figure 4B: boxplot of mean family longevity per paternal treatment, with the
# group mean as a solid point and the individual line means as open, jittered
# points.
# To colour only the outlines, map `colour` instead of `fill` and swap
# scale_fill_manual() for scale_color_manual().
graphlifebox <- ggplot(
  survivalave,
  aes(
    x = Paternal.temperature.oC,
    y = Mean.line.round.longevity,
    fill = Paternal.temperature.oC
  )
) +
  geom_boxplot(
    notch = FALSE,         # FALSE spelled out; `F` can be reassigned
    outlier.shape = NA,    # boxplot draws no outlier markers; geom_jitter() below shows every point
    width = 0.5,
    lwd = 0.5,
    fatten = 0.5,
    color = "black",
    position = position_dodge(0.5)
  ) +
  # Group mean. `fun` replaces the deprecated `fun.y` argument (ggplot2 >= 3.3.0).
  stat_summary(
    fun = "mean", geom = "point", size = 2,
    position = position_dodge(1), color = "black"
  ) +
  scale_fill_manual(
    values = c("ghostwhite", "tomato"),  # box fill colours
    name = temp,                         # legend title (temperature label)
    breaks = c("30", "40"),              # order of the legend entries
    labels = c("Control", "Heatwave")    # full argument name; was the partially matched `label`
  ) +
  # Raw data points, jittered so they do not sit on top of each other.
  geom_jitter(shape = 1, size = 0.75, position = position_jitter(0.15)) +
  labs(x = "Paternal heatwave treatment", y = "Mean longevity of family (weeks)") +
  scale_x_discrete(
    breaks = c("30", "40"),              # order of the groups on the x axis
    labels = c("Control", "Heatwave")    # names shown on the x axis
  ) +
  coord_cartesian(ylim = c(-2.5, 102.5)) +  # visible y range
  scale_y_continuous(
    breaks = seq(0, 100, 25),  # ticks from 0 to 100 in steps of 25
    expand = c(0, 0)           # no padding beyond the limits
  ) +
  theme_classic() +
  theme(
    text = element_text(size = 17.5),   # base text size
    panel.grid.major = element_blank(), # no major gridlines
    panel.border = element_blank(),     # no panel border
    panel.grid.minor = element_blank(), # no minor gridlines
    axis.line.x = element_line(color = "black", size = 1),
    axis.line.y = element_line(color = "black", size = 1),
    axis.text.x = element_text(color = "black", size = 7),
    axis.text.y = element_text(color = "black", size = 7),
    axis.title.x = element_text(
      face = "bold", size = 7, color = "black",
      margin = margin(t = 2.5, r = 0, b = 0, l = 0)
    ),
    axis.title.y = element_text(
      face = "bold", size = 7, color = "black",
      margin = margin(t = 0, r = 2.5, b = 0, l = 0)
    ),
    legend.position = "none",           # delete this line to show the legend
    panel.background = element_blank(),
    plot.background = element_rect(fill = "transparent", colour = NA)
  )

# Save Figure 4B to the Desktop.
# The plot object is passed explicitly instead of relying on ggsave()'s default
# of last_plot(), and the destination is given as a full path, so the working
# directory no longer has to be changed and changed back around the save.
ggsave(
  filename = file.path("C:/Users/UEA/Desktop", "translifebox.png"),
  plot = graphlifebox,
  width = 2, height = 2, dpi = 300, bg = "transparent"
)






######################################################################################################################################### SURVIVAL AVERAGE PLOTTING RAW DATA DISTRIBUTION AND TESTING NORMALITY AND HOMOGENIETY OF VARIANCES  ###############################

names(survivalave)
levels(survivalave$Paternal.temperature.oC)
str(survivalave)

### Base-graphics histograms of mean line longevity, one per treatment.
# Axis labels now name the plotted variable (longevity); they previously said
# "10 day adult count", which is not a variable in this data set.
# 30
hist(survivalave$Mean.line.round.longevity[survivalave$Paternal.temperature.oC == "30"],
     main = list("30", cex = 2), xlab = "Mean longevity of family (weeks)", ylab = "Frequency", ylim = c(0, 60),
     nclass = 10)
# 40 (title was "42", but the factor level plotted here is "40")
hist(survivalave$Mean.line.round.longevity[survivalave$Paternal.temperature.oC == "40"],
     col = "red", density = 30, angle = 180, border = "red",
     main = list("40", cex = 2), xlab = "Mean longevity of family (weeks)", ylab = "Frequency", ylim = c(0, 60),
     nclass = 10)

###### Plotting differences
# Base boxplot of the longevity distribution grouped by temperature.
boxplot(survivalave$Mean.line.round.longevity ~ survivalave$Paternal.temperature.oC,
        ylab = "Mean longevity of family (weeks)", xlab = "Temperature")

# Original note: plot() automatically produces a scatterplot instead if x is
# stored as an integer rather than a factor.


########### Normality - Passed in shapiro
# Longevity values for each treatment group.
longevity30 <- survivalave$Mean.line.round.longevity[survivalave$Paternal.temperature.oC == "30"]
longevity40 <- survivalave$Mean.line.round.longevity[survivalave$Paternal.temperature.oC == "40"]

shapiro.test(longevity30) # W = 0.94235, p-value = 0.1269
shapiro.test(longevity40) # W = 0.9533, p-value = 0.2228

# Kolmogorov-Smirnov test against a normal distribution with the sample's own
# mean and sd. The previous call, ks.test(x, pnorm), compared the raw values
# with the STANDARD normal N(0, 1); longevities of 30-96 weeks always give
# D = 1, p < 2.2e-16 there, so that result said nothing about normality.
# Because mean and sd are estimated from the same data, these p-values are
# conservative (the Lilliefors correction addresses this).
ks.test(longevity30, "pnorm", mean = mean(longevity30), sd = sd(longevity30))
ks.test(longevity40, "pnorm", mean = mean(longevity40), sd = sd(longevity40))

########### Homogeneity of Variances - Passed in all tests
bartlett.test(Mean.line.round.longevity ~ Paternal.temperature.oC, data = survivalave) # Bartlett's K-squared = 0.61722, df = 1, p-value = 0.4321
fligner.test(Mean.line.round.longevity ~ Paternal.temperature.oC, data = survivalave) # Fligner-Killeen:med chi-squared = 1.4, df = 1, p-value = 0.2367
# leveneTest() comes from the car package, which was only mentioned in a
# comment and never attached.
library(car)
leveneTest(Mean.line.round.longevity ~ Paternal.temperature.oC, data = survivalave) # Df F value Pr(>F) 1  1.2117 0.2758


################################################################################################################################################## SURVIVAL AVE OLD METHOD: USE NORMAL > TRY AND TRANSFORM TO NORMAL > NON PARAMETRIC ##################################### 

########## Transformation with just hist and shapiro
# Compare histograms and Shapiro-Wilk tests after a sqrt and a log10 transform.
# Two other ways to run the tests per group:
#   by(df$response, df$treatment, shapiro.test)
#   with(df, tapply(response, treatment, shapiro.test))
## RIGHT SKEW FIXING
# Draw the four histograms in a 2x2 grid. The previous layout is kept so it
# can be restored afterwards and later plots are not squeezed into the grid.
old_par <- par(mfrow = c(2, 2))

hist(sqrt(survivalave$Mean.line.round.longevity[survivalave$Paternal.temperature.oC == "30"]))
hist(sqrt(survivalave$Mean.line.round.longevity[survivalave$Paternal.temperature.oC == "40"]))
shapiro.test(sqrt(survivalave$Mean.line.round.longevity[survivalave$Paternal.temperature.oC == "30"]))
shapiro.test(sqrt(survivalave$Mean.line.round.longevity[survivalave$Paternal.temperature.oC == "40"]))
# recorded note: sqrt both fails

# A small constant is added before taking logs, as in the original.
hist(log10(survivalave$Mean.line.round.longevity[survivalave$Paternal.temperature.oC == "30"] + 0.01))
hist(log10(survivalave$Mean.line.round.longevity[survivalave$Paternal.temperature.oC == "40"] + 0.01))
shapiro.test(log10(survivalave$Mean.line.round.longevity[survivalave$Paternal.temperature.oC == "30"] + 0.01))
# This line previously tested sqrt() of a column of `survivallines`, an object
# this script never creates; it now tests the log10 transform of the 40 group,
# matching the three lines above.
shapiro.test(log10(survivalave$Mean.line.round.longevity[survivalave$Paternal.temperature.oC == "40"] + 0.01))
# recorded note: log10 both fail (the 40 group result predates the fix above - re-check)
# recorded note: no transformation fix

par(old_par) # back to the previous plotting layout

#### t-test
# The data are normal by Shapiro-Wilk and homogeneous in variance, so an
# independent two-sample t-test is valid.
# `paired = FALSE` is dropped: the formula interface is always unpaired, and
# from R 4.4.0 passing `paired` to the formula method is an error.
t.test(Mean.line.round.longevity ~ Paternal.temperature.oC, data = survivalave, var.equal = TRUE)
# t = 2.1786, df = 55, p-value = 0.03366






###### SURVIVAL AVERAGE NEW METHOD: SURVIVAL ANALYSIS ############################################################

# picking a single point in time would be arbitrary total survival time anlaysis required; survivorship
# time to event occurance (death)
#'survival' package (Therneau 2015)
#A parametric accelerated failure time (AFT) model was used which allows an appropriate error distribution to be specified (Crawley 2013).
# Event times are positively skewed such that a normal distribution is inappropriate for survival data, also variance increase with mean. therefore, a number of distributions were compared using AIC values, along with visual assessment of log-log plots and raw data overlaid with model fits (Crawley 2013). 
# Pick error with best model fit and accounted for non-proportional hazards shown in the log-log plot. 
# Censoring was not required as time of death of all experimental individuals was known. A Kaplan-Meier object was created and paternal temperature treatment (30 versus 40 oC) was included as a categorical explanatory variable
# constant hazard no censoring (likelihood of death with time) would just be a glm with gamma
# age specific hazard weibull for parametric or cox proportional for non-parametric
# Try  a range and compare fits


library(survival)

# Print the survival response object (time = mean line longevity,
# status = Censor) that the Kaplan-Meier curves below are built from.
with(survivalave, Surv(Mean.line.round.longevity, Censor))

# Overall survivorship curve (all lines pooled).
plot(survfit(formula = Surv(Mean.line.round.longevity, Censor) ~ 1,
             data = survivalave))
# Survivorship curves split by paternal heat treatment.
plot(survfit(formula = Surv(Mean.line.round.longevity, Censor) ~ Paternal.temperature.oC,
             data = survivalave))



## Non-parametric survival model on the line averages (uses ranks only)
# Cox proportional hazards model. The previous call used the data frame
# `survivallines` and its column Longevity.weeks.round.month, neither of which
# exists in this script, so it stopped with "object not found". It is now
# fitted to the line averages, as the heading above says.
survivalmodcox <- coxph(Surv(Mean.line.round.longevity, Censor) ~ Paternal.temperature.oC,
                        data = survivalave)
AIC(survivalmodcox)
# NOTE(review): the AIC of 1982.555 recorded here earlier presumably came from
# the individual-level `survivallines` fit; re-record it from this model.

# Proportional-hazards checks for the Cox model.
# Recorded note: on the log-log plot the lines cross and are not parallel.
survcloglog <- with(survivalave, Surv(Mean.line.round.longevity, Censor))
plot(
  survfit(survcloglog ~ survivalave$Paternal.temperature.oC),
  col = c("black", "red"),
  fun = "cloglog"
)


# cox.zph() on the fitted Cox model: printed test table, then its plot.
print(cox.zph(fit = survivalmodcox))
plot(cox.zph(fit = survivalmodcox))
# Recorded note: test is significant and line not horizontal






########### ! CHAPTER 5.3 MODEL SELECTION  ##################

library(survival)

# Print the survival response object (time = mean line longevity,
# status = Censor) used throughout the model selection below.
with(survivalave, Surv(Mean.line.round.longevity, Censor))

# Overall survivorship curve (all lines pooled).
plot(survfit(formula = Surv(Mean.line.round.longevity, Censor) ~ 1,
             data = survivalave))
# Survivorship curves split by paternal heat treatment.
plot(survfit(formula = Surv(Mean.line.round.longevity, Censor) ~ Paternal.temperature.oC,
             data = survivalave))

# Cox proportional hazard assumes the ratio of hazards between individuals of different treamtents remains constant over time  (http://influentialpoints.com/Training/coxs_proportional_hazards_regression_model-principles-properties-assumptions.htm, https://stat.ethz.ch/education/semesters/ss2011/seminar/contents/presentation_4.pdf page 15-6, https://stat.ethz.ch/education/semesters/ss2011/seminar/contents/handout_4.pdf)  hazards are not proportional as lines intersect on log-log plots. violation of assumption
# AICs quite large
# Cox uses ranks, less complex and accurate error structure, doesnt allow for censoring or predictions Crawley 2013  
# better to use parametric accelerated time failure models 



## Parametric survival models
# One survreg() fit per candidate error distribution, all with the same
# response and the single predictor Paternal.temperature.oC.
# The formulas use bare column names resolved through `data =`. They previously
# hard-coded `survivalave$...` inside Surv(), which bypasses `data` and stops
# predict(..., newdata = ) from working. The fitted values are unchanged; only
# the printed Call differs.
survivalmodwei <- survreg(Surv(Mean.line.round.longevity, Censor) ~ Paternal.temperature.oC, dist = "weibull", data = survivalave) # survreg's default distribution; recorded note: shallow hazard curve
survivalmodex <- survreg(Surv(Mean.line.round.longevity, Censor) ~ Paternal.temperature.oC, dist = "extreme", data = survivalave)
survivalmodlog <- survreg(Surv(Mean.line.round.longevity, Censor) ~ Paternal.temperature.oC, dist = "logistic", data = survivalave)
survivalmodgaus <- survreg(Surv(Mean.line.round.longevity, Censor) ~ Paternal.temperature.oC, dist = "gaussian", data = survivalave)
survivalmodlga <- survreg(Surv(Mean.line.round.longevity, Censor) ~ Paternal.temperature.oC, dist = "loggaussian", data = survivalave) # gives the same AIC and predictions as "lognormal" in the recorded results
survivalmodlgn <- survreg(Surv(Mean.line.round.longevity, Censor) ~ Paternal.temperature.oC, dist = "lognormal", data = survivalave) # recorded note: extreme late life risk
survivalmodexp <- survreg(Surv(Mean.line.round.longevity, Censor) ~ Paternal.temperature.oC, dist = "exponential", data = survivalave) # recorded note: constant hazard, simplest
survivalmodray <- survreg(Surv(Mean.line.round.longevity, Censor) ~ Paternal.temperature.oC, dist = "rayleigh", data = survivalave) # recorded note: linearly increasing hazard




## Comparing the candidate fits
# AIC table for all eight parametric models.
AIC(
  survivalmodwei,
  survivalmodex,
  survivalmodlog,
  survivalmodgaus,
  survivalmodlga,
  survivalmodlgn,
  survivalmodexp,
  survivalmodray
)
# Recorded output:
#                 df      AIC
# survivalmodwei   3 471.8418
# survivalmodex    3 478.8149
# survivalmodlog   3 470.5844
# survivalmodgaus  3 469.4890 <- lowest AIC
# survivalmodlga   3 472.6623
# survivalmodlgn   3 472.6623
# survivalmodexp   2 587.9898
# survivalmodray   2 518.1433

# Print the full summary of each model in turn (the "extreme" fit is not
# summarised, as in the original).
invisible(lapply(
  list(
    survivalmodwei,
    survivalmodlog,
    survivalmodgaus,
    survivalmodlga,
    survivalmodlgn,
    survivalmodexp,
    survivalmodray
  ),
  function(fit) print(summary(fit))
))

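# scale parameters: for the Weibull fit survreg reports scale = 1/shape, so scale < 1 (shape > 1) means the risk of death increases with age and scale > 1 means it decreases; for the gaussian fit the scale is the residual standard deviation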

# Observed group means, for comparison with the model predictions below.
# describeBy() comes from psych, which the script never attached.
library(psych)
describeBy(survivalave$Mean.line.round.longevity, survivalave$Paternal.temperature.oC) # 30, 40: 66.08    57.79

# Mean predicted longevity per treatment under each candidate distribution.
tapply(predict(survivalmodwei, type = "response"), survivalave$Paternal.temperature.oC, mean) # 70.18265 64.81209
tapply(predict(survivalmodlog, type = "response"), survivalave$Paternal.temperature.oC, mean) # 66.97591 56.58253
tapply(predict(survivalmodgaus, type = "response"), survivalave$Paternal.temperature.oC, mean) # 66.08336 57.79310 <- estimates closest
tapply(predict(survivalmodlga, type = "response"), survivalave$Paternal.temperature.oC, mean) # 64.62656 55.86755
tapply(predict(survivalmodlgn, type = "response"), survivalave$Paternal.temperature.oC, mean) # 64.62656 55.86755
tapply(predict(survivalmodexp, type = "response"), survivalave$Paternal.temperature.oC, mean) # 66.08336 57.79310
tapply(predict(survivalmodray, type = "response"), survivalave$Paternal.temperature.oC, mean) # 67.34634 59.73736

# Mean age at death of the control lines relative to the heatwave lines.
# The ratio is computed from the data; it was previously typed by hand as
# 66.06/57.57, which does not match the group means recorded above (66.08, 57.79).
group_means <- tapply(survivalave$Mean.line.round.longevity, survivalave$Paternal.temperature.oC, mean)
group_means[["30"]] / group_means[["40"]]
# 66.08 / 57.79 = 1.14

########### ! CHAPTER 5.3 MODEL SIGNIFICANCE  ##################

# Analysis of deviance for the selected gaussian model.
anova(object = survivalmodgaus)
# Recorded output:
#                         Df Deviance Resid. Df    -2*LL   Pr(>Chi)
# NULL                    NA       NA        55 468.2072         NA
# Paternal.temperature.oC  1 4.718205        54 463.4890 0.02984489

# Likelihood ratio test of the same model against the intercept-only model.
library(lmtest)
lrtest(object = survivalmodgaus)
# Recorded output:
# Likelihood ratio test
# # Model 1: Surv(survivalave$Mean.line.round.longevity, survivalave$Censor) ~
#      Paternal.temperature.oC
# Model 2: Surv(survivalave$Mean.line.round.longevity, survivalave$Censor) ~
#      1
#   #Df  LogLik Df  Chisq Pr(>Chisq)
# 1   3 -231.74
# 2   2 -234.10 -1 4.7182    0.02984 *

# Single-term deletion table for the same model.
drop1(object = survivalmodgaus)

########### ! CHAPTER 5.3 MODEL POST HOC  ##################

# Coefficient table of the selected model.
summary(object = survivalmodgaus)
# Recorded output:
# Call:
#      survreg(formula = Surv(survivalave$Mean.line.round.longevity,
#                             survivalave$Censor) ~ Paternal.temperature.oC, data = survivalave,
#              dist = "gaussian")
#                          Value Std. Error     z         p
# (Intercept)               66.08     2.6662 24.79 1.28e-135
# Paternal.temperature.oC40 -8.29     3.7379 -2.22  2.66e-02
# Log(scale)                 2.65     0.0937 28.26 1.08e-175
#
# Scale= 14.1
#
# Gaussian distribution
# Loglik(model)= -231.7   Loglik(intercept only)= -234.1
# Chisq= 4.72 on 1 degrees of freedom, p= 0.03
# Number of Newton-Raphson Iterations: 3
# n= 57








#### TRANSGEN LIFE EXP SUMMARY ######################################################################################
#Barnard et al., 2007 and Thomas 2015 as references
#---------Hypothesis
# Paternal 40oC heatwave treatment transgenerationally reduces the life expectancy of offspring relative to 30oC controls

#Response variable (dependent):           Mean longevity of family in weeks (continuous; average of the 4 replicates per line)

#Global Fixed variables (independent):          
#    Categorical                          temperature (30, 40)   # NOTE: "days after heatwave (5, 10, 15, 20)" was listed here but is not a term in any model in this script
#    Covariates                           NA
#    Non-linear terms                     NA
#    Interactions                         NA

#Random terms:                            4 individuals per family (averaged out)

#---------Misc

# Simple analysis:                       independent t-tests on average longevity
# Non-para:                              mann-whitney Us 
# Plot:                                  Step line survivorship curve

#------Model report Simple stats

# Line averages
# As the data is normal in shapiro and homogenous in variance independent t-test valid
# t.test(survivalave$Mean.line.round.longevity ~ survivalave$Paternal.temperature.oC, var.equal = TRUE, paired = FALSE)
# t = 2.1786, df = 55, p-value = 0.03366

# ( Total 
# As the data is not normal in shapiro but  homogenous in variance mann whitney U is valid
# wilcox.test(survivallines$Longevity.weeks.round.month ~ survivallines$Paternal.temperature.oC, var.equal = TRUE, paired = FALSE)
# W = 7479.5, p-value = 0.01201 )


#---------GLM Model refinement

# Survival analysis type:           
# Hazards not proportional from log(-log) plot as line cross and residual-time plot being significant/ =/= 0 so a parametric accelerated time to failure model selected

# Hazard refinement method(s):              AIC and predicted value comparison

# Most plausible/final model(s):           survreg(formula = Surv(survivalave$Mean.line.round.longevity, survivalave$Censor) ~ Paternal.temperature.oC, data = survivalave, dist = "gaussian")

# AIC(c)                                   469.4890
# Model significance:                      from analysis of deviance lrtest()/anova: X^2(1) = 4.7182 p= 0.02984 



#-------Model report

#term                  peramter+/-se    test-stat(wald z)   d.f.      P
# (Intercept)               66.08   2.6662    24.79               1.28e-135
# Paternal.temperature.oC40 -8.29  3.7379     -2.22                2.66e-02

# Hypothesis interpretation:  Heatwaves decrease the life expectency of offspring

