#########################################################################################
######################### CCMI Nursery ##################################################
#########################################################################################

# objective: Nursery data analysis 
# Author: Lucas Le Gall (legall.lucas.llg@gmail.com)
# Date created: Sept 2023
# Last edited: 2024

# clear work space
rm(list=ls())

citation("vegan")
#########################################################################################
#################################### Set up #############################################

library(tidyverse)
library(fishualize)
library(patchwork)
library(FSA) 
library(vegan)
library(survival)
library(survminer)
library(viridis)
library(tseries)
library(dplyr)
library(lme4)
library(emmeans)

#########################################################################################
################################ working directories ####################################

data_wd <- "C:/Users/CCMI/Central Caribbean Marine Institute, Inc/Research - REEL - nursery_llg (1)/Nursery/data"

output_wd <- "C:/Users/CCMI/Central Caribbean Marine Institute, Inc/Research - REEL - nursery_llg (1)/Nursery/output" 

nursery_data <- read.csv(file=file.path(data_wd, "nursery_health_status_2023.csv") , fileEncoding="UTF-8-BOM")

nursery_alldata <- read.csv(file=file.path(data_wd, "nursery_health_status_2023 - ALL DATA.csv") , fileEncoding="UTF-8-BOM")

dhw_noaaog <- read.csv(file=file.path(data_wd, "dhwnoaa.csv") , fileEncoding="UTF-8-BOM")

temp_data <- read.csv(file=file.path(data_wd, "icon_frame.csv") , fileEncoding="UTF-8-BOM")

temp_full <- read.csv(file=file.path(data_wd, "temp_full.csv") , fileEncoding="UTF-8-BOM")


######get the noaa's data straight from the interweb
#noaalink <- "http://coralreefwatch.noaa.gov/product/vs/data/cayman_islands.txt"
#dhw_noaaog <- read.table(noaalink, header = TRUE, sep = "\t")


########################################################################################
############################# set my own theme for figures #############################

my_theme <- theme_classic() +
  theme(axis.title.x = element_text(size = 22, color = "black"), axis.title.y = element_text(size = 22, color = "black"), text=element_text(size=16)) +
  theme(axis.text.y = element_text(size=18, color="black"), axis.text.x = element_text(size=18, color="black", angle = 0, vjust = .0)) +
  theme(legend.text = element_text(size = 16),
        legend.title = element_text(size = 18, face = "bold")) 

#########################################################################################
######################data cleaning######################################################

summary(nursery_data)

#to put the date in a correct format understood by R
nursery_data$date <- as.Date(nursery_data$date, format = "%m/%d/%Y")

nursery_data$bleaching[nursery_data$status == "D"] <- "dead"

nursery_data_filtered <- nursery_data %>%
  filter(!(status %in% c("O", "0")))

nursery_bleaching <- nursery_data_filtered %>%
  group_by(date, bleaching) %>%
  summarise(total = n())

summary(nursery_bleaching)

# filter only useful things of off the noaa df
dhw_noaa <- dhw_noaaog %>%
  select(1, 7)

#####arrondis
#dhw_noaa$DHW_from_90th_HS.1 <- ceiling(dhw_noaa$DHW_from_90th_HS.1)

# Add a Celcius collumn and get rid of the farenheit column in the temp data

#proper format on date
temp_data$date <- as.Date(temp_data$date, format = "%m/%d/%Y")

temp_data <- temp_data %>%
  mutate(temp_celsius = (temp_f-32)*5/9 )

frame_celsius <- temp_data %>%
  select(-temp_f)

#######################################################################################
###################graph################################################################

##### add degree week to the og dataset

dhw_noaa$date <- as.Date(dhw_noaa$date, format = "%m/%d/%Y")

nurseryhealthdhw <- merge(nursery_bleaching, dhw_noaa, by.x = "date", by.y = "date", all.x = TRUE)

##### add temperature in Celsius to the og dataset

temp_frame_simple <- frame_celsius %>%
  select(-time, -X.)

#group the temp in an average per day format
temp_mean <- temp_frame_simple %>%
  group_by(date) %>%
  summarise(temp_av= mean(temp_celsius))
  
#merging in one master dataset
nurseryhealthfull <- merge(nurseryhealthdhw, temp_mean, by.x = "date", by.y = "date", all.x = TRUE)

########################################################
############## temperature plot and data ##############################

nurseryline <- nurseryhealthfull %>%
  mutate(bleaching = if_else(bleaching %in% c("0", "PB", "B"), "alive","dead"))%>%
  group_by(date, bleaching,temp_av) %>%
  summarise(total=sum(total))

survivaltemp_plot <- ggplot(nurseryhealthfull, aes(x = date)) +
  geom_line(aes(y = total, color = bleaching), size = 1) +
  geom_line(aes(y = temp_av, group = 1), color = "red", size = 1.5) +
  labs(x = "Date", y = "Value") +
  scale_color_viridis_d(name = "Bleaching") +
  theme_classic() 
  
survivaltemp_plot <- ggplot(nurseryline, aes(x = as.factor(date))) +
  geom_smooth(aes(y = total, color = bleaching, group=bleaching), size = 1) +
  geom_smooth(aes(y = temp_av*10, group = 1), color = "#ebbcd8", size = 1.5) +
  scale_y_continuous(name = "Number of colonies", sec.axis = sec_axis(~./10, name = "Degree Celsius")) +
  scale_color_manual(
    values = c("#bcebcf", "#bcc0eb"),
    name = "Condition",
    labels = c("alive" = "Alive", "dead" = "Dead")
  ) +
  labs(x = "", y = "Colonies (n)") +
  theme_classic()

survivaltemp_plot

png(file=file.path(output_wd, "tempcurve.png"), height = 2000, width = 3000, res = 350)
survivaltemp_plot
dev.off()

######## comparing temp between each Hobo #############

#converting f in Celsius
temp_full$date <- as.Date(temp_data$date, format = "%m/%d/%Y")

temp_full <- temp_full %>%
  mutate(temp_frame_c = (temp_frame_f-32)*5/9 ) %>%
  mutate(temp_sst_c = (temp_sst_f-32)*5/9 ) %>%
  mutate(temp_10m_c = (temp_10m_f-32)*5/9 ) 

#delete Fahrenheit data

temp_c <- temp_full %>%
  select(-temp_frame_f, -temp_sst_f, -temp_10m_f, -time, -X.)

#long format

temp_long <- temp_c %>%
  pivot_longer(cols = starts_with("temp_"), names_to = "hobo_site", values_to = "temp_c")

# plotting line graph

temp_graph <- ggplot(temp_long, aes(x = date, y=temp_c), fill= hobo_site)+
  geom_line(aes(y = temp_c, color = hobo_site, group=hobo_site), size = 1)+
  scale_color_manual(values = c("temp_10m_c" = "#bcebcf", "temp_frame_c" = "#bccfeb", "temp_sst_c" = "#ebbcbc"),
    name = "Position",
    labels = c("temp_10m_c" = "10m", "temp_frame_c" = "Frame", "temp_sst_c" = "Sea surface"))+
  labs(x = "Date", y = "Temperature in Celsius")+
  my_theme

temp_graph

#Comparison with noaa temp values

noaa_av <- dhw_noaaog %>%
  mutate(temp_noaa = (SST_MAX + SST_MIN) / 2)

noaa_av$date <- as.Date(noaa_av$date, format = "%m/%d/%Y")

noaa_av <- noaa_av %>%
  select(-SST_MAX, -SST_MIN, -SST.90th_HS, -SSTA.90th_HS, -X90th_HS.0, -DHW_from_90th_HS.1, -BAA_7day_max)
  
temp_comparison <- merge(temp_c, noaa_av, by.x = "date", by.y = "date", all.x = TRUE)

temp_comparison <- temp_comparison %>%
  pivot_longer(cols = starts_with("temp_"), names_to = "hobo_site", values_to = "temp_c")

#Adding the noaa surface temp curve to the previous graph

temp_comparison$hobo_site <- factor(temp_comparison$hobo_site, levels = c("temp_sst_c","temp_10m_c", "temp_frame_c", "temp_noaa"))%>%

temp_comparison <- temp_comparison %>%
  slice(1:(n() - 144))

temp_comparison$hobo_site <- factor(temp_comparison$hobo_site, 
                                    levels = c("temp_sst_c", "temp_10m_c", "temp_frame_c", "temp_noaa"))

comparison_graph <- ggplot(temp_comparison, aes(x = date, y = temp_c)) +
  geom_line(aes(y = temp_c, color = hobo_site, group = hobo_site), size = 2, alpha = 0.75) +
  scale_color_manual(values = c("temp_10m_c" = "#5cee6a", "temp_frame_c" = "#5c97ee", "temp_sst_c" = "#ee5ce0", "temp_noaa" = "#eeb35c"),
                     name = "Temperature Data\nOrigin",
                     labels = c("temp_10m_c" = "10m", "temp_frame_c" = "Frame", "temp_sst_c" = "Sea surface", "temp_noaa" = "NOAA")) +
  labs(x = "Date", y = "Temperature in Celsius") +
  my_theme +
  guides(color = guide_legend(override.aes = list(alpha = 1)))
  


comparison_graph

png(file=file.path(output_wd, "comparison_graph.png"), height = 3000, width = 4000, res = 300)
comparison_graph
dev.off()

### data analysis between the different Hobo

# function for standard error
standard_error <- function(x) sd(x)/sqrt(length(x))

# get medians and standard error
tapply(temp_comparison$temp_c, temp_comparison$hobo_site, median)
tapply(temp_comparison$temp_c, temp_comparison$hobo_site, max)
tapply(temp_comparison$temp_c, temp_comparison$hobo_site, mean)
tapply(temp_comparison$temp_c, temp_comparison$hobo_site, min)
tapply(temp_comparison$temp_c, temp_comparison$hobo_site, standard_error)

#test for normality Jarque Bera (too big of a data set for Shapiro)
jarque.bera.test(temp_comparison$temp_c)
#data:  temp_comparison$temp_c
#X-squared = 2861.6, df = 2, p-value < 2.2e-16
#not normally distributed

#Kruskal wallis
kruskal.test(temp_c ~ hobo_site, data = temp_comparison)
#data:  temp_c by hobo_site
#Kruskal-Wallis chi-squared = 364, df = 3, p-value < 2.2e-16

#post-hoc Dunns test

FSA::dunnTest(x = temp_comparison$temp_c, g = temp_comparison$hobo_site, method = "bonferroni")
#              Comparison          Z        P.unadj          P.adj
#1 temp_10m_c - temp_frame_c   2.672747   7.523305e-03   4.513983e-02   Significative dif
#2    temp_10m_c - temp_noaa  12.513624   6.288825e-36   3.773295e-35   Significative dif
#3  temp_frame_c - temp_noaa   9.840877   7.505343e-23   4.503206e-22   Significative dif
#4   temp_10m_c - temp_sst_c  -6.200872   5.615121e-10   3.369072e-09   Significative dif
#5 temp_frame_c - temp_sst_c  -8.873619   7.080681e-19   4.248409e-18   Significative dif
#6    temp_noaa - temp_sst_c -18.714496   3.771732e-78   2.263039e-77   Significative dif

#########################################################
######### bar plot territory ###########################

# put healthy at the bottom and bleached on top
nurseryhealthdhw$bleaching <- factor(nurseryhealthdhw$bleaching, levels = c("0", "PB", "B", "dead"))


nurseryhealth <- ggplot(nurseryhealthdhw, aes(x = as.factor(DHW_from_90th_HS.1), y = total, fill = bleaching)) +
  geom_bar(stat = "identity", position = "stack", col = "white") +
  theme_classic() +
  scale_fill_manual(
    values = c("0" = "orange", "PB" = "#ffe599", "B" = "white", "dead"= "black"),
    name = "Condition",
    labels = c("0" = "Healthy", "PB" = "Partially Bleached", "B" = "Bleached", "dead"= "Dead")
  ) +
  labs(x = "Degree Heating Weeks", y = "Colonies (n)")

nurseryhealth

############ calcul pourcentage 

nurserypercent <- nurseryhealthdhw %>%
  group_by(date) %>%
  mutate(percentage = (total / sum(total)) * 100) %>%
  ungroup()

nurseryhealthpercent <- ggplot(nurserypercent, aes(x = as.factor(date), y = percentage, fill = bleaching)) +
  geom_bar(stat = "identity", position = "stack", col = "white") +
  theme_classic() +
  scale_fill_manual(
    values = c("0" = "orange", "PB" = "#ffe599", "B" = "white", "dead"= "black"),
    name = "Condition",
    labels = c("0" = "Healthy", "PB" = "Partially Bleached", "B" = "Bleached", "dead"= "Dead")
  ) +
  labs(x = "Degree Heating Weeks", y = "Colonies (n)")

nurseryhealthpercent


####### Je veux superposer le graph des DHW et les bars de la survie ###########


p1 <- ggplot(nurserypercent, aes(x = as.factor(date), fill = bleaching)) +
  geom_bar(aes(y = percentage), stat = "identity", position = "stack", col = "white") +
  geom_path(aes(x = as.factor(date), y = DHW_from_90th_HS.1 * 5, group = 1), color = "white", size = 2.5) + 
  geom_path(aes(x = as.factor(date), y = DHW_from_90th_HS.1 * 5, group = 1), color = "red", size = 1.5) + #two geom line to have the white outline around the DHW curve
  scale_y_continuous(name = "Colonies (%)", sec.axis = sec_axis(~./5, name = "DHW")) +
  scale_fill_manual(
    values = c("0" = "orange", "PB" = "#ffe599", "B" = "#f6f6f0", "dead" = "black"),
    name = "Condition",
    labels = c("0" = "Healthy", "PB" = "Partially Bleached", "B" = "Bleached", "dead" = "Dead")) +
  labs(x = "", y = "Colonies (%)") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

p1

png(file=file.path(output_wd, "nursery_percent_dhw.png"), height = 2000, width = 3000, res = 350)
p1
dev.off()

sapply(nurserypercent)

#stat zone for nursery percent

# Filter the data for each bleaching status
pb_data <- nurserypercent %>% filter(bleaching == "PB")
b_data <- nurserypercent %>% filter(bleaching == "B")
dead_data <- nurserypercent %>% filter(bleaching == "dead")


# Poisson regression for Partially Bleached (PB)
pb_model <- glm(total ~ DHW_from_90th_HS.1 * date, family = poisson(link = "log"), data = pb_data)
summary(pb_model)

# Poisson regression for Bleached (B)
b_model <- glm(total ~ DHW_from_90th_HS.1 * date, family = poisson(link = "log"), data = b_data)
summary(b_model)

# Poisson regression for Dead
dead_model <- glm(total ~ DHW_from_90th_HS.1 * date, family = poisson(link = "log"), data = dead_data)
summary(dead_model)

#poisson
glm(total~scale(DHW_from_90th_HS.1)+scale(date), family = quasipoisson(link = "log"), data=dead_data)

#######################################bar death

bardeath <- ggplot(nurserypercent %>% filter(bleaching == "dead"), aes(x = as.factor(date), y = percentage, color = bleaching)) +
  geom_segment(aes(xend = as.factor(date), y = 0, yend = percentage), size = 1.5) +  # Adds a horizontal bar
  geom_path(aes(x = as.factor(date), y = DHW_from_90th_HS.1 * 5, group = 1), color = "white", size = 2.5) + 
  geom_path(aes(x = as.factor(date), y = DHW_from_90th_HS.1 * 5, group = 1), color = "red", size = 1.5) + 
  scale_y_continuous(name = "Colonies (%)", sec.axis = sec_axis(~./5, name = "DHW")) +
  scale_color_manual(
    values = c("dead" = "#685b5c"),
    name = "Condition",
    labels = c("dead" = "Dead")) +
  labs(x = "", y = "Colonies (n)") +
  theme_classic() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = rel(1.35)),
    axis.text.y = element_text(size = rel(1.5)),
    axis.title.x = element_text(size = rel(1.5)),
    axis.title.y = element_text(size = rel(1.5)),
    legend.text = element_text(size = rel(1.15)),
    legend.title = element_text(size = rel(1.25))
  ) +
  scale_x_discrete(breaks = levels(as.factor(nurserypercent$date))[seq(1, length(levels(as.factor(nurserypercent$date))), by = 2)])

bardeath

png(file=file.path(output_wd, "nursery_deathpercent_dhw.png"), height = 1500, width = 1500, res = 350)
bardeath
dev.off()

####################################################################################################
########################genotype survival ##########################################################

nursery_data$bleaching[nursery_data$status == "D"] <- "dead"

nursery_data_surv <- nursery_data %>%
  filter(!(status %in% c("O"))) 


nursery_gen <- nursery_data_surv %>%
  group_by(date, genotype, status) %>%
  mutate(status = ifelse(status == "D", "dead", "alive"))  %>%
  summarise(total = n())

#### put the date in order
nursery_gen$date <- as.Date(nursery_gen$date, format = "%m/%d/%Y")

nursery_gen <- nursery_gen %>%
  arrange(date)

########### table for table 1 test

nursery_status <- nursery_data_filtered %>%
  group_by(date, genotype, bleaching) %>%
  summarise(total = n())

nursery_table <- nursery_status %>%
  pivot_wider(names_from = date, values_from = total)%>%
  arrange(genotype)
  
  nursery_table$bleaching[nursery_table$bleaching == 0] <- "alive"
  nursery_table[is.na(nursery_table)] <- 0

  write.csv(nursery_table, file = file.path(output_wd, "table1.csv"), row.names = FALSE)
  dev.off()
#####adding percentage data to nursery_gen


######## getting and adding the initial amount of colonies as dated of the 2023-06-30
######## to calculate percent of loss
initial_tot <- nursery_gen %>%
  filter(date == "2023-06-30") %>%
  group_by(genotype) %>%
  rename(totalog = total) %>%
  select(-date)

initial_tot <- initial_tot %>%
  group_by(genotype) %>%
  summarise(total = sum(totalog)) 
  

nursery_gen_perc <- left_join(nursery_gen,initial_tot, by="genotype")

########### calculating percentage of living colonies

nursery_genperc <- nursery_gen_perc %>%
  mutate(percentage = (total.x / total.y) * 100)
  
#### let's add a column that replace the value of the 100% dead by 0% alive

nursery_genperc <- nursery_genperc %>%
  mutate(percalive = if_else(status == "dead" & percentage >= 100, 0, percentage))

##### we're getting rid of all the "dead" except the one that have 100% mortality, because now it's 0%

nursery_genpercalive <- nursery_genperc %>%
  filter(!(status == "dead" & percentage < 100))

################################################################################
##################plotting total   /!\ Broken code needs to shave the "dead" factor off 

nursery_gena_alive <- nursery_genpercalive %>%
  mutate(total.x = if_else(status == "dead" & percalive = 0, 0))

plot_gen <- ggplot(nursery_genpercalive, aes(x=date, y=total.x, color=genotype)) +
  geom_line(size=1) +
  theme_classic() +
  labs(x = "Surveys dates", y = "Total number of living corals")


plot_gen 


plot_gen <- ggplot(nursery_genpercalive, aes(x = date, y = total, color = genotype)) +
  geom_line(size = 1) +
  theme_classic() +
  labs(x = "Survey dates", y = "Total number of living corals") +
  scale_y_continuous(limits = c(0, max(nursery_gen$total)), expand = c(0, 0))  

# Extract the last data point for each genotype
last_points <- nursery_gen %>%
  group_by(genotype) %>%
  slice_tail(n = 1)

# Add labels to the end of the lines
plot_gen <- plot_gen +
  geom_text(data = last_points, aes(label = genotype), nudge_x = 0.5, nudge_y = 0.5, hjust = 0)

plot_gen

###################################################################################################
####### same graph but percent 


plot_genperc <- ggplot(nursery_genpercalive, aes(x = date, y = percalive, color = genotype)) +
  geom_line(size = 1) +
  theme_classic() +
  labs(x = "Survey dates", y = "Percent of living colonies per genotype") +
  scale_y_continuous(limits = c(0, max(110)), expand = c(0, 0))


####### Extract the last data point for each genotype
label <- nursery_genpercalive %>%
  group_by(genotype) %>%
  filter(percalive != 0) %>%
  slice_tail(n=1)

####### Add labels to the lines
plot_genperc <- plot_genperc + 
  geom_label(
    data = nursery_genpercalive, 
    aes(label = genotype, x = date, y = percalive),
    nudge_x = 0, nudge_y = 0, hjust = 0)

plot_genperc


png(file=file.path(output_wd, "plot_genotype_perc.png"), height = 2000, width = 3000, res = 350)
plot_genperc
dev.off()


plot_genperc1 <- ggplot(nursery_genpercalive, aes(x = date, y = percalive, group = genotype)) +
  geom_line(size = 0.5, color = "black") +
  geom_point(aes(fill = genotype), size = 4, shape = 21, color = "white") +
  theme_classic() +
  labs(x = "Survey dates", y = "Percent of living colonies per genotype") +
  scale_y_continuous(limits = c(0, 110), expand = c(0, 0))


plot_genperc1

png(file=file.path(output_wd, "plot_genotype_perc1.png"), height = 2000, width = 3000, res = 500)
plot_genperc1
dev.off()


##############################################################################
##########stats for comparing genotypes, survival and date
#genotype as a factor:
nursery_genpercalive$genotype <- as.factor(nursery_genpercalive$genotype)

#date as factor too pls
nursery_genpercalive$date <- as.factor(nursery_genpercalive$date)

#put genotype G as the reference because it is the most populous
nursery_genpercalive$genotype <- relevel(nursery_genpercalive$genotype, ref = "G")

# Full Interaction Model: % Mortality * Date * Genotype
interaction_gen <- lm(percalive ~ genotype * date, data = nursery_genpercalive)
summary(interaction_gen)
#### and the posthoc
emm <- emmeans(interaction_gen, ~ genotype * date)
posthoc <- contrast(emm, method = "pairwise", by = "date")
summary(posthoc)

# Random Effect Model: % Mortality * Genotype | Date
random_gen <- lmer(percalive ~ genotype + (1 | date), data = nursery_genpercalive)
summary(random_gen)
# random effect ems 
emms <- emmeans(random_gen, ~ genotype)

summary(emms)

pairwise_comparisons <- pairs(emms)
summary(pairwise_comparisons)

emms_df <- as.data.frame(emms)
emms_df$emmean <- pmin(pmax(emms_df$emmean, 0), 100)
emms_df$lower.CL <- pmin(pmax(emms_df$lower.CL, 0), 100)
emms_df$upper.CL <- pmin(pmax(emms_df$upper.CL, 0), 100)

plot(emms_df)

ranef_diagnostic <- ranef(random_gen, condVar = TRUE)
qqnorm(ranef_diagnostic$date[,1])
qqline(ranef_diagnostic$date[,1])

emmgen <- ggplot(emms_df, aes(x = genotype, y = emmean)) +
  geom_errorbar(aes(ymin = lower.CL, ymax = upper.CL), width = 0, color = "black", size = 1.5) +
  geom_point(aes(fill = genotype), size = 9, shape = 21, color = "white") + 
  theme_classic() +
  labs(title = "",
       x = "",
       y = "Estimated Marginal Means 
       of living colonies") +
  geom_text(aes(label = round(emmean, 1)), hjust = -0, size = 0) +
  theme(legend.position = "none",
        axis.title = element_text(size = 24),  
        axis.text.y = element_text(size = 16),
        axis.text.x = element_blank())  


emmgen

png(file=file.path(output_wd, "emmgen .png"), height = 3000, width = 3000, res = 500)
emmgen 
dev.off()

#######################################################

simple_gen <- lm(percalive ~ genotype + date, data = nursery_genpercalive)
summary(simple_gen)

# Compute EMMs for the simplified model
emm_simple <- emmeans(simple_gen, ~ genotype * date)
summary(emm_simple)

posthoc <- contrast(emm_simple, method = "pairwise", by = "date")
summary(posthoc)
#############################################################################
##################### genotype bleaching 

nursery_data$bleaching[nursery_data$status == "D"] <- "dead"

nursery_data_bleached <- nursery_data %>%
  filter(!(bleaching %in% c("dead", "B")))

nursery_gen_bleached <- nursery_data_bleached %>%
  group_by(date, genotype) %>%
  summarise(total = n()) 


nursery_gen_bleached$date <- as.Date(nursery_gen_bleached$date, format = "%m/%d/%Y")

nursery_gen_bleached <- nursery_gen_bleached %>%
  arrange(date)

plot_gen_bl <- ggplot(nursery_gen_bleached, aes(x=date, y=total, color=genotype)) +
  geom_line(size=1) +
  theme_classic() +
  labs(x = "Surveys dates", y = "Total number of healthy corals")


plot_gen_bl 


#############################################################################
###################Kaplan Meier survival curve###############################

## first we need a data set displaying: genotype, time in days and whether they died or not and if they died, how long it took them

nursery_alldata$bleaching[nursery_alldata$status == "D"] <- "dead"


kaplanset <- nursery_alldata %>% 
  select( -site) %>%
  mutate(date = as.Date(date, format = "%m/%d/%Y")) %>%
  mutate(event = ifelse(bleaching == "dead", 1,0)) %>%
  select(-bleaching, -status) #%>%
  #filter(!(genotype %in% c("O"))) 

frame_date <- as.Date("2023-05-26")

kaplanset <- kaplanset %>%
  mutate(days_of_implantation = as.numeric(difftime(date, frame_date, units = "days"))) %>%
  mutate(survey_number = dense_rank(date))

#############okay so it's not pretty or straight forward and absolutely not adaptable to another example,
#############but it works, and it's fine by me, I managed to get the survival time of each colonies and
#############display the events in preparation for kaplan meier

#get the length of survival per colonies
kmdata <- kaplanset %>%
  group_by(frame, column, row, genotype) %>%
  summarise(total_events = sum(event)) %>%
  mutate(days_of_survival = case_when(
    total_events == 13 ~ 0,
    total_events == 12 ~ 17,
    total_events == 11 ~ 35,
    total_events == 10 ~ 53,
    total_events == 9 ~ 67,
    total_events == 8 ~ 96,
    total_events == 7 ~ 113,
    total_events == 6 ~ 132,
    total_events == 5 ~ 143,
    total_events == 4 ~ 157,
    total_events == 3 ~ 181,
    total_events == 2 ~ 197,
    total_events == 1 ~ 225,
    total_events == 0 ~ 240,
    TRUE ~ NA_real_
  ),events = as.integer(total_events > 0))

#adding the DHW to the dataset

#first we need a df that tells us which DHW value correspond to how many days in the experiment

dhw_exp <- dhw_noaa %>% slice(14029:n())

dhw_exp <- dhw_exp %>% slice(1:241)

dhw_exp <- dhw_exp %>%
  mutate(days = 1:241)

dhw_exp <- dhw_exp %>%
  select(-date)

#add dhw to the kaplan meier data set
merged_kmdata <- merge(kmdata, dhw_exp, by.x = "days_of_survival", by.y = "days", all.x = TRUE)

#get rid of the colonies that started dead
merged_kmdata <- merged_kmdata %>%
  filter(days_of_survival != 0)

###########Kaplan Meier time !

#kaplan meier with days of survival
help(Surv)

km_fit <- survfit(Surv(days_of_survival, events) ~ genotype, data = kmdata)

# Kaplan-Meier curve with separate lines for each genotype
kmcurve <- ggsurvplot(km_fit_dhw, data = merged_kmdata, 
                          pval = TRUE,
                          conf.int = TRUE,
                          risk.table = TRUE,
                          palette = "viridis",  
                          censor = TRUE)

kmcurve

png(file=file.path(output_wd, "kaplan_meier.png"), height = 4000, width = 8000, res = 350)
kmcurve
dev.off()


# Fit KM model
km_fit <- survfit(Surv(days_of_survival, events) ~ genotype, data = kmdata)

# Kaplan-Meier plot
kmcurve_plot <- ggsurvplot(
  km_fit, 
  data = kmdata, 
  pval = TRUE,
  conf.int = TRUE,
  risk.table = FALSE,     # no risk table 
  palette = "viridis",  
  censor = TRUE,
  ggtheme = theme_classic(base_size = 26)  # Set base font size for readability
)


kmcurve_plot$plot <- kmcurve_plot$plot +
  labs(title = "A") +
  theme(
    axis.title.x = element_text(size = 26),
    axis.title.y = element_text(size = 26),
    axis.text = element_text(size = 20),
    plot.margin = margin(20, 20, 20, 20),
    legend.position = "none"  
  )

# Create risk table as separate plot
kmcurve_table <- ggsurvplot(
  km_fit,
  data = kmdata,
  risk.table = TRUE,   # Only risk table
  risk.table.fontsize = 7,
  ggtheme = theme_classic(base_size = 30)  
)


kmcurve_table$table <- kmcurve_table$table +
  labs(title = "B") +  
  theme(
    axis.text = element_text(size = 26),  
    axis.title = element_text(size = 26),
    plot.margin = margin(20, 20, 20, 20)
  )

# Combine the main plot and risk table with patchwork, setting A and B side by side
combined_plot <- (kmcurve_plot$plot | kmcurve_table$table) +
  plot_layout(widths = c(2, 1)) 

combined_plot

# Save the combined plot as a PNG
png(file = file.path(output_wd, "kaplan_meier.png"), height = 4000, width = 8000, res = 350)
print(combined_plot)
dev.off()



#Kaplan meier with DHW

# Fit the Kaplan-Meier survival curve
km_fit_dhw <- survfit(Surv(DHW_from_90th_HS.1, events) ~ genotype, data = merged_kmdata)

# Kaplan-Meier curve with separate lines for each genotype
kmcurve_dhw <- ggsurvplot(
  km_fit_dhw, 
  data = merged_kmdata, 
  pval = TRUE,
  conf.int = TRUE,
  risk.table = TRUE,
  palette = "rainbow",
  censor = TRUE,
  xlab = "Degree Heating Week",  # Change x-axis title
  ggtheme = theme_classic(base_size = 16),  # Set base font size for the whole plot
  tables.theme = theme_classic(base_size = 14)  # Set base font size for the risk table
)

kmcurve_dhw$plot <- kmcurve_dhw$plot + 
  theme(
    axis.title.x = element_text(size = 18),  # X-axis title size
    axis.title.y = element_text(size = 18),  # Y-axis title size
    axis.text = element_text(size = 16),     # Axis text size
    legend.text = element_text(size = 16),   # Legend text size
    legend.title = element_text(size = 16)   # Legend title size
  )

kmcurve_dhw

png(file=file.path(output_wd, "kaplan_meier_dhw.png"), height = 4000, width = 8000, res = 400)
kmcurve_dhw
dev.off()



####### stat test of Kaplan and his gang

kmdata$genotype <- relevel(kmdata$genotype, ref = "G")  

# Fit the Kaplan-Meier survival curve
km_fit <- survfit(Surv(days_of_survival, events) ~ genotype, data = kmdata)

summary(km_fit)
colnames(merged_kmdata)
merged_kmdata[, 8]
# Perform pairwise comparisons with the log-rank test
library(survival)
pairwise_result <- pairwise_survdiff(Surv(days_of_survival, events) ~ genotype, data = merged_kmdata, p.adjust.method = "bonferroni")
pairwise_result

help("strata")
pairwise_dhw <- pairwise_survdiff(Surv(days_of_survival, events) ~ genotype, DHW_squared, data = merged_kmdata, p.adjust.method = "bonferroni")
pairwise_dhw

########################cox

merged_kmdata$genotype <- factor(merged_kmdata$genotype)

merged_kmdata$genotype <- relevel(merged_kmdata$genotype, ref = "G")

cox_model <- coxph(formula = Surv(days_of_survival, events) ~ genotype + DHW_from_90th_HS.1, data = merged_kmdata)
 
summary(cox_model)

merged_kmdata$DHW_squared <- merged_kmdata$DHW_from_90th_HS.1^2

merged_kmdata$DHW_sqrt <- sqrt(merged_kmdata$DHW_from_90th_HS.1)

cox_model_nonlinear <- coxph(Surv(days_of_survival, events) ~ genotype + DHW_sqrt, data = merged_kmdata)

cox_residuals <- resid(cox_model_nonlinear, type = "martingale")


ggplot(data = data.frame(DHW_from_90th_HS.1 = merged_kmdata$DHW_from_90th_HS.1, Residuals = cox_residuals), aes(x = DHW_from_90th_HS.1, y = Residuals)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  xlab("DHW_from_90th_HS.1") +
  ylab("Partial Residuals")

ggplot(data = data.frame(Time = merged_kmdata$days_of_survival, Residuals = cox_residuals), aes(x = Time, y = Residuals)) +
  geom_point() +
  geom_smooth(method = "loess", se = FALSE) +
  xlab("Time") +
  ylab("Martingale Residuals")

forest_plot <- ggforest(cox_model, data = merged_kmdata)

forest_plot

png(file=file.path(output_wd, "runforest.png"), height = 2000, width = 4000, res = 350)
forest_plot
dev.off()

cox_model <- coxph(Surv(days_of_survival, events) ~ genotype + DHW_from_90th_HS.1, data = merged_kmdata)
cox_model
ggforest(cox_model, data = merged_kmdata)

# Check proportional hazards assumption
cox.zph_test <- cox.zph(cox_model)
cox.zph_test
plot(cox.zph_test)

#schoenfeld is violated so moving to a stratification of "genotype"

#test
cox_model_stratified <- coxph(Surv(days_of_survival, events) ~ DHW_from_90th_HS.1 + strata(genotype), data = merged_kmdata)
cox_model_stratified

#schoenfeld again
cox.zph(cox_model_stratified)

#still violated

#cox model with time dependant covariate
coxph(Surv(days_of_survival, events) ~ genotype + tt(DHW_from_90th_HS.1), data = merged_kmdata)

#aft model
aft_model <- survreg(Surv(days_of_survival, events) ~ genotype + DHW_from_90th_HS.1, data = merged_kmdata, dist = "weibull")
aft_model

# AIC for Weibull model
AIC_weibull <- AIC(aft_model)
AIC_weibull
# BIC for Weibull model
BIC_weibull <- BIC(aft_model)
BIC_weibull

AIC_cox <- AIC(cox_model)
AIC_cox
BIC_cox <- BIC(cox_model)
BIC_cox

# Extract Deviance residuals from the Weibull model
deviance_weibull <- residuals(aft_model, type = "deviance")

ggplot(data = data.frame(SurvivalTime = merged_kmdata$days_of_survival, Residuals = deviance_weibull), aes(x = SurvivalTime, y = Residuals)) +
  geom_point() +
  geom_smooth(method = "loess", se = TRUE, color = "blue") +
  labs(x = "Survival Time", y = "Deviance Residuals") +
  theme_minimal()

# Check proportional hazards assumption
cox_fit <- coxph(Surv(days_of_survival, events) ~ genotype + DHW_from_90th_HS.1, data = kmdata)
cox.zph_test <- cox.zph(cox_fit)
plot(cox.zph_test)


#test
pairwise_p[is.na(pairwise_p)] <- 1
dist_matrix <- 1 - pairwise_p

# Perform NMDS
nmds_result <- metaMDS(dist_matrix, k = 2, trymax = 100)
nmds_result <- metaMDS(dist_matrix, k = 2, distance = "euclidean", trymax = 100)
# Extract the NMDS coordinates
nmds_coords <- as.data.frame(nmds_result$points)
nmds_coords$genotype <- rownames(nmds_coords)

# Plot the NMDS results
ggplot(nmds_coords, aes(MDS1, MDS2, label = genotype)) +
  geom_point() +
  geom_text(vjust = 1, hjust = 1) +
  labs(title = "NMDS Ordination of Genotypes Based on Pairwise Differences",
       x = "NMDS1", y = "NMDS2") +
  theme_minimal()


# hierarchy cluster
unique(kmdata$genotype)

pairwise_pvalues_matrix <- pairwise_result$p.value

#matrix into proper format
pairwise_pvalues_matrix[is.na(pairwise_pvalues_matrix)] <- 1  
pairwise_pvalues_matrix <- as.matrix(pairwise_pvalues_matrix)

pairwise_pvalues_matrix

# p-values to a dissimilarity matrix
dissimilarity_matrix <- 1 - pairwise_pvalues_matrix

# Hierarchical clustering : average linkage
hc <- hclust(as.dist(dissimilarity_matrix), method = "average")

cluster <- plot(hc, main = "",
     xlab = "Genotypes", ylab = "Dissimilarity", sub = "",
     cex = 0.8)

print(cluster)

png(file=file.path(output_wd, "kaplan_meier_dhw.png"), height = 4000, width = 8000, res = 350)
kmcurve_dhw
dev.off()