######################################################################################### ######################### CCMI Nursery ################################################## ######################################################################################### # objective: Nursery data analysis # Author: Lucas Le Gall (legall.lucas.llg@gmail.com) # Date created: Sept 2023 # Last edited: 2024 # clear work space rm(list=ls()) citation("vegan") ######################################################################################### #################################### Set up ############################################# library(tidyverse) library(fishualize) library(patchwork) library(FSA) library(vegan) library(survival) library(survminer) library(viridis) library(tseries) library(dplyr) library(lme4) library(emmeans) ######################################################################################### ################################ working directories #################################### data_wd <- "C:/Users/CCMI/Central Caribbean Marine Institute, Inc/Research - REEL - nursery_llg (1)/Nursery/data" output_wd <- "C:/Users/CCMI/Central Caribbean Marine Institute, Inc/Research - REEL - nursery_llg (1)/Nursery/output" nursery_data <- read.csv(file=file.path(data_wd, "nursery_health_status_2023.csv") , fileEncoding="UTF-8-BOM") nursery_alldata <- read.csv(file=file.path(data_wd, "nursery_health_status_2023 - ALL DATA.csv") , fileEncoding="UTF-8-BOM") dhw_noaaog <- read.csv(file=file.path(data_wd, "dhwnoaa.csv") , fileEncoding="UTF-8-BOM") temp_data <- read.csv(file=file.path(data_wd, "icon_frame.csv") , fileEncoding="UTF-8-BOM") temp_full <- read.csv(file=file.path(data_wd, "temp_full.csv") , fileEncoding="UTF-8-BOM") ######get the noaa's data straight from the interweb #noaalink <- "http://coralreefwatch.noaa.gov/product/vs/data/cayman_islands.txt" #dhw_noaaog <- read.table(noaalink, header = TRUE, sep = "\t") ######################################################################################## ############################# set my own theme for figures ############################# my_theme <- theme_classic() + theme(axis.title.x = element_text(size = 22, color = "black"), axis.title.y = element_text(size = 22, color = "black"), text=element_text(size=16)) + theme(axis.text.y = element_text(size=18, color="black"), axis.text.x = element_text(size=18, color="black", angle = 0, vjust = .0)) + theme(legend.text = element_text(size = 16), legend.title = element_text(size = 18, face = "bold")) ######################################################################################### ######################data cleaning###################################################### summary(nursery_data) #to put the date in a correct format understood by R nursery_data$date <- as.Date(nursery_data$date, format = "%m/%d/%Y") nursery_data$bleaching[nursery_data$status == "D"] <- "dead" nursery_data_filtered <- nursery_data %>% filter(!(status %in% c("O", "0"))) nursery_bleaching <- nursery_data_filtered %>% group_by(date, bleaching) %>% summarise(total = n()) summary(nursery_bleaching) # filter only useful things of off the noaa df dhw_noaa <- dhw_noaaog %>% select(1, 7) #####arrondis #dhw_noaa$DHW_from_90th_HS.1 <- ceiling(dhw_noaa$DHW_from_90th_HS.1) # Add a Celcius collumn and get rid of the farenheit column in the temp data #proper format on date temp_data$date <- as.Date(temp_data$date, format = "%m/%d/%Y") temp_data <- temp_data %>% mutate(temp_celsius = (temp_f-32)*5/9 ) frame_celsius <- temp_data %>% select(-temp_f) ####################################################################################### ###################graph################################################################ ##### add degree week to the og dataset dhw_noaa$date <- as.Date(dhw_noaa$date, format = "%m/%d/%Y") nurseryhealthdhw <- merge(nursery_bleaching, dhw_noaa, by.x = "date", by.y = "date", all.x = TRUE) ##### add temperature in Celsius to the og dataset temp_frame_simple <- frame_celsius %>% select(-time, -X.) #group the temp in an average per day format temp_mean <- temp_frame_simple %>% group_by(date) %>% summarise(temp_av= mean(temp_celsius)) #merging in one master dataset nurseryhealthfull <- merge(nurseryhealthdhw, temp_mean, by.x = "date", by.y = "date", all.x = TRUE) ######################################################## ############## temperature plot and data ############################## nurseryline <- nurseryhealthfull %>% mutate(bleaching = if_else(bleaching %in% c("0", "PB", "B"), "alive","dead"))%>% group_by(date, bleaching,temp_av) %>% summarise(total=sum(total)) survivaltemp_plot <- ggplot(nurseryhealthfull, aes(x = date)) + geom_line(aes(y = total, color = bleaching), size = 1) + geom_line(aes(y = temp_av, group = 1), color = "red", size = 1.5) + labs(x = "Date", y = "Value") + scale_color_viridis_d(name = "Bleaching") + theme_classic() survivaltemp_plot <- ggplot(nurseryline, aes(x = as.factor(date))) + geom_smooth(aes(y = total, color = bleaching, group=bleaching), size = 1) + geom_smooth(aes(y = temp_av*10, group = 1), color = "#ebbcd8", size = 1.5) + scale_y_continuous(name = "Number of colonies", sec.axis = sec_axis(~./10, name = "Degree Celsius")) + scale_color_manual( values = c("#bcebcf", "#bcc0eb"), name = "Condition", labels = c("alive" = "Alive", "dead" = "Dead") ) + labs(x = "", y = "Colonies (n)") + theme_classic() survivaltemp_plot png(file=file.path(output_wd, "tempcurve.png"), height = 2000, width = 3000, res = 350) survivaltemp_plot dev.off() ######## comparing temp between each Hobo ############# #converting f in Celsius temp_full$date <- as.Date(temp_data$date, format = "%m/%d/%Y") temp_full <- temp_full %>% mutate(temp_frame_c = (temp_frame_f-32)*5/9 ) %>% mutate(temp_sst_c = (temp_sst_f-32)*5/9 ) %>% mutate(temp_10m_c = (temp_10m_f-32)*5/9 ) #delete Fahrenheit data temp_c <- temp_full %>% select(-temp_frame_f, -temp_sst_f, -temp_10m_f, -time, -X.) #long format temp_long <- temp_c %>% pivot_longer(cols = starts_with("temp_"), names_to = "hobo_site", values_to = "temp_c") # plotting line graph temp_graph <- ggplot(temp_long, aes(x = date, y=temp_c), fill= hobo_site)+ geom_line(aes(y = temp_c, color = hobo_site, group=hobo_site), size = 1)+ scale_color_manual(values = c("temp_10m_c" = "#bcebcf", "temp_frame_c" = "#bccfeb", "temp_sst_c" = "#ebbcbc"), name = "Position", labels = c("temp_10m_c" = "10m", "temp_frame_c" = "Frame", "temp_sst_c" = "Sea surface"))+ labs(x = "Date", y = "Temperature in Celsius")+ my_theme temp_graph #Comparison with noaa temp values noaa_av <- dhw_noaaog %>% mutate(temp_noaa = (SST_MAX + SST_MIN) / 2) noaa_av$date <- as.Date(noaa_av$date, format = "%m/%d/%Y") noaa_av <- noaa_av %>% select(-SST_MAX, -SST_MIN, -SST.90th_HS, -SSTA.90th_HS, -X90th_HS.0, -DHW_from_90th_HS.1, -BAA_7day_max) temp_comparison <- merge(temp_c, noaa_av, by.x = "date", by.y = "date", all.x = TRUE) temp_comparison <- temp_comparison %>% pivot_longer(cols = starts_with("temp_"), names_to = "hobo_site", values_to = "temp_c") #Adding the noaa surface temp curve to the previous graph temp_comparison$hobo_site <- factor(temp_comparison$hobo_site, levels = c("temp_sst_c","temp_10m_c", "temp_frame_c", "temp_noaa"))%>% temp_comparison <- temp_comparison %>% slice(1:(n() - 144)) temp_comparison$hobo_site <- factor(temp_comparison$hobo_site, levels = c("temp_sst_c", "temp_10m_c", "temp_frame_c", "temp_noaa")) comparison_graph <- ggplot(temp_comparison, aes(x = date, y = temp_c)) + geom_line(aes(y = temp_c, color = hobo_site, group = hobo_site), size = 2, alpha = 0.75) + scale_color_manual(values = c("temp_10m_c" = "#5cee6a", "temp_frame_c" = "#5c97ee", "temp_sst_c" = "#ee5ce0", "temp_noaa" = "#eeb35c"), name = "Temperature Data\nOrigin", labels = c("temp_10m_c" = "10m", "temp_frame_c" = "Frame", "temp_sst_c" = "Sea surface", "temp_noaa" = "NOAA")) + labs(x = "Date", y = "Temperature in Celsius") + my_theme + guides(color = guide_legend(override.aes = list(alpha = 1))) comparison_graph png(file=file.path(output_wd, "comparison_graph.png"), height = 3000, width = 4000, res = 300) comparison_graph dev.off() ### data analysis between the different Hobo # function for standard error standard_error <- function(x) sd(x)/sqrt(length(x)) # get medians and standard error tapply(temp_comparison$temp_c, temp_comparison$hobo_site, median) tapply(temp_comparison$temp_c, temp_comparison$hobo_site, max) tapply(temp_comparison$temp_c, temp_comparison$hobo_site, mean) tapply(temp_comparison$temp_c, temp_comparison$hobo_site, min) tapply(temp_comparison$temp_c, temp_comparison$hobo_site, standard_error) #test for normality Jarque Bera (too big of a data set for Shapiro) jarque.bera.test(temp_comparison$temp_c) #data: temp_comparison$temp_c #X-squared = 2861.6, df = 2, p-value < 2.2e-16 #not normally distributed #Kruskal wallis kruskal.test(temp_c ~ hobo_site, data = temp_comparison) #data: temp_c by hobo_site #Kruskal-Wallis chi-squared = 364, df = 3, p-value < 2.2e-16 #post-hoc Dunns test FSA::dunnTest(x = temp_comparison$temp_c, g = temp_comparison$hobo_site, method = "bonferroni") # Comparison Z P.unadj P.adj #1 temp_10m_c - temp_frame_c 2.672747 7.523305e-03 4.513983e-02 Significative dif #2 temp_10m_c - temp_noaa 12.513624 6.288825e-36 3.773295e-35 Significative dif #3 temp_frame_c - temp_noaa 9.840877 7.505343e-23 4.503206e-22 Significative dif #4 temp_10m_c - temp_sst_c -6.200872 5.615121e-10 3.369072e-09 Significative dif #5 temp_frame_c - temp_sst_c -8.873619 7.080681e-19 4.248409e-18 Significative dif #6 temp_noaa - temp_sst_c -18.714496 3.771732e-78 2.263039e-77 Significative dif ######################################################### ######### bar plot territory ########################### # put healthy at the bottom and bleached on top nurseryhealthdhw$bleaching <- factor(nurseryhealthdhw$bleaching, levels = c("0", "PB", "B", "dead")) nurseryhealth <- ggplot(nurseryhealthdhw, aes(x = as.factor(DHW_from_90th_HS.1), y = total, fill = bleaching)) + geom_bar(stat = "identity", position = "stack", col = "white") + theme_classic() + scale_fill_manual( values = c("0" = "orange", "PB" = "#ffe599", "B" = "white", "dead"= "black"), name = "Condition", labels = c("0" = "Healthy", "PB" = "Partially Bleached", "B" = "Bleached", "dead"= "Dead") ) + labs(x = "Degree Heating Weeks", y = "Colonies (n)") nurseryhealth ############ calcul pourcentage nurserypercent <- nurseryhealthdhw %>% group_by(date) %>% mutate(percentage = (total / sum(total)) * 100) %>% ungroup() nurseryhealthpercent <- ggplot(nurserypercent, aes(x = as.factor(date), y = percentage, fill = bleaching)) + geom_bar(stat = "identity", position = "stack", col = "white") + theme_classic() + scale_fill_manual( values = c("0" = "orange", "PB" = "#ffe599", "B" = "white", "dead"= "black"), name = "Condition", labels = c("0" = "Healthy", "PB" = "Partially Bleached", "B" = "Bleached", "dead"= "Dead") ) + labs(x = "Degree Heating Weeks", y = "Colonies (n)") nurseryhealthpercent ####### Je veux superposer le graph des DHW et les bars de la survie ########### p1 <- ggplot(nurserypercent, aes(x = as.factor(date), fill = bleaching)) + geom_bar(aes(y = percentage), stat = "identity", position = "stack", col = "white") + geom_path(aes(x = as.factor(date), y = DHW_from_90th_HS.1 * 5, group = 1), color = "white", size = 2.5) + geom_path(aes(x = as.factor(date), y = DHW_from_90th_HS.1 * 5, group = 1), color = "red", size = 1.5) + #two geom line to have the white outline around the DHW curve scale_y_continuous(name = "Colonies (%)", sec.axis = sec_axis(~./5, name = "DHW")) + scale_fill_manual( values = c("0" = "orange", "PB" = "#ffe599", "B" = "#f6f6f0", "dead" = "black"), name = "Condition", labels = c("0" = "Healthy", "PB" = "Partially Bleached", "B" = "Bleached", "dead" = "Dead")) + labs(x = "", y = "Colonies (%)") + theme_classic() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) p1 png(file=file.path(output_wd, "nursery_percent_dhw.png"), height = 2000, width = 3000, res = 350) p1 dev.off() sapply(nurserypercent) #stat zone for nursery percent # Filter the data for each bleaching status pb_data <- nurserypercent %>% filter(bleaching == "PB") b_data <- nurserypercent %>% filter(bleaching == "B") dead_data <- nurserypercent %>% filter(bleaching == "dead") # Poisson regression for Partially Bleached (PB) pb_model <- glm(total ~ DHW_from_90th_HS.1 * date, family = poisson(link = "log"), data = pb_data) summary(pb_model) # Poisson regression for Bleached (B) b_model <- glm(total ~ DHW_from_90th_HS.1 * date, family = poisson(link = "log"), data = b_data) summary(b_model) # Poisson regression for Dead dead_model <- glm(total ~ DHW_from_90th_HS.1 * date, family = poisson(link = "log"), data = dead_data) summary(dead_model) #poisson glm(total~scale(DHW_from_90th_HS.1)+scale(date), family = quasipoisson(link = "log"), data=dead_data) #######################################bar death bardeath <- ggplot(nurserypercent %>% filter(bleaching == "dead"), aes(x = as.factor(date), y = percentage, color = bleaching)) + geom_segment(aes(xend = as.factor(date), y = 0, yend = percentage), size = 1.5) + # Adds a horizontal bar geom_path(aes(x = as.factor(date), y = DHW_from_90th_HS.1 * 5, group = 1), color = "white", size = 2.5) + geom_path(aes(x = as.factor(date), y = DHW_from_90th_HS.1 * 5, group = 1), color = "red", size = 1.5) + scale_y_continuous(name = "Colonies (%)", sec.axis = sec_axis(~./5, name = "DHW")) + scale_color_manual( values = c("dead" = "#685b5c"), name = "Condition", labels = c("dead" = "Dead")) + labs(x = "", y = "Colonies (n)") + theme_classic() + theme( axis.text.x = element_text(angle = 45, hjust = 1, size = rel(1.35)), axis.text.y = element_text(size = rel(1.5)), axis.title.x = element_text(size = rel(1.5)), axis.title.y = element_text(size = rel(1.5)), legend.text = element_text(size = rel(1.15)), legend.title = element_text(size = rel(1.25)) ) + scale_x_discrete(breaks = levels(as.factor(nurserypercent$date))[seq(1, length(levels(as.factor(nurserypercent$date))), by = 2)]) bardeath png(file=file.path(output_wd, "nursery_deathpercent_dhw.png"), height = 1500, width = 1500, res = 350) bardeath dev.off() #################################################################################################### ########################genotype survival ########################################################## nursery_data$bleaching[nursery_data$status == "D"] <- "dead" nursery_data_surv <- nursery_data %>% filter(!(status %in% c("O"))) nursery_gen <- nursery_data_surv %>% group_by(date, genotype, status) %>% mutate(status = ifelse(status == "D", "dead", "alive")) %>% summarise(total = n()) #### put the date in order nursery_gen$date <- as.Date(nursery_gen$date, format = "%m/%d/%Y") nursery_gen <- nursery_gen %>% arrange(date) ########### table for table 1 test nursery_status <- nursery_data_filtered %>% group_by(date, genotype, bleaching) %>% summarise(total = n()) nursery_table <- nursery_status %>% pivot_wider(names_from = date, values_from = total)%>% arrange(genotype) nursery_table$bleaching[nursery_table$bleaching == 0] <- "alive" nursery_table[is.na(nursery_table)] <- 0 write.csv(nursery_table, file = file.path(output_wd, "table1.csv"), row.names = FALSE) dev.off() #####adding percentage data to nursery_gen ######## getting and adding the initial amount of colonies as dated of the 2023-06-30 ######## to calculate percent of loss initial_tot <- nursery_gen %>% filter(date == "2023-06-30") %>% group_by(genotype) %>% rename(totalog = total) %>% select(-date) initial_tot <- initial_tot %>% group_by(genotype) %>% summarise(total = sum(totalog)) nursery_gen_perc <- left_join(nursery_gen,initial_tot, by="genotype") ########### calculating percentage of living colonies nursery_genperc <- nursery_gen_perc %>% mutate(percentage = (total.x / total.y) * 100) #### let's add a column that replace the value of the 100% dead by 0% alive nursery_genperc <- nursery_genperc %>% mutate(percalive = if_else(status == "dead" & percentage >= 100, 0, percentage)) ##### we're getting rid of all the "dead" except the one that have 100% mortality, because now it's 0% nursery_genpercalive <- nursery_genperc %>% filter(!(status == "dead" & percentage < 100)) ################################################################################ ##################plotting total /!\ Broken code needs to shave the "dead" factor off nursery_gena_alive <- nursery_genpercalive %>% mutate(total.x = if_else(status == "dead" & percalive = 0, 0)) plot_gen <- ggplot(nursery_genpercalive, aes(x=date, y=total.x, color=genotype)) + geom_line(size=1) + theme_classic() + labs(x = "Surveys dates", y = "Total number of living corals") plot_gen plot_gen <- ggplot(nursery_genpercalive, aes(x = date, y = total, color = genotype)) + geom_line(size = 1) + theme_classic() + labs(x = "Survey dates", y = "Total number of living corals") + scale_y_continuous(limits = c(0, max(nursery_gen$total)), expand = c(0, 0)) # Extract the last data point for each genotype last_points <- nursery_gen %>% group_by(genotype) %>% slice_tail(n = 1) # Add labels to the end of the lines plot_gen <- plot_gen + geom_text(data = last_points, aes(label = genotype), nudge_x = 0.5, nudge_y = 0.5, hjust = 0) plot_gen ################################################################################################### ####### same graph but percent plot_genperc <- ggplot(nursery_genpercalive, aes(x = date, y = percalive, color = genotype)) + geom_line(size = 1) + theme_classic() + labs(x = "Survey dates", y = "Percent of living colonies per genotype") + scale_y_continuous(limits = c(0, max(110)), expand = c(0, 0)) ####### Extract the last data point for each genotype label <- nursery_genpercalive %>% group_by(genotype) %>% filter(percalive != 0) %>% slice_tail(n=1) ####### Add labels to the lines plot_genperc <- plot_genperc + geom_label( data = nursery_genpercalive, aes(label = genotype, x = date, y = percalive), nudge_x = 0, nudge_y = 0, hjust = 0) plot_genperc png(file=file.path(output_wd, "plot_genotype_perc.png"), height = 2000, width = 3000, res = 350) plot_genperc dev.off() plot_genperc1 <- ggplot(nursery_genpercalive, aes(x = date, y = percalive, group = genotype)) + geom_line(size = 0.5, color = "black") + geom_point(aes(fill = genotype), size = 4, shape = 21, color = "white") + theme_classic() + labs(x = "Survey dates", y = "Percent of living colonies per genotype") + scale_y_continuous(limits = c(0, 110), expand = c(0, 0)) plot_genperc1 png(file=file.path(output_wd, "plot_genotype_perc1.png"), height = 2000, width = 3000, res = 500) plot_genperc1 dev.off() ############################################################################## ##########stats for comparing genotypes, survival and date #genotype as a factor: nursery_genpercalive$genotype <- as.factor(nursery_genpercalive$genotype) #date as factor too pls nursery_genpercalive$date <- as.factor(nursery_genpercalive$date) #put genotype G as the reference because it is the most populous nursery_genpercalive$genotype <- relevel(nursery_genpercalive$genotype, ref = "G") # Full Interaction Model: % Mortality * Date * Genotype interaction_gen <- lm(percalive ~ genotype * date, data = nursery_genpercalive) summary(interaction_gen) #### and the posthoc emm <- emmeans(interaction_gen, ~ genotype * date) posthoc <- contrast(emm, method = "pairwise", by = "date") summary(posthoc) # Random Effect Model: % Mortality * Genotype | Date random_gen <- lmer(percalive ~ genotype + (1 | date), data = nursery_genpercalive) summary(random_gen) # random effect ems emms <- emmeans(random_gen, ~ genotype) summary(emms) pairwise_comparisons <- pairs(emms) summary(pairwise_comparisons) emms_df <- as.data.frame(emms) emms_df$emmean <- pmin(pmax(emms_df$emmean, 0), 100) emms_df$lower.CL <- pmin(pmax(emms_df$lower.CL, 0), 100) emms_df$upper.CL <- pmin(pmax(emms_df$upper.CL, 0), 100) plot(emms_df) ranef_diagnostic <- ranef(random_gen, condVar = TRUE) qqnorm(ranef_diagnostic$date[,1]) qqline(ranef_diagnostic$date[,1]) emmgen <- ggplot(emms_df, aes(x = genotype, y = emmean)) + geom_errorbar(aes(ymin = lower.CL, ymax = upper.CL), width = 0, color = "black", size = 1.5) + geom_point(aes(fill = genotype), size = 9, shape = 21, color = "white") + theme_classic() + labs(title = "", x = "", y = "Estimated Marginal Means of living colonies") + geom_text(aes(label = round(emmean, 1)), hjust = -0, size = 0) + theme(legend.position = "none", axis.title = element_text(size = 24), axis.text.y = element_text(size = 16), axis.text.x = element_blank()) emmgen png(file=file.path(output_wd, "emmgen .png"), height = 3000, width = 3000, res = 500) emmgen dev.off() ####################################################### simple_gen <- lm(percalive ~ genotype + date, data = nursery_genpercalive) summary(simple_gen) # Compute EMMs for the simplified model emm_simple <- emmeans(simple_gen, ~ genotype * date) summary(emm_simple) posthoc <- contrast(emm_simple, method = "pairwise", by = "date") summary(posthoc) ############################################################################# ##################### genotype bleaching nursery_data$bleaching[nursery_data$status == "D"] <- "dead" nursery_data_bleached <- nursery_data %>% filter(!(bleaching %in% c("dead", "B"))) nursery_gen_bleached <- nursery_data_bleached %>% group_by(date, genotype) %>% summarise(total = n()) nursery_gen_bleached$date <- as.Date(nursery_gen_bleached$date, format = "%m/%d/%Y") nursery_gen_bleached <- nursery_gen_bleached %>% arrange(date) plot_gen_bl <- ggplot(nursery_gen_bleached, aes(x=date, y=total, color=genotype)) + geom_line(size=1) + theme_classic() + labs(x = "Surveys dates", y = "Total number of healthy corals") plot_gen_bl ############################################################################# ###################Kaplan Meier survival curve############################### ## first we need a data set displaying: genotype, time in days and whether they died or not and if they died, how long it took them nursery_alldata$bleaching[nursery_alldata$status == "D"] <- "dead" kaplanset <- nursery_alldata %>% select( -site) %>% mutate(date = as.Date(date, format = "%m/%d/%Y")) %>% mutate(event = ifelse(bleaching == "dead", 1,0)) %>% select(-bleaching, -status) #%>% #filter(!(genotype %in% c("O"))) frame_date <- as.Date("2023-05-26") kaplanset <- kaplanset %>% mutate(days_of_implantation = as.numeric(difftime(date, frame_date, units = "days"))) %>% mutate(survey_number = dense_rank(date)) #############okay so it's not pretty or straight forward and absolutely not adaptable to another example, #############but it works, and it's fine by me, I managed to get the survival time of each colonies and #############display the events in preparation for kaplan meier #get the length of survival per colonies kmdata <- kaplanset %>% group_by(frame, column, row, genotype) %>% summarise(total_events = sum(event)) %>% mutate(days_of_survival = case_when( total_events == 13 ~ 0, total_events == 12 ~ 17, total_events == 11 ~ 35, total_events == 10 ~ 53, total_events == 9 ~ 67, total_events == 8 ~ 96, total_events == 7 ~ 113, total_events == 6 ~ 132, total_events == 5 ~ 143, total_events == 4 ~ 157, total_events == 3 ~ 181, total_events == 2 ~ 197, total_events == 1 ~ 225, total_events == 0 ~ 240, TRUE ~ NA_real_ ),events = as.integer(total_events > 0)) #adding the DHW to the dataset #first we need a df that tells us which DHW value correspond to how many days in the experiment dhw_exp <- dhw_noaa %>% slice(14029:n()) dhw_exp <- dhw_exp %>% slice(1:241) dhw_exp <- dhw_exp %>% mutate(days = 1:241) dhw_exp <- dhw_exp %>% select(-date) #add dhw to the kaplan meier data set merged_kmdata <- merge(kmdata, dhw_exp, by.x = "days_of_survival", by.y = "days", all.x = TRUE) #get rid of the colonies that started dead merged_kmdata <- merged_kmdata %>% filter(days_of_survival != 0) ###########Kaplan Meier time ! #kaplan meier with days of survival help(Surv) km_fit <- survfit(Surv(days_of_survival, events) ~ genotype, data = kmdata) # Kaplan-Meier curve with separate lines for each genotype kmcurve <- ggsurvplot(km_fit_dhw, data = merged_kmdata, pval = TRUE, conf.int = TRUE, risk.table = TRUE, palette = "viridis", censor = TRUE) kmcurve png(file=file.path(output_wd, "kaplan_meier.png"), height = 4000, width = 8000, res = 350) kmcurve dev.off() # Fit KM model km_fit <- survfit(Surv(days_of_survival, events) ~ genotype, data = kmdata) # Kaplan-Meier plot kmcurve_plot <- ggsurvplot( km_fit, data = kmdata, pval = TRUE, conf.int = TRUE, risk.table = FALSE, # no risk table palette = "viridis", censor = TRUE, ggtheme = theme_classic(base_size = 26) # Set base font size for readability ) kmcurve_plot$plot <- kmcurve_plot$plot + labs(title = "A") + theme( axis.title.x = element_text(size = 26), axis.title.y = element_text(size = 26), axis.text = element_text(size = 20), plot.margin = margin(20, 20, 20, 20), legend.position = "none" ) # Create risk table as separate plot kmcurve_table <- ggsurvplot( km_fit, data = kmdata, risk.table = TRUE, # Only risk table risk.table.fontsize = 7, ggtheme = theme_classic(base_size = 30) ) kmcurve_table$table <- kmcurve_table$table + labs(title = "B") + theme( axis.text = element_text(size = 26), axis.title = element_text(size = 26), plot.margin = margin(20, 20, 20, 20) ) # Combine the main plot and risk table with patchwork, setting A and B side by side combined_plot <- (kmcurve_plot$plot | kmcurve_table$table) + plot_layout(widths = c(2, 1)) combined_plot # Save the combined plot as a PNG png(file = file.path(output_wd, "kaplan_meier.png"), height = 4000, width = 8000, res = 350) print(combined_plot) dev.off() #Kaplan meier with DHW # Fit the Kaplan-Meier survival curve km_fit_dhw <- survfit(Surv(DHW_from_90th_HS.1, events) ~ genotype, data = merged_kmdata) # Kaplan-Meier curve with separate lines for each genotype kmcurve_dhw <- ggsurvplot( km_fit_dhw, data = merged_kmdata, pval = TRUE, conf.int = TRUE, risk.table = TRUE, palette = "rainbow", censor = TRUE, xlab = "Degree Heating Week", # Change x-axis title ggtheme = theme_classic(base_size = 16), # Set base font size for the whole plot tables.theme = theme_classic(base_size = 14) # Set base font size for the risk table ) kmcurve_dhw$plot <- kmcurve_dhw$plot + theme( axis.title.x = element_text(size = 18), # X-axis title size axis.title.y = element_text(size = 18), # Y-axis title size axis.text = element_text(size = 16), # Axis text size legend.text = element_text(size = 16), # Legend text size legend.title = element_text(size = 16) # Legend title size ) kmcurve_dhw png(file=file.path(output_wd, "kaplan_meier_dhw.png"), height = 4000, width = 8000, res = 400) kmcurve_dhw dev.off() ####### stat test of Kaplan and his gang kmdata$genotype <- relevel(kmdata$genotype, ref = "G") # Fit the Kaplan-Meier survival curve km_fit <- survfit(Surv(days_of_survival, events) ~ genotype, data = kmdata) summary(km_fit) colnames(merged_kmdata) merged_kmdata[, 8] # Perform pairwise comparisons with the log-rank test library(survival) pairwise_result <- pairwise_survdiff(Surv(days_of_survival, events) ~ genotype, data = merged_kmdata, p.adjust.method = "bonferroni") pairwise_result help("strata") pairwise_dhw <- pairwise_survdiff(Surv(days_of_survival, events) ~ genotype, DHW_squared, data = merged_kmdata, p.adjust.method = "bonferroni") pairwise_dhw ########################cox merged_kmdata$genotype <- factor(merged_kmdata$genotype) merged_kmdata$genotype <- relevel(merged_kmdata$genotype, ref = "G") cox_model <- coxph(formula = Surv(days_of_survival, events) ~ genotype + DHW_from_90th_HS.1, data = merged_kmdata) summary(cox_model) merged_kmdata$DHW_squared <- merged_kmdata$DHW_from_90th_HS.1^2 merged_kmdata$DHW_sqrt <- sqrt(merged_kmdata$DHW_from_90th_HS.1) cox_model_nonlinear <- coxph(Surv(days_of_survival, events) ~ genotype + DHW_sqrt, data = merged_kmdata) cox_residuals <- resid(cox_model_nonlinear, type = "martingale") ggplot(data = data.frame(DHW_from_90th_HS.1 = merged_kmdata$DHW_from_90th_HS.1, Residuals = cox_residuals), aes(x = DHW_from_90th_HS.1, y = Residuals)) + geom_point() + geom_smooth(method = "lm", se = FALSE) + xlab("DHW_from_90th_HS.1") + ylab("Partial Residuals") ggplot(data = data.frame(Time = merged_kmdata$days_of_survival, Residuals = cox_residuals), aes(x = Time, y = Residuals)) + geom_point() + geom_smooth(method = "loess", se = FALSE) + xlab("Time") + ylab("Martingale Residuals") forest_plot <- ggforest(cox_model, data = merged_kmdata) forest_plot png(file=file.path(output_wd, "runforest.png"), height = 2000, width = 4000, res = 350) forest_plot dev.off() cox_model <- coxph(Surv(days_of_survival, events) ~ genotype + DHW_from_90th_HS.1, data = merged_kmdata) cox_model ggforest(cox_model, data = merged_kmdata) # Check proportional hazards assumption cox.zph_test <- cox.zph(cox_model) cox.zph_test plot(cox.zph_test) #schoenfeld is violated so moving to a stratification of "genotype" #test cox_model_stratified <- coxph(Surv(days_of_survival, events) ~ DHW_from_90th_HS.1 + strata(genotype), data = merged_kmdata) cox_model_stratified #schoenfeld again cox.zph(cox_model_stratified) #still violated #cox model with time dependant covariate coxph(Surv(days_of_survival, events) ~ genotype + tt(DHW_from_90th_HS.1), data = merged_kmdata) #aft model aft_model <- survreg(Surv(days_of_survival, events) ~ genotype + DHW_from_90th_HS.1, data = merged_kmdata, dist = "weibull") aft_model # AIC for Weibull model AIC_weibull <- AIC(aft_model) AIC_weibull # BIC for Weibull model BIC_weibull <- BIC(aft_model) BIC_weibull AIC_cox <- AIC(cox_model) AIC_cox BIC_cox <- BIC(cox_model) BIC_cox # Extract Deviance residuals from the Weibull model deviance_weibull <- residuals(aft_model, type = "deviance") ggplot(data = data.frame(SurvivalTime = merged_kmdata$days_of_survival, Residuals = deviance_weibull), aes(x = SurvivalTime, y = Residuals)) + geom_point() + geom_smooth(method = "loess", se = TRUE, color = "blue") + labs(x = "Survival Time", y = "Deviance Residuals") + theme_minimal() # Check proportional hazards assumption cox_fit <- coxph(Surv(days_of_survival, events) ~ genotype + DHW_from_90th_HS.1, data = kmdata) cox.zph_test <- cox.zph(cox_fit) plot(cox.zph_test) #test pairwise_p[is.na(pairwise_p)] <- 1 dist_matrix <- 1 - pairwise_p # Perform NMDS nmds_result <- metaMDS(dist_matrix, k = 2, trymax = 100) nmds_result <- metaMDS(dist_matrix, k = 2, distance = "euclidean", trymax = 100) # Extract the NMDS coordinates nmds_coords <- as.data.frame(nmds_result$points) nmds_coords$genotype <- rownames(nmds_coords) # Plot the NMDS results ggplot(nmds_coords, aes(MDS1, MDS2, label = genotype)) + geom_point() + geom_text(vjust = 1, hjust = 1) + labs(title = "NMDS Ordination of Genotypes Based on Pairwise Differences", x = "NMDS1", y = "NMDS2") + theme_minimal() # hierarchy cluster unique(kmdata$genotype) pairwise_pvalues_matrix <- pairwise_result$p.value #matrix into proper format pairwise_pvalues_matrix[is.na(pairwise_pvalues_matrix)] <- 1 pairwise_pvalues_matrix <- as.matrix(pairwise_pvalues_matrix) pairwise_pvalues_matrix # p-values to a dissimilarity matrix dissimilarity_matrix <- 1 - pairwise_pvalues_matrix # Hierarchical clustering : average linkage hc <- hclust(as.dist(dissimilarity_matrix), method = "average") cluster <- plot(hc, main = "", xlab = "Genotypes", ylab = "Dissimilarity", sub = "", cex = 0.8) print(cluster) png(file=file.path(output_wd, "kaplan_meier_dhw.png"), height = 4000, width = 8000, res = 350) kmcurve_dhw dev.off()