####
#### No.9.2 Visualize tag-jumping (previously termed "index-hopping," but revised)
#### 2022.05.12 revision for Environmental DNA
#### R 4.1.2
####

# Move into the experiment folder unless the session is already inside it
target_wd <- "09_Exp2_exosap"
if (basename(getwd()) != target_wd) {
  setwd(target_wd)
}

# Fix the random seed (for reproduction)
ran.seed <- 1234
set.seed(seed = ran.seed)

# Load library and functions
library(tidyverse); packageVersion("tidyverse") # 1.3.1, 2021.10.16
library(phyloseq); packageVersion("phyloseq") # 1.38.0, 2021.11.18
library(cowplot); packageVersion("cowplot") # 1.1.1, 2021.6.13
library(ggsci); packageVersion("ggsci") # 2.9, 2021.6.13
library(RColorBrewer); packageVersion("RColorBrewer") # 1.1.3, 2022.5.11
theme_set(theme_cowplot())
source("../functions_R/F02_HelperFunctions.R") # Helper function for visualization

# Generate output folder
# The folder is named after this script: "<script name without extension>Out".
# NOTE: the script path is taken from the RStudio source editor, so this
# step requires the script to be run from within RStudio.
od <- basename(rstudioapi::getSourceEditorContext()$path)
# file_path_sans_ext() drops the extension whatever its length
# (".R", ".Rmd", ...) instead of assuming exactly two characters.
(output_folder <- paste0(tools::file_path_sans_ext(od), "Out")); rm(od)
# showWarnings = FALSE: re-running the script must not warn just because
# the folder already exists.
dir.create(output_folder, showWarnings = FALSE)


# ----------------------------------------------- #
#    Load phyloseq object
# ----------------------------------------------- #
# Objects compiled in step 07
ps_all <- readRDS("../07_CompilePhyloseqOut/ps_all.obj")
ps_exp2 <- readRDS("../07_CompilePhyloseqOut/ps_exp2.obj")

# Normalized objects written by step 09_1
ps_exp2_norm <- readRDS("09_1_SummarizeRawReadsOut/ps_exp2_norm.obj")
ps_exp2_norm2 <- readRDS("09_1_SummarizeRawReadsOut/ps_exp2_norm2.obj")

# Same objects without the taxa whose genus is "STDseqs"
ps_exp2_norm_nat <- subset_taxa(ps_exp2_norm, genus != "STDseqs")
ps_exp2_norm2_nat <- subset_taxa(ps_exp2_norm2, genus != "STDseqs")


# ----------------------------------------------- #
#         Visualize pattern: Reads
# ----------------------------------------------- #
# get_palette(n) interpolates n colours from the 8-colour "Paired" palette
get_palette <- colorRampPalette(brewer.pal(8, "Paired"))

# Taxonomic rank used for the per-rank bar plot (f1). It is referenced by
# name below, so changing it here is enough to switch the rank.
target_rank <- "family"
ps_rename <- taxa_name_summarize(ps_exp2_norm_nat, target_rank, top_taxa_n = 10)
ps_m1 <- speedyseq::psmelt(ps_rename)

# Sum abundance per sample x target rank (ps_m2) and per sample x rep_tax (ps_m3)
ps_m2 <- stats::aggregate(ps_m1$Abundance,
                          by = list(ps_m1$Sample, ps_m1[[target_rank]]), "sum")
ps_m3 <- stats::aggregate(ps_m1$Abundance,
                          by = list(ps_m1$Sample, ps_m1$rep_tax), "sum")
colnames(ps_m2) <- c("sample", target_rank, "abundance")
colnames(ps_m3) <- c("sample", "rep_tax", "abundance")

# Number of fill colours for f1: at least 29 (the value used so far, so
# existing colours do not change), more if the chosen rank has more groups
# (a manual scale with too few values fails when the plot is drawn).
n_rank_colours <- max(29, length(unique(ps_m2[[target_rank]])))

# Figures
# NOTE(review): f1 and f2 plot the same abundance values but are labelled
# "Sequence reads" and "Relative abundance" -- confirm which unit
# ps_exp2_norm actually holds.
f1 <- ggplot(ps_m2, aes(x = sample, y = abundance,
                        group = .data[[target_rank]],
                        fill = .data[[target_rank]])) +
  geom_bar(stat = "identity", colour = NA) +
  theme(axis.text.x = element_text(angle = -90, hjust = 1, vjust = 0.5, size = 6)) +
  scale_fill_manual(values = get_palette(n_rank_colours)) +
  #scale_fill_igv() +
  labs(fill = target_rank) + # legend title = name of the rank
  xlab(NULL) + ylab("Sequence reads") +
  NULL
f2 <- ggplot(ps_m3, aes(x = sample, y = abundance, group = rep_tax, fill = rep_tax)) +
  geom_bar(stat = "identity", colour = NA) +
  theme(axis.text.x = element_text(angle = -90, hjust = 1, vjust = 0.5, size = 6)) +
  xlab(NULL) + ylab("Relative abundance") +
  # 11 colours; presumably top_taxa_n (10) + one pooled class -- TODO confirm
  scale_fill_manual(values = get_palette(11)) +
  #scale_fill_igv() + #scale_fill_viridis(discrete = T)
  NULL


# ----------------------------------------------- #
#         Visualize index-hopping
# ----------------------------------------------- #
# Layers shared by h1-h4: stacked bars per replicate, one panel per
# time x temperature x purification combination. Only the y-axis label
# differs between the plots. A fresh list is built on every call.
ih_bar_layers <- function(y_label) {
  list(
    geom_bar(stat = "identity", colour = NA),
    theme(axis.text.x = element_text(angle = -90, hjust = 1, vjust = 0.5, size = 6)),
    facet_wrap(~ time + temperature + purification_after_1st_pcr),
    scale_fill_brewer("family", palette = "Paired"),
    xlab(NULL),
    ylab(y_label),
    panel_border()
  )
}

## Sample reads
h1 <- ggplot(filter(ps_m1, sample_nc == "sample"),
             aes(x = replicate, y = Abundance, fill = rep_tax)) +
  ih_bar_layers("Sequence reads")

## Possible index-hopping
# Everything that is not labelled "sample"
h2 <- ggplot(filter(ps_m1, sample_nc != "sample"),
             aes(x = replicate, y = Abundance * 100, fill = rep_tax)) +
  ih_bar_layers("Proportion of sequence reads\ngenerated by index hopping (%)")

# Only rows with sample_nc == "none"
h3 <- ggplot(filter(ps_m1, sample_nc == "none"),
             aes(x = replicate, y = Abundance * 100, fill = rep_tax)) +
  ih_bar_layers("Proportion of sequence reads\ngenerated by index hopping (%)")

# Only rows with index_hopping2 == "IH_w_S01"
h4 <- ggplot(filter(ps_m1, index_hopping2 == "IH_w_S01"),
             aes(x = replicate, y = Abundance * 100, fill = rep_tax)) +
  ih_bar_layers("Proportion of sequence reads\ngenerated by index hopping (%)") +
  ggtitle("Index-hopping (one index switched)")


# ----------------------------------------------- #
# Visualize index-hopping: jitter plot, all OTU combined
# ----------------------------------------------- #
# Merge every taxon into a single one, melt to long format, keep the
# sample metadata needed below, and turn the design variables into factors
# (they are combined with ":" in the plot aesthetics).
ps_m4 <- merge_taxa(ps_exp2_norm_nat,
                    eqtaxa = taxa_names(ps_exp2_norm_nat),
                    archetype = 1) %>%
  speedyseq::psmelt() %>%
  select(OTU, Sample, Abundance,
         I7_Index_ID, I5_Index_ID, I7_Index_ID2, I5_Index_ID2,
         description, test_category, sample_nc,
         temperature, time, site, purification_after_1st_pcr,
         replicate, index_hopping, index_hopping2) %>%
  tibble() %>%
  mutate(across(c(purification_after_1st_pcr, temperature, time), factor))

# Box plot + jittered points of the "IH_w_S01" rows only
ps_m4_ih <- filter(ps_m4, index_hopping2 == "IH_w_S01")
j1 <- ggplot(ps_m4_ih,
             aes(x = time, y = Abundance * 100,
                 group = purification_after_1st_pcr:temperature:time,
                 fill = purification_after_1st_pcr:temperature)) +
  geom_boxplot(outlier.size = 0, outlier.shape = NA, alpha = 0.7) +
  geom_point(pch = 16, color = "gray20", position = position_jitterdodge()) +
  scale_fill_manual(name = "Purification and\ntemperature",
                    values = c("royalblue", "skyblue", "red3", "darkred")) +
  ylab("Proportion of sequence reads for each sample\ngenerated by index hopping (%)") +
  xlab("Time after pooling (min)") +
  ylim(0, 1)
j1

# Check data
# Mean, maximum and SD (in %) per purification x temperature x time
j1$data %>%
  select(Abundance, purification_after_1st_pcr, temperature, time) %>%
  group_by(purification_after_1st_pcr, temperature, time) %>%
  summarize(mean_IH = mean(Abundance * 100),
            max_IH = max(Abundance * 100),
            sd_IH = sd(Abundance * 100))
# Overall mean and SD (in %)
mean(j1$data$Abundance * 100)
sd(j1$data$Abundance * 100)

## Brief statistics
# Raw-read object (ps_exp2) with all taxa merged into one, in long format
ps_m5 <- ps_exp2 %>%
  merge_taxa(eqtaxa = taxa_names(.), archetype = 1) %>%
  speedyseq::psmelt() %>%
  select(OTU, Sample, Abundance, I7_Index_ID, I5_Index_ID,
         I7_Index_ID2, I5_Index_ID2, description, test_category,
         sample_nc, temperature, time, site, purification_after_1st_pcr,
         replicate, index_hopping, index_hopping2) %>%
  tibble()
# Largest read count observed within each test category
ps_m5_cat_max <- ps_m5 %>%
  group_by(test_category) %>%
  summarize(max_reads = max(Abundance)) %>%
  data.frame()
# Keep the "IH_w_S01" rows and attach the maximum of their category
ps_m5_df <- ps_m5 %>% filter(index_hopping2 == "IH_w_S01") %>% data.frame()
ps_m5_df$max_reads <- ps_m5_cat_max[match(ps_m5_df$test_category,
                                          ps_m5_cat_max$test_category), "max_reads"]
# Columns are renamed by name (not by position) so the result does not
# depend on the column order. as.character() is applied before as.numeric()
# so that a factor `time` yields its values rather than its level codes.
ps_m5_df <- ps_m5_df %>%
  select(Sample, reads = Abundance, max_reads,
         purif = purification_after_1st_pcr, temp = temperature, time) %>%
  arrange(Sample) %>%
  mutate(purif = factor(purif),
         temp = factor(temp),
         time = as.numeric(as.character(time)))

# Perform GLM
# Binomial GLM with main effects and all two-way interactions.
# NOTE(review): binomial() reads a two-column response as
# cbind(successes, failures); here the second column is max_reads itself,
# i.e. the modelled proportion is reads / (reads + max_reads). Confirm this
# is intended rather than cbind(reads, max_reads - reads).
glm_m5 <- glm(cbind(reads, max_reads) ~ purif + time + temp +
                purif:time + purif:temp + time:temp,
              data = ps_m5_df, family = binomial(link = "logit"))
# Summary reported so far, kept unchanged for continuity.
# NOTE(review): aov() refits the model frame through lm(), so this table
# appears to be an ordinary least-squares ANOVA of the response columns and
# not a test of the binomial model -- confirm before citing it.
summary(aov(glm_m5))
# Sequential analysis of deviance of the binomial GLM itself
# (terms are tested in the order they appear in the formula).
anova(glm_m5, test = "Chisq")


# ----------------------------------------------- #
# Visualize index-hopping: jitter plot, each OTU separated
# ----------------------------------------------- #
# Probability of index hopping event per OTU
# Long-format table of the "IH_w_S01" rows with a non-missing abundance;
# the design variables become factors for the ":" interactions below.
ps_m6 <- speedyseq::psmelt(ps_exp2_norm2_nat) %>%
  filter(index_hopping2 == "IH_w_S01") %>%
  select(OTU, Sample, Abundance,
         I7_Index_ID, I5_Index_ID, I7_Index_ID2, I5_Index_ID2,
         description, test_category, sample_nc,
         temperature, time, site, purification_after_1st_pcr,
         replicate, index_hopping, index_hopping2) %>%
  filter(!is.na(Abundance)) %>%
  tibble() %>%
  mutate(across(c(purification_after_1st_pcr, temperature, time), factor))

# One point per OTU x sample, log10 y-axis
ih_colours <- c("royalblue", "skyblue", "red3", "darkred")
j2 <- ggplot(ps_m6,
             aes(x = time, y = Abundance * 100,
                 group = purification_after_1st_pcr:temperature:time,
                 color = purification_after_1st_pcr:temperature,
                 fill = purification_after_1st_pcr:temperature)) +
  geom_point(pch = 19, size = 2, alpha = 0.8,
             position = position_jitterdodge(jitter.width = 0.2)) +
  scale_color_manual(name = "Purification and\ntemperature", values = ih_colours) +
  scale_fill_manual(name = "Purification and\ntemperature", values = ih_colours) +
  ylab("Proportion of sequence reads for each OTU\ngenerated by index hopping (%)") +
  scale_y_log10() +
  xlab("Time after pooling (min)")
j2

# Check data
# Mean, SD, maximum (in %) and number of non-zero values per treatment
j2$data %>%
  select(Abundance, purification_after_1st_pcr, temperature, time) %>%
  group_by(purification_after_1st_pcr, temperature, time) %>%
  summarize(mean_IH = mean(Abundance * 100),
            sd_IH = sd(Abundance * 100),
            max_IH = max(Abundance * 100),
            n_IH = sum(Abundance > 0))
# Distribution of the values (in %) for the purification == "none" rows
ih_vals_OTU_none <- 100 * pull(filter(j2$data, purification_after_1st_pcr == "none"),
                               Abundance)
hist(ih_vals_OTU_none)

# Count the number of index hopping events
# ih_happened is 1 when the OTU has a non-zero abundance in the sample
ps_m6 <- mutate(ps_m6, ih_happened = as.numeric(Abundance > 0))
ih_event_summary <- ps_m6 %>%
  select(ih_happened, OTU, purification_after_1st_pcr, temperature, time) %>%
  group_by(purification_after_1st_pcr, temperature, time) %>%
  summarize(ih_event = sum(ih_happened),
            total_n = n(),
            ih_event_ratio = ih_event / total_n)

## Brief statistics
# Raw-read object (ps_exp2) in long format, one row per OTU x sample
ps_m7 <- ps_exp2 %>%
  speedyseq::psmelt() %>%
  select(OTU, Sample, Abundance, I7_Index_ID, I5_Index_ID,
         I7_Index_ID2, I5_Index_ID2, description, test_category,
         sample_nc, temperature, time, site, purification_after_1st_pcr,
         replicate, index_hopping, index_hopping2) %>%
  tibble()
# Identify OTU reads in each category
# Largest read count of each OTU among the "sample" rows of each category
ps_m7_cat_max <- ps_m7 %>% filter(sample_nc == "sample") %>%
  group_by(test_category, OTU) %>%
  summarize(max_reads = max(Abundance), .groups = "drop") %>%
  data.frame()
ps_m7_cat_max$test_category_OTU <- paste0(ps_m7_cat_max$test_category, "_", ps_m7_cat_max$OTU)
ps_m7_df <- ps_m7 %>% filter(index_hopping2 == "IH_w_S01") %>% data.frame()
ps_m7_df$test_category_OTU <- paste0(ps_m7_df$test_category, "_", ps_m7_df$OTU)
# Assign max reads of each category x OTU
ps_m7_df$max_reads <- ps_m7_cat_max[match(ps_m7_df$test_category_OTU,
                                          ps_m7_cat_max$test_category_OTU), "max_reads"]
# Set-up data frame
# Rows without a positive max_reads are dropped. Columns are renamed by
# name (not by position). as.character() is applied before as.numeric() so
# that a factor `time` yields its values rather than its level codes.
ps_m7_df <- ps_m7_df %>%
  select(Sample, reads = Abundance, max_reads,
         purif = purification_after_1st_pcr, temp = temperature, time) %>%
  arrange(Sample) %>%
  filter(max_reads > 0) %>%
  mutate(purif = factor(purif),
         temp = factor(temp),
         time = as.numeric(as.character(time)))

# Perform GLM
# Binomial GLMs: (1) main effects + two-way interactions, (2) full factorial.
# NOTE(review): binomial() reads a two-column response as
# cbind(successes, failures); here the second column is max_reads itself,
# i.e. the modelled proportion is reads / (reads + max_reads). Confirm this
# is intended rather than cbind(reads, max_reads - reads).
glm_m7_2way <- glm(cbind(reads, max_reads) ~ purif + time + temp +
                     purif:time + purif:temp + time:temp,
                   data = ps_m7_df, family = binomial(link = "logit"))
glm_m7_full <- glm(cbind(reads, max_reads) ~ purif * time * temp,
                   data = ps_m7_df, family = binomial(link = "logit"))
# Summaries reported so far, kept unchanged for continuity.
# NOTE(review): aov() refits the model frame through lm(), so these tables
# appear to be ordinary least-squares ANOVAs of the response columns and
# not tests of the binomial models -- confirm before citing them.
summary(aov(glm_m7_2way))
summary(aov(glm_m7_full))
# Sequential analysis of deviance of the binomial GLMs themselves
# (terms are tested in the order they appear in the formula).
anova(glm_m7_2way, test = "Chisq")
anova(glm_m7_full, test = "Chisq")


# ----------------------------------------------- #
#         Save data
# ----------------------------------------------- #
# Re-output data
# OTU table, sample data and taxonomy table of both normalized objects,
# written as <table>_norm.csv / <table>_norm2.csv into the output folder.
ps_to_export <- list(norm = ps_exp2_norm, norm2 = ps_exp2_norm2)
for (ps_suffix in names(ps_to_export)) {
  ps_obj <- ps_to_export[[ps_suffix]]
  write.csv(otu_table(ps_obj),
            sprintf("%s/otu_table_%s.csv", output_folder, ps_suffix))
  write.csv(sample_data(ps_obj),
            sprintf("%s/sample_data_%s.csv", output_folder, ps_suffix))
  write.csv(as.data.frame(tax_table(ps_obj)),
            sprintf("%s/tax_table_%s.csv", output_folder, ps_suffix))
}

# Save figure objects
fig_dir <- "../FigCode/00_RawFigs/"
summary_fig_file <- paste0(fig_dir, "9_2_Fig_Exp2_SummaryReads.obj")
ih_rate_fig_file <- paste0(fig_dir, "9_2_Fig_Exp2_IndexHoppingRate.obj")
# Create the figure folder (and its parent) when missing; saveRDS() cannot
# write into a folder that does not exist. No-op if it is already there.
dir.create(dirname(summary_fig_file), recursive = TRUE, showWarnings = FALSE)
saveRDS(list(f2, h1, h4), summary_fig_file)
saveRDS(list(j1, j2), ih_rate_fig_file)

# Save session info
# File name: <output folder>_<YYYY-MM-DD>.txt
session_info_file <- paste0("../00_SessionInfo/", output_folder, "_",
                            substr(Sys.time(), 1, 10), ".txt")
dir.create(dirname(session_info_file), recursive = TRUE, showWarnings = FALSE)
writeLines(capture.output(sessionInfo()), session_info_file)

