#Author: Francisco Pereira Lobo (franciscolobo@gmail.com) - 2024
#Description: plots quality metrics for non-redundant proteomes (number of
#loci and BUSCO scores) per major group.

#loading libraries
library(ggplot2)
library(ggrepel)
library(cowplot)
library(tidyr)
library(reshape)
library(ggpubr)
library(dplyr)
library(scales)

#starting from an empty global environment
rm(list = ls())

#Working directory: this path must point to the directory created when the
#file reproducibility.tgz was uncompressed. All other paths used by this
#script are relative to it.
reproducibility_dir <- "~/projects/evolution_of_complexity/docs/biorx_evolution_of_complexity/version1/dataFiles/reproductibility_raw_data/reproducibility/"
setwd(reproducibility_dir)

#order of the major groups along the x axis of every panel
plot_order <- c("Protist", "Fungi", "Plantae", "Metazoa")

#reading the tab-separated genome metadata table (first row is the header)
genome_metadata <- read.table(
  "data/metadata/genome_metadata_all.txt",
  header = TRUE,
  sep = "\t"
)
genome_metadata$Popular_group_name <- as.factor(genome_metadata$Popular_group_name)

#creating major groups: the parenthesised sub-group tags listed below are
#stripped from the popular group names, one pattern at a time
subgroup_patterns <- c(
  "\\(Vertebrata\\)",
  "\\(Invertebrata\\)",
  "\\(Angiosperm\\)",
  "\\(Chlorophyta\\)",
  "\\(Lycophyte\\)",
  "\\(Bryophyte\\)"
)
major_group <- genome_metadata$Popular_group_name
for (subgroup_pattern in subgroup_patterns) {
  major_group <- gsub(subgroup_pattern, "", major_group)
}
genome_metadata$Group <- as.factor(major_group)

#defining group color for plotting
#one row per genome: identifier, major group and short name
df_color <- setNames(
  data.frame(genome_metadata$ID, genome_metadata$Group, genome_metadata$short_name),
  c("genomeID", "group", "shortName")
)

#one 'Set1' ColorBrewer color per major group; I() keeps the hex codes as-is
group_labels <- levels(as.factor(df_color$group))
jColors <- data.frame(
  LABEL = group_labels,
  COLOR = I(RColorBrewer::brewer.pal(length(group_labels), name = "Set1"))
)

# I like my protists orange, thank you :-)
is_protist <- jColors$LABEL == "Protist"
jColors$COLOR[is_protist] <- "#FF7F00"

#looking up the color of each genome through its major group
color_row <- match(genome_metadata$Group, jColors$LABEL)
genome_metadata$color <- jColors$COLOR[color_row]

#defining species to plot names (outliers removed from downstream analyses).
#Each flag column starts as a copy of the species short name and is blanked
#("") wherever the comparison listed for it holds, so only the remaining
#species keep a name to be shown in the corresponding panel.
blank_label_when <- list(
  plot_name_flag_C = genome_metadata$Completeness > 75,
  plot_name_flag_S = genome_metadata$SingleCopy_BUSCOs > 75,
  plot_name_flag_D = genome_metadata$Duplicated_BUSCOs < 25,
  plot_name_flag_F = genome_metadata$Fragmented_BUSCOs < 25,
  plot_name_flag_M = genome_metadata$Missing_BUSCOs < 25
)

#first create all flag columns, in the order listed above
for (flag_column in names(blank_label_when)) {
  genome_metadata[[flag_column]] <- genome_metadata$short_name
}

#then blank the names of the species matching each comparison
for (flag_column in names(blank_label_when)) {
  genome_metadata[[flag_column]][blank_label_when[[flag_column]]] <- ""
}

#adding jitter (fixed seed so the point positions are reproducible)
pos <- position_jitter(width = 0.1, seed = 1)

#creating plots

#proteome size per group: one violin per major group (ordered as in
#plot_order) with the individual proteomes overlaid as jittered black points;
#the y axis is log10-transformed.
NR <- ggplot2::ggplot(
  genome_metadata,
  aes(
    x = factor(Group, levels = plot_order),
    y = NonRedundantProteomeSize,
    fill = color
  )
) +
  geom_violin(alpha = 0.5, scale = "width") +
  geom_point(position = pos, color = "black") +
  scale_y_continuous(trans = "log10") +
  #the fill column already holds hex color codes: draw them literally instead
  #of depending on how the installed ggplot2 treats the column's class
  scale_fill_identity() +
  ggtitle("Non-redundant proteome size") +
  labs(x = "Groups", y = "Number of sequences") +
  theme_light() +
  theme(legend.position = "none")

#Builds one BUSCO quality panel: a violin per major group, jittered points
#(red = species that kept a name in `flag_col`, black = species whose name
#was blanked), repelled name labels for the red points and a dashed
#horizontal line at `threshold`. The y axis is fixed to 0-100.
#  data        - data frame with columns Group, color, `y_col` and `flag_col`
#  y_col       - name of the column plotted on the y axis
#  flag_col    - name of the label column ("" means no label is drawn)
#  title       - panel title
#  threshold   - y position of the dashed line
#  group_order - order of the groups along the x axis
#Returns the ggplot object.
busco_panel <- function(data, y_col, flag_col, title, threshold,
                        group_order = plot_order) {
  labelled <- data[data[[flag_col]] != "", ]
  unlabelled <- data[data[[flag_col]] == "", ]

  #the red points and their labels use the same rows and the same seeded
  #jitter, so every label is anchored on its own point
  jitter <- position_jitter(width = 0.1, seed = 1)

  ggplot2::ggplot(
    data,
    aes(
      x = factor(Group, levels = group_order),
      y = .data[[y_col]],
      fill = color,
      label = .data[[flag_col]]
    )
  ) +
    geom_violin(alpha = 0.5, scale = "width") +
    geom_point(data = labelled, position = jitter, color = "red") +
    geom_point(data = unlabelled, position = jitter, color = "black") +
    geom_text_repel(
      data = labelled,
      position = jitter,
      point.padding = 0,
      min.segment.length = 0,
      seed = 42,
      box.padding = 0.5,
      segment.ncp = 3,
      force = 20
    ) +
    geom_hline(yintercept = threshold, linetype = "dashed") +
    #the fill column already holds hex color codes: draw them literally
    #instead of depending on how the installed ggplot2 treats the column's class
    scale_fill_identity() +
    scale_x_discrete(expand = expansion(mult = 0.25)) +
    ylim(0, 100) +
    ggtitle(title) +
    labs(x = "Groups", y = "Frequency") +
    theme_light() +
    theme(legend.position = "none")
}

#busco completeness
C <- busco_panel(
  genome_metadata, "Completeness", "plot_name_flag_C",
  title = "Completeness", threshold = 75
)

#single-copy BUSCOs
S <- busco_panel(
  genome_metadata, "SingleCopy_BUSCOs", "plot_name_flag_S",
  title = "Single-copy BUSCOs", threshold = 75
)

#duplicated BUSCOs
D <- busco_panel(
  genome_metadata, "Duplicated_BUSCOs", "plot_name_flag_D",
  title = "Duplicated BUSCOs", threshold = 25
)

#fragmented BUSCOs
#NOTE: the name `F` shadows R's `F` shorthand for FALSE for the rest of the
#session; it is kept because the figure assembly passes `F` to plot_grid().
F <- busco_panel(
  genome_metadata, "Fragmented_BUSCOs", "plot_name_flag_F",
  title = "Fragmented BUSCOs", threshold = 25
)

#missing BUSCOs
M <- busco_panel(
  genome_metadata, "Missing_BUSCOs", "plot_name_flag_M",
  title = "Missing BUSCOs", threshold = 25
)

#writing the six panels to a single PDF figure
#the output directory is created when it does not exist yet
figure_path <- "results/figures/Supplementary_Figure_1_B_Proteome_Quality.pdf"
dir.create(dirname(figure_path), recursive = TRUE, showWarnings = FALSE)

cairo_pdf(figure_path, width = 16, height = 12)

#2 columns x 3 rows, panels labelled A-F in reading order
prow <- plot_grid(
  NR, C, S, D, F, M,
  labels = c("A", "B", "C", "D", "E", "F"),
  hjust = -1,
  ncol = 2,
  nrow = 3
)

#explicit print(): a bare `prow` is only drawn by top-level auto-printing,
#which does not happen when this script is run through source(), leaving an
#empty PDF
print(prow)

dev.off()


