#Author: Francisco Pereira Lobo (franciscolobo@gmail.com) - 2024
#Description: compares how different values of NCT for H. sapiens interfere
#with IPR/GO associated with NCT.

#loading libraries
library(reshape)
library(ggplot2)
library(tidyr)
library(ggpubr)
library(dplyr)
library(scales)
set.seed(42)

# Start from an empty global environment. ls() does not list dot-prefixed
# names, so the .Random.seed created by set.seed() above is not removed.
# NOTE(review): rm(list = ls()) and setwd() make the script depend on the
# caller's session and machine layout; kept because the relative paths below
# rely on them.
rm(list = ls())

# Working directory: must be the directory obtained by uncompressing
# reproducibility.tgz. Every relative path used below is resolved against it.
setwd("~/projects/evolution_of_complexity/docs/biorx_evolution_of_complexity/version1/dataFiles/reproductibility_raw_data/reproducibility/")

# CALANGO output objects; load() restores them under their saved names
load("results/RData/gene2GO.RData")
load("results/RData/homologous2IPR.RData")

# Order in which the annotation databases are placed on the plot axis
plot_order <- c("IPR_homology", "IPR_GO")

# Per-genome metadata (tab-separated file with a header row). The columns are
# renamed by position, so the file is expected to hold exactly these seven.
genome_metadata <- read.table(
  file = "data/metadata/metadata_spp.txt",
  header = TRUE,
  sep = "\t"
)
names(genome_metadata) <- c(
  "Spp", "NCBI_genome_ID", "Group", "Spp2", "ShortSpp",
  "non_redundant_proteome", "NCT"
)


# Gene-level annotation counts, read without a header so the columns arrive as
# V1, V2, V3. The renaming step later in this script labels them genome ID,
# number of annotated genes and database, respectively.
gene_level_annotation_data <- read.table(
  file = "data/metadata/genome2database2gene.txt",
  header = FALSE
)
#gene_level_annotation_data_all <- read.table("database_desc/genome2database2gene_all.txt", header=FALSE)

# V4: proteome size, looked up in the genome metadata by genome ID
# (rows whose V1 has no match in NCBI_genome_ID get NA)
gene_level_annotation_data$V4 <- with(
  genome_metadata,
  non_redundant_proteome[match(gene_level_annotation_data$V1, NCBI_genome_ID)]
)

# V5: annotated genes as a percentage of the proteome size
gene_level_annotation_data$V5 <- with(
  gene_level_annotation_data,
  (V2 / V4) * 100
)

# V6: group label of the genome, looked up the same way as V4
gene_level_annotation_data$V6 <- with(
  genome_metadata,
  Group[match(gene_level_annotation_data$V1, NCBI_genome_ID)]
)

# Group2: the group label with the parenthesised sub-group tags stripped.
# The patterns are applied one after another, in this order, starting from V6.
# NOTE(review): only the "(...)" tag itself is removed; any whitespace that
# surrounded it in the label is left in place.
gene_level_annotation_data$Group2 <- Reduce(
  function(labels, tag_pattern) gsub(tag_pattern, "", labels),
  c("\\(Angiosperm\\)", "\\(Chlorophyta\\)", "\\(Vertebrata\\)", "\\(Invertebrata\\)"),
  gene_level_annotation_data$V6
)

# Lookup table with one 'Set1' ColorBrewer colour per distinct Group2 label.
# I() keeps the colour codes as-is when the data frame is built.
jColors <- local({
  group2_levels <- levels(as.factor(gene_level_annotation_data$Group2))
  data.frame(
    LABEL = group2_levels,
    COLOR = I(RColorBrewer::brewer.pal(length(group2_levels), name = "Set1"))
  )
})

# Same palette as a named vector (label -> colour), with "Protist" forced to
# orange.
col_letters <- setNames(jColors$COLOR, jColors$LABEL)
col_letters["Protist"] <- "#FF7F00"

# Per-row colour taken from jColors.
# NOTE(review): this reads jColors, not col_letters, so the "Protist" override
# above does not reach the Color column -- confirm this is intended.
gene_level_annotation_data$Color <- jColors[["COLOR"]][
  match(gene_level_annotation_data$Group2, jColors[["LABEL"]])
]

# Descriptive names for the eight columns, assigned by position
# (V1..V6, Group2, Color)
names(gene_level_annotation_data) <- c(
  "GenomeID",
  "num_annotated_genes",
  "Database",
  "proteome_size",
  "% Proteome annotation",
  "Group",
  "Group2",
  "Color"
)

# Keep only the rows of the two databases that are plotted
gene_level_annotation_data <- subset(
  gene_level_annotation_data,
  Database %in% c("IPR_GO", "IPR_homology")
)

# Violin plot of "% Proteome annotation" for each annotation database
# (databases ordered as in plot_order, value axis on a log10 scale, axes
# flipped so the databases are listed vertically). A black crossbar marks the
# overall median per database; the coloured, horizontally jittered points are
# the medians computed separately for each Group.
#
# Fixes relative to the previous version:
#  * the last `+` of the chain was hidden behind a comment marker, so
#    theme(legend.position = "none") was evaluated as a detached expression
#    and never applied to the plot; it is now part of the chain;
#  * factor() is called with the full argument name `levels` instead of
#    relying on partial matching of `level`.
p_database_mean_ann_cov_per_group <- ggplot(
  gene_level_annotation_data,
  aes(
    x = factor(Database, levels = plot_order),
    y = `% Proteome annotation`
  )
) +
  geom_violin(scale = "width") +
  scale_y_log10() +
  coord_flip() +
  theme_light() +
  theme(
    axis.title.y = element_blank(),
    plot.margin = unit(c(0, 0, 0, 0), "cm")
  ) +
  stat_summary(fun = median, geom = "crossbar", width = 0.2, color = "black") +
  stat_summary(
    aes(color = Group),
    fun = median,
    geom = "point",
    size = 2,
    position = position_jitter(width = 0.1, height = 0)
  ) +
  # Manual palette, intentionally left disabled as in the original script:
  # scale_color_manual(values=c("#E41A1C", "#377EB8", "#4DAF4A", "#FF7F00")) +
  theme(legend.position = "none")

# Write the figure to a 10 x 5 inch PDF.
# `height` is spelled out in full (the previous `heigh` only worked through
# partial argument matching), and the plot is print()ed explicitly: a bare
# ggplot object is drawn only by top-level auto-printing, which does not
# happen when the script is run through source() with its default settings,
# leaving an empty PDF.
pdf(
  "results/figures/Figure_1_C_Database_annotation_coverage_per_group.pdf",
  width = 10,
  height = 5
)
print(p_database_mean_ann_cov_per_group)
dev.off()
