#Author: Francisco Pereira Lobo (franciscolobo@gmail.com) - 2024
#Description: Finds molecular convergences (GO terms associated with NCT and 
#independently expanded in complex multicellular plants and metazoans)

library(reshape)
library(ggplot2)
library(tidyr)
library(ggpubr)
library(dplyr)
library(ggvenn)
library(gplots)
library(rrvgo)

# NOTE: the workspace is intentionally not cleared with rm(list = ls()); every
# object used below is (re)assigned by this script, and wiping the caller's
# global environment is a destructive side effect.

####################################Getting Data##########################################

# This path should point to the directory produced by uncompressing the file
# reproducibility.tgz. All relative paths used below are resolved against it.
data_dir <- "~/projects/evolution_of_complexity/docs/biorx_evolution_of_complexity/version1/dataFiles/reproductibility_raw_data/reproducibility/"
if (!dir.exists(data_dir)) {
  stop("Reproducibility directory not found: ", data_dir, call. = FALSE)
}
setwd(data_dir)

# Load the CALANGO output object. The RData file must provide `gene2GO`;
# fail early with a clear message instead of an "object not found" error.
loaded_objects <- load(file = "results/RData/gene2GO.RData")
if (!("gene2GO" %in% loaded_objects)) {
  stop("results/RData/gene2GO.RData does not contain an object named 'gene2GO'",
       call. = FALSE)
}

output <- gene2GO

# Per-species metadata (tab-separated, with a header row); columns are renamed
# to the short names below.
genome_metadata <- read.table("data/metadata/metadata_spp.txt", sep = "\t", header = TRUE)
colnames(genome_metadata) <- c("Spp", "NCBI_genome_ID", "Group", "Spp2", "ShortSpp", "non_redundant_proteome", "NCT")

# Indicator vectors with one entry per GO ID associated with NCT, named by
# GO ID: 1 marks a GO ID flagged as a convergence, 0 otherwise.
# `drop = FALSE` keeps the column names even when there is a single
# significant ID (otherwise the subset collapses to an unnamed vector).
sig_GO_names <- colnames(output$y[, output$sig_IDs, drop = FALSE])

GO_profile_convergences_fungi_metazoa <- rep(0, length(output$sig_IDs))
names(GO_profile_convergences_fungi_metazoa) <- sig_GO_names

GO_profile_convergences_plant_metazoa <- rep(0, length(output$sig_IDs))
names(GO_profile_convergences_plant_metazoa) <- sig_GO_names

# Group labels with the sub-group suffixes removed. They do not depend on the
# GO ID being processed, so they are computed once, outside the loop.
groups_simple <- output$groups
for (suffix in c("(Vertebrata)", "(Invertebrata)", "(Angiosperm)", "(Chlorophyta)")) {
  groups_simple <- gsub(suffix, "", groups_simple, fixed = TRUE)
}
output$groups_simple <- groups_simple

# Iterates through each GO ID associated with NCT. For each one, the
# per-group frequency (count / denominator) is summarised, groups with a
# non-positive median are discarded, and the (at most) three remaining groups
# with the greatest MEAN frequency are kept. If Fungi or angiosperms are among
# them, the GO ID is labelled as a convergence.
# NOTE(review): the ranking uses the mean, while an earlier description of this
# step mentioned the median -- confirm which statistic is intended.
for (fname in output$sig_IDs) {
  tmp <- data.frame(output$x, output$short.name, output$groups,
                    output$groups_simple, output$y[fname],
                    (output$y[fname] / output$denominator))
  colnames(tmp) <- c("cell_type", "short", "group", "group_color", "count", "frequency")

  df <- tmp %>%
    group_by(group) %>%
    summarise(Mean = mean(frequency), Max = max(frequency),
              Min = min(frequency), Median = median(frequency),
              Std = sd(frequency))

  df <- df[order(df$Mean, decreasing = TRUE), ]
  # which() also drops groups whose median is NA instead of keeping NA rows.
  df <- df[which(df$Median > 0), ]
  # head() is safe when no group is left (1:0 would select a spurious row).
  df <- head(df, 3)

  top_groups <- df$group
  if ("Fungi" %in% top_groups) {
    GO_profile_convergences_fungi_metazoa[fname] <- 1
  }
  if ("Plantae(Angiosperm)" %in% top_groups) {
    GO_profile_convergences_plant_metazoa[fname] <- 1
  }
}

# GO IDs flagged as convergences for each pair of lineages.
convergences_fungi_metazoa <- names(GO_profile_convergences_fungi_metazoa[GO_profile_convergences_fungi_metazoa > 0])
convergences_plant_metazoa <- names(GO_profile_convergences_plant_metazoa[GO_profile_convergences_plant_metazoa > 0])

# Only the plant/metazoa convergences are summarised in the figure below.
convergences <- convergences_plant_metazoa
if (length(convergences) == 0) {
  # Fail here with a clear message rather than deep inside rrvgo.
  stop("No plant/metazoa convergent GO terms were found; nothing to plot.",
       call. = FALSE)
}

# Corrected q-values of the significant GO IDs, restricted to the convergent
# ones; the row names carry the GO IDs.
df <- as.data.frame(output$contrasts.corrected[output$sig_IDs])
df$GO_ID <- rownames(df)
df2 <- df[convergences, ]
colnames(df2) <- c("q-value", "GO_ID")

# Scores passed to rrvgo: -log10(q-value), named by GO ID.
scoresShared <- setNames(-log10(df2$`q-value`), df2$GO_ID)
simMatrixShared <- calculateSimMatrix(df2$GO_ID,
                                      orgdb = "org.Hs.eg.db",
                                      ont = "BP",
                                      method = "Rel")

reducedTermsShared <- reduceSimMatrix(simMatrixShared,
                                      scoresShared,
                                      threshold = 0.2,
                                      orgdb = "org.Hs.eg.db")

# Make sure the output directory exists, and always close the PDF device,
# even if plotting fails, so no graphics device is left open.
dir.create("results/figures", recursive = TRUE, showWarnings = FALSE)
pdf(file = "results/figures/Figure_3C.pdf", width = 7, height = 6)
invisible(tryCatch(
  treemapPlot(reducedTermsShared),
  finally = dev.off()
))
