#Author: Francisco Pereira Lobo (franciscolobo@gmail.com) - 2024
#Description: Compare annotation-term prevalence across organism groups for
#homology and functional-annotation data

#loading libraries
library(reshape)
library(ggplot2)
library(tidyr)
library(ggpubr)
library(dplyr)

# Start from an empty workspace
rm(list = ls())

#################################### Getting Data ####################################

# The working directory must be the folder created when reproducibility.tgz
# was uncompressed.
setwd("~/projects/evolution_of_complexity/docs/biorx_evolution_of_complexity/version1/dataFiles/reproductibility_raw_data/reproducibility/")

# CALANGO output objects (the rest of the script expects `gene2GO` and
# `homologous2IPR` to exist after these calls)
load(file = "results/RData/gene2GO.RData")
load(file = "results/RData/homologous2IPR.RData")

# Per-genome metadata, tab-separated with a header line
genome_metadata <- read.table(
  "data/metadata/metadata_spp.txt",
  sep = "\t",
  header = TRUE
)
names(genome_metadata) <- c(
  "Spp", "NCBI_genome_ID", "Group", "Spp2", "ShortSpp",
  "non_redundant_proteome", "NCT"
)

# Collapse metazoan and plant sub-groups: the parenthesised sub-group tags are
# removed one after another from the Group labels.
sub_group_tags <- c(
  "\\(Vertebrata\\)",
  "\\(Invertebrata\\)",
  "\\(Angiosperm\\)",
  "\\(Chlorophyta\\)"
)
genome_metadata$Popular_group_name_small <- as.factor(
  Reduce(
    function(labels, tag) gsub(tag, "", labels),
    sub_group_tags,
    genome_metadata$Group
  )
)
rm(sub_group_tags)

# Presence/absence and overall prevalence for all homologous regions ----
# homologous2IPR$y is binarised and transposed: the resulting matrix has one
# row per annotation term and one column per genome.
genome2IPRterms <- homologous2IPR$y
genome2IPRterms_2 <- t(ifelse(as.matrix(genome2IPRterms) > 0, 1, 0))

# Fraction of genomes in which each term occurs at least once
IPR_term_prevalence <- rowSums(genome2IPRterms_2) / ncol(genome2IPRterms_2)

# Term presence per organism group ----
# group2IPRterms[term, group] is 1 when the term occurs in at least one genome
# of that group and 0 otherwise.
group_names <- c("Metazoa", "Fungi", "Plantae", "Protist")
group2IPRterms <- as.data.frame(
  matrix(
    0,
    nrow = nrow(genome2IPRterms_2),
    ncol = length(group_names),
    dimnames = list(rownames(genome2IPRterms_2), group_names)
  )
)

for (group in group_names) {
  # as.character() guarantees the genomes are looked up by column name even if
  # the ID column was read as a factor.
  genome_IDs <- as.character(
    genome_metadata$NCBI_genome_ID[
      genome_metadata$Popular_group_name_small %in% group
    ]
  )
  # drop = FALSE keeps a matrix when a group holds a single genome
  regions <- genome2IPRterms_2[, genome_IDs, drop = FALSE]
  # Vectorised replacement for the former per-term loop (which also printed one
  # progress line per term).
  group2IPRterms[[group]] <- as.numeric(rowSums(regions > 0) > 0)
}

# Lineage-exclusive versus shared terms (all homologous regions) ----
# Only these three groups are compared; the Protist column is left out.
groups <- c("Metazoa", "Fungi", "Plantae")
tmp <- group2IPRterms[, groups]

# Discard terms that are absent from every compared group
tmp <- tmp[rowSums(tmp == 0) < length(groups), ]

# "Shared" marks annotation terms present in more than one organism group
tmp$Shared <- as.numeric(rowSums(tmp[, groups] > 0) > 1)

# A shared term must not also be counted as lineage-exclusive, so its
# per-group flags are cleared (single vectorised assignment instead of a
# row-by-row loop that printed a counter for every term).
tmp[tmp$Shared == 1, groups] <- 0

# Plot table for all homologous regions: prevalence joined to group class ----
df_final <- data.frame(IPR_term_prevalence)
colnames(df_final) <- "Prevalence"
common <- intersect(rownames(df_final), rownames(tmp))
# Named numeric vector; merge() turns it into a one-column data frame whose
# column is called "x" and whose row names are the term IDs.
df_final_tmp <- df_final[common, "Prevalence"]
names(df_final_tmp) <- common
group2terms_tmp <- tmp[common, ]

# Join on row names (by = 0), then reshape to one row per term/class pair.
# `groups` is set explicitly here instead of inside the melt() call.
groups <- c("Metazoa", "Fungi", "Plantae", "Shared")
merged_homology_IPR <- merge(
  df_final_tmp,
  as.data.frame(group2terms_tmp),
  by = 0,
  all = TRUE
)
merged_homology_IPR_melted <- melt(
  merged_homology_IPR,
  id.vars = c("x", "Row.names"),
  measure.vars = groups
)
colnames(merged_homology_IPR_melted) <- c(
  "Prevalence", "ID", "Group", "GroupCount"
)

# Fraction of distinct terms falling in each class (exclusive to one group, or
# shared). Names are the output columns, values the labels found in `Group`.
n_terms <- length(unique(merged_homology_IPR_melted$ID))
class_of_column <- c(
  Metazoa = "Metazoa", Plant = "Plantae", Fungi = "Fungi", Shared = "Shared"
)
count_per_group <- as.data.frame(
  lapply(class_of_column, function(class_label) {
    in_class <- merged_homology_IPR_melted$Group == class_label &
      merged_homology_IPR_melted$GroupCount == 1
    sum(in_class) / n_terms
  })
)
rownames(count_per_group) <- "Homology"

# Relative share of each class within every prevalence value. After
# summarise() the table is still grouped by Prevalence, so sum(n) is taken
# per prevalence value.
merged_homology_IPR_melted <- merged_homology_IPR_melted %>%
  group_by(Prevalence, Group) %>%
  summarise(n = sum(GroupCount)) %>%
  mutate(percentage = n / sum(n))

# Facet labels. A scalar is assigned directly: the former
# rep(..., length(dim(x)[1])) always repeated exactly once and only worked
# through recycling.
merged_homology_IPR_melted$cat1 <- "Homology"
merged_homology_IPR_melted$cat2 <- "All"

#p_hom_all <- ggplot(data = merged_homology_all_melted, aes(x=prevalence, y=percentage, fill=Group)) + geom_area(alpha=0.6 , size=0.1, colour="black") #+ theme(axis.title.x=element_blank(), axis.text.x=element_blank(), axis.title.y=element_blank())  + theme(plot.margin = unit(c(0,0,0,0), "cm")) + labs(title="Homology_all")


#####################################################################
###Computing prevalence for GO terms in groups, gene-level annotation data

# GO terms, gene-level annotation: presence/absence and prevalence ----
# Keep only GO terms whose gene2GO$sum is at least 1 (found at least once)
genome2IPRGOterms <- gene2GO$y[gene2GO$sum >= 1]

# Binarise and transpose: one row per GO term, one column per genome
genome2IPRGOterms_2 <- t(ifelse(as.matrix(genome2IPRGOterms) > 0, 1, 0))

# Fraction of genomes in which each GO term occurs at least once
IPRGO_term_prevalence <-
  rowSums(genome2IPRGOterms_2) / ncol(genome2IPRGOterms_2)

# group2IPRGOterms[term, group] is 1 when the term occurs in at least one
# genome of that group and 0 otherwise.
group_names <- c("Metazoa", "Fungi", "Plantae", "Protist")
group2IPRGOterms <- as.data.frame(
  matrix(
    0,
    nrow = nrow(genome2IPRGOterms_2),
    ncol = length(group_names),
    dimnames = list(rownames(genome2IPRGOterms_2), group_names)
  )
)

for (group in group_names) {
  # as.character() guarantees the genomes are looked up by column name even if
  # the ID column was read as a factor.
  genome_IDs <- as.character(
    genome_metadata$NCBI_genome_ID[
      genome_metadata$Popular_group_name_small %in% group
    ]
  )
  # drop = FALSE keeps a matrix when a group holds a single genome
  regions <- genome2IPRGOterms_2[, genome_IDs, drop = FALSE]
  # Vectorised replacement for the former per-term loop (which also printed one
  # progress line per term).
  group2IPRGOterms[[group]] <- as.numeric(rowSums(regions > 0) > 0)
}

# Lineage-exclusive versus shared GO terms (all terms) ----
# Only these three groups are compared; the Protist column is left out.
groups <- c("Metazoa", "Fungi", "Plantae")
tmp <- group2IPRGOterms[, groups]

# Discard terms that are absent from every compared group
tmp <- tmp[rowSums(tmp == 0) < length(groups), ]

# "Shared" marks terms present in more than one organism group
tmp$Shared <- as.numeric(rowSums(tmp[, groups] > 0) > 1)

# A shared term must not also be counted as lineage-exclusive, so its
# per-group flags are cleared in one vectorised assignment.
tmp[tmp$Shared == 1, groups] <- 0

# Plot table for all GO terms: prevalence joined to group class ----
df_final <- data.frame(IPRGO_term_prevalence)
common <- intersect(rownames(df_final), rownames(tmp))
# Named numeric vector; merge() turns it into a one-column data frame whose
# column is called "x" and whose row names are the term IDs.
df_final_tmp <- df_final[common, 1]
names(df_final_tmp) <- common
group2IPRGOterms_tmp <- tmp[common, ]

# Join on row names (by = 0), then reshape to one row per term/class pair.
# `groups` is set explicitly here instead of inside the melt() call.
groups <- c("Metazoa", "Fungi", "Plantae", "Shared")
merged_IPRGO_all <- merge(
  df_final_tmp,
  as.data.frame(group2IPRGOterms_tmp),
  by = 0,
  all = TRUE
)
merged_IPRGO_all_melted <- melt(
  merged_IPRGO_all,
  id.vars = c("x", "Row.names"),
  measure.vars = groups
)
colnames(merged_IPRGO_all_melted) <- c(
  "Prevalence", "ID", "Group", "GroupCount"
)

# Fraction of distinct GO terms falling in each class (exclusive to one group,
# or shared). Names are the output columns, values the labels in `Group`.
n_terms <- length(unique(merged_IPRGO_all_melted$ID))
class_of_column <- c(
  Metazoa = "Metazoa", Plant = "Plantae", Fungi = "Fungi", Shared = "Shared"
)
count_per_group <- rbind(
  count_per_group,
  as.data.frame(
    lapply(class_of_column, function(class_label) {
      in_class <- merged_IPRGO_all_melted$Group == class_label &
        merged_IPRGO_all_melted$GroupCount == 1
      sum(in_class) / n_terms
    })
  )
)
rownames(count_per_group) <- c("Homology", "GO")

# Relative share of each class within every prevalence value. After
# summarise() the table is still grouped by Prevalence, so sum(n) is taken
# per prevalence value.
merged_IPRGO_all_melted <- merged_IPRGO_all_melted %>%
  group_by(Prevalence, Group) %>%
  summarise(n = sum(GroupCount)) %>%
  mutate(percentage = n / sum(n))

# Facet labels. A scalar is assigned directly: the former
# rep(..., length(dim(x)[1])) always repeated exactly once and only worked
# through recycling.
merged_IPRGO_all_melted$cat1 <- "GO"
merged_IPRGO_all_melted$cat2 <- "All"

#p_GO_all <- ggplot(merged_GO_all_melted, aes(x=prevalence, y=percentage, fill=Group)) + geom_area(alpha=0.6 , size=0.1, colour="black")  #+ theme(axis.title.x=element_blank(), axis.text.x=element_blank(), axis.title.y=element_blank())  + theme(plot.margin = unit(c(0,0,0,0), "cm")) + labs(title="GO_all")


#getting significant annotation terms
#significant_homologous_regions_IPR <- intersect(names(homologous2IPR$contrasts.corrected[homologous2IPR$contrasts.corrected < 0.05]), names(homologous2IPR$sum[homologous2IPR$sum > 1]))
#significant_GO_IPR <- intersect(names(gene2GO$contrasts.corrected[gene2GO$contrasts.corrected < 0.05]), names(gene2GO$sum[gene2GO$sum > 1]))

# IDs of the significant annotation terms, as stored in the CALANGO objects
significant_homologous_regions_IPR <- homologous2IPR$sig_IDs
significant_GO_IPR <- gene2GO$sig_IDs

# Presence/absence and prevalence for significant homologous regions ----
# drop = FALSE and the matrix-based binarisation keep a proper term x genome
# matrix even when only one term is significant.
genome2IPRterms_sig <-
  genome2IPRterms[, significant_homologous_regions_IPR, drop = FALSE]
genome2IPRterms_2_sig <- t(ifelse(as.matrix(genome2IPRterms_sig) > 0, 1, 0))

# Fraction of genomes in which each significant term occurs at least once
IPRterm_prevalence_sig <-
  rowSums(genome2IPRterms_2_sig) / ncol(genome2IPRterms_2_sig)

# group2IPRterms_sig[term, group] is 1 when the term occurs in at least one
# genome of that group and 0 otherwise.
group_names <- c("Metazoa", "Fungi", "Plantae", "Protist")
group2IPRterms_sig <- as.data.frame(
  matrix(
    0,
    nrow = nrow(genome2IPRterms_2_sig),
    ncol = length(group_names),
    dimnames = list(rownames(genome2IPRterms_2_sig), group_names)
  )
)

# Filled group by group with rowSums() instead of visiting every
# genome/term cell with a scalar data-frame assignment.
for (group in group_names) {
  genome_IDs <- as.character(
    genome_metadata$NCBI_genome_ID[
      genome_metadata$Popular_group_name_small %in% group
    ]
  )
  regions <- genome2IPRterms_2_sig[, genome_IDs, drop = FALSE]
  group2IPRterms_sig[[group]] <- as.numeric(rowSums(regions > 0) > 0)
}

# Lineage-exclusive versus shared terms (significant homologous regions) ----
# Only these three groups are compared; the Protist column is left out.
groups <- c("Metazoa", "Fungi", "Plantae")
tmp <- group2IPRterms_sig[, groups]

# Discard terms that are absent from every compared group
tmp <- tmp[rowSums(tmp == 0) < length(groups), ]

# "Shared" marks terms present in more than one organism group
tmp$Shared <- as.numeric(rowSums(tmp[, groups] > 0) > 1)

# A shared term must not also be counted as lineage-exclusive, so its
# per-group flags are cleared in one vectorised assignment.
tmp[tmp$Shared == 1, groups] <- 0

# Plot table for significant homologous regions: prevalence + group class ----
df_final <- data.frame(IPRterm_prevalence_sig)
colnames(df_final) <- "Prevalence"
common <- intersect(rownames(df_final), rownames(tmp))
# Named numeric vector; merge() turns it into a one-column data frame whose
# column is called "x" and whose row names are the term IDs.
df_final_tmp <- df_final[common, "Prevalence"]
names(df_final_tmp) <- common
group2IPRterms_tmp <- tmp[common, ]

# Join on row names (by = 0), then reshape to one row per term/class pair.
# `groups` is set explicitly here instead of inside the melt() call.
groups <- c("Metazoa", "Fungi", "Plantae", "Shared")
merged_homology_IPR_sig <- merge(
  df_final_tmp,
  as.data.frame(group2IPRterms_tmp),
  by = 0,
  all = TRUE
)
merged_homology_IPR_melted_sig <- melt(
  merged_homology_IPR_sig,
  id.vars = c("x", "Row.names"),
  measure.vars = groups
)
colnames(merged_homology_IPR_melted_sig) <- c(
  "Prevalence", "ID", "Group", "GroupCount"
)

# Fraction of distinct terms falling in each class (exclusive to one group, or
# shared). Names are the output columns, values the labels found in `Group`.
n_terms <- length(unique(merged_homology_IPR_melted_sig$ID))
class_of_column <- c(
  Metazoa = "Metazoa", Plant = "Plantae", Fungi = "Fungi", Shared = "Shared"
)
count_per_group_sig <- as.data.frame(
  lapply(class_of_column, function(class_label) {
    in_class <- merged_homology_IPR_melted_sig$Group == class_label &
      merged_homology_IPR_melted_sig$GroupCount == 1
    sum(in_class) / n_terms
  })
)
rownames(count_per_group_sig) <- "Homology"

# Relative share of each class within every prevalence value. After
# summarise() the table is still grouped by Prevalence, so sum(n) is taken
# per prevalence value.
merged_homology_IPR_melted_sig <- merged_homology_IPR_melted_sig %>%
  group_by(Prevalence, Group) %>%
  summarise(n = sum(GroupCount)) %>%
  mutate(percentage = n / sum(n))

# Facet labels. A scalar is assigned directly: the former
# rep(..., length(dim(x)[1])) always repeated exactly once and only worked
# through recycling.
merged_homology_IPR_melted_sig$cat1 <- "Homology"
merged_homology_IPR_melted_sig$cat2 <- "Significant"

#p_hom_IPR <- ggplot(data = merged_homology_IPR_melted_sig, aes(x=Prevalence, y=percentage, fill=Group)) + geom_area(alpha=0.6 , size=0.1, colour="black") #+ theme(axis.title.x=element_blank(), axis.text.x=element_blank(), axis.title.y=element_blank())  + theme(plot.margin = unit(c(0,0,0,0), "cm")) + labs(title="Homology_all")

#################

#####################################################################
###Computing prevalence for GO terms in groups, gene-level annotation data

# Significant GO terms: presence/absence and prevalence ----
# Keep GO terms whose gene2GO$sum is at least 1, then only the significant ones.
# drop = FALSE and the matrix-based binarisation keep a proper term x genome
# matrix even when only one term is significant.
genome2IPRGOterms_sig <- gene2GO$y[gene2GO$sum >= 1]
genome2IPRGOterms_sig <-
  genome2IPRGOterms_sig[, significant_GO_IPR, drop = FALSE]
genome2IPRGOterms_2_sig <-
  t(ifelse(as.matrix(genome2IPRGOterms_sig) > 0, 1, 0))

# Fraction of genomes in which each significant GO term occurs at least once
IPRGO_term_prevalence_sig <-
  rowSums(genome2IPRGOterms_2_sig) / ncol(genome2IPRGOterms_2_sig)

# group2IPRGOterms_sig[term, group] is 1 when the term occurs in at least one
# genome of that group and 0 otherwise.
group_names <- c("Metazoa", "Fungi", "Plantae", "Protist")
group2IPRGOterms_sig <- as.data.frame(
  matrix(
    0,
    nrow = nrow(genome2IPRGOterms_2_sig),
    ncol = length(group_names),
    dimnames = list(rownames(genome2IPRGOterms_2_sig), group_names)
  )
)

# Filled group by group with rowSums() instead of visiting every
# genome/term cell with a scalar data-frame assignment.
for (group in group_names) {
  genome_IDs <- as.character(
    genome_metadata$NCBI_genome_ID[
      genome_metadata$Popular_group_name_small %in% group
    ]
  )
  regions <- genome2IPRGOterms_2_sig[, genome_IDs, drop = FALSE]
  group2IPRGOterms_sig[[group]] <- as.numeric(rowSums(regions > 0) > 0)
}

# Lineage-exclusive versus shared GO terms (significant terms) ----
# Only these three groups are compared; the Protist column is left out.
groups <- c("Metazoa", "Fungi", "Plantae")
tmp <- group2IPRGOterms_sig[, groups]

# Discard terms that are absent from every compared group
tmp <- tmp[rowSums(tmp == 0) < length(groups), ]

# "Shared" marks terms present in more than one organism group
tmp$Shared <- as.numeric(rowSums(tmp[, groups] > 0) > 1)

# A shared term must not also be counted as lineage-exclusive, so its
# per-group flags are cleared in one vectorised assignment.
tmp[tmp$Shared == 1, groups] <- 0

# Plot table for significant GO terms: prevalence joined to group class ----
df_final <- data.frame(IPRGO_term_prevalence_sig)
common <- intersect(rownames(df_final), rownames(tmp))
# Named numeric vector; merge() turns it into a one-column data frame whose
# column is called "x" and whose row names are the term IDs.
df_final_tmp <- df_final[common, 1]
names(df_final_tmp) <- common
group2IPRGOterms_tmp <- tmp[common, ]

# Join on row names (by = 0), then reshape to one row per term/class pair.
# `groups` is set explicitly here instead of inside the melt() call.
groups <- c("Metazoa", "Fungi", "Plantae", "Shared")
merged_IPRGO_all_sig <- merge(
  df_final_tmp,
  as.data.frame(group2IPRGOterms_tmp),
  by = 0,
  all = TRUE
)
merged_IPRGO_all_melted_sig <- melt(
  merged_IPRGO_all_sig,
  id.vars = c("x", "Row.names"),
  measure.vars = groups
)
colnames(merged_IPRGO_all_melted_sig) <- c(
  "Prevalence", "ID", "Group", "GroupCount"
)

# Fraction of distinct GO terms falling in each class (exclusive to one group,
# or shared). Names are the output columns, values the labels in `Group`.
n_terms <- length(unique(merged_IPRGO_all_melted_sig$ID))
class_of_column <- c(
  Metazoa = "Metazoa", Plant = "Plantae", Fungi = "Fungi", Shared = "Shared"
)
count_per_group_sig <- rbind(
  count_per_group_sig,
  as.data.frame(
    lapply(class_of_column, function(class_label) {
      in_class <- merged_IPRGO_all_melted_sig$Group == class_label &
        merged_IPRGO_all_melted_sig$GroupCount == 1
      sum(in_class) / n_terms
    })
  )
)
rownames(count_per_group_sig) <- c("Homology", "GO")

# Relative share of each class within every prevalence value. After
# summarise() the table is still grouped by Prevalence, so sum(n) is taken
# per prevalence value.
merged_IPRGO_all_melted_sig <- merged_IPRGO_all_melted_sig %>%
  group_by(Prevalence, Group) %>%
  summarise(n = sum(GroupCount)) %>%
  mutate(percentage = n / sum(n))

# Facet labels. A scalar is assigned directly: the former
# rep(..., length(dim(x)[1])) always repeated exactly once and only worked
# through recycling.
merged_IPRGO_all_melted_sig$cat1 <- "GO"
merged_IPRGO_all_melted_sig$cat2 <- "Significant"

#p_GO_all_sig <- ggplot(merged_GO_all_melted_sig, aes(x=prevalence, y=percentage, fill=Group)) + geom_area(alpha=0.6 , size=0.1, colour="black")  #+ theme(axis.title.x=element_blank(), axis.text.x=element_blank(), axis.title.y=element_blank())  + theme(plot.margin = unit(c(0,0,0,0), "cm")) + labs(title="GO_all")

### Plotting Figure 2A
# Stacked-area panels of class share versus prevalence, faceted by annotation
# schema (cat1: Homology / GO) and term set (cat2: All / Significant).

df_prevalence_all_groups <- rbind(
  merged_homology_IPR_melted,
  merged_IPRGO_all_melted
)

# The stray trailing comma that passed an empty third argument to rbind() has
# been removed.
df_prevalence_significant <- rbind(
  merged_homology_IPR_melted_sig,
  merged_IPRGO_all_melted_sig
)

df_prevalence_final <- rbind(
  df_prevalence_all_groups,
  df_prevalence_significant
)

p_annotation_term_prevalence_per_group_final <-
  ggplot(df_prevalence_final, aes(x = Prevalence, y = percentage, fill = Group)) +
  geom_area(size = 0.1, colour = "black") +
  theme_light() +
  facet_grid(cat1 ~ cat2) +
  theme(legend.position = "top") +
  ylab("Percentage") +
  # one colour per level of Group
  scale_fill_manual(values = c("#377EB8", "#E41A1C", "#4DAF4A", "#FFFF28")) +
  guides(fill = guide_legend(nrow = 1))



## Plotting Figure 2B
# Long-format table with the fraction of annotation terms per class, for the
# complete term sets ("All") and for the significant ones ("Significant").

# All terms: the annotation schema is carried over from the row names, which
# melt() then uses as the id column.
count_per_group_cat <- count_per_group
count_per_group_cat[["cat1"]] <- rownames(count_per_group_cat)
count_per_group_melt <- melt(count_per_group_cat)
names(count_per_group_melt) <- c(
  "Annotation\nschema", "Group", "% annotation terms"
)
count_per_group_melt[["Class"]] <- rep("All", nrow(count_per_group_melt))

# Significant terms: same reshaping
count_per_group_sig_cat <- count_per_group_sig
count_per_group_sig_cat[["cat1"]] <- rownames(count_per_group_sig_cat)
count_per_group_sig_melt <- melt(count_per_group_sig_cat)
names(count_per_group_sig_melt) <- c(
  "Annotation\nschema", "Group", "% annotation terms"
)
count_per_group_sig_melt[["Class"]] <- rep(
  "Significant",
  nrow(count_per_group_sig_melt)
)

# Both term sets stacked, "All" first
df_count <- rbind(count_per_group_melt, count_per_group_sig_melt)
### Plotting Figure 2C
# Builds the long table of per-term prevalence values behind the violin plot:
# one row per term, labelled with its set (Homology_all, Homology_sig, GO_all,
# GO_sig) and its annotation schema (Homology / GO).

## prevalence of every homologous region, taken from the CALANGO output
prev_homologous2IPR <- data.frame(
  names(homologous2IPR$greaterthanzero),
  homologous2IPR$greaterthanzero,
  rep("IPR_homology", length(homologous2IPR$greaterthanzero))
)
colnames(prev_homologous2IPR) <- c("ID", "Prevalence", "Database")
database_prevalence <- prev_homologous2IPR

# pairs of sets compared in the plot
my_comparisons_all <- list(
  c("Homology_all", "GO_all"),
  c("Homology_sig", "GO_sig")
)

# Prevalence of significant homologous regions; IDs without a value (NA) are
# dropped, as before.
homology_sig_prevalence <-
  homologous2IPR$greaterthanzero[significant_homologous_regions_IPR]
homology_sig_prevalence <-
  homology_sig_prevalence[!is.na(homology_sig_prevalence)]

# Prevalence of every GO term, recomputed from the count table. vapply() holds
# the values in a plain named vector, so nothing depends on an auto-generated
# data-frame column name. Terms never observed are dropped.
go_all_prevalence <- vapply(
  gene2GO$y,
  function(v) sum(v > 0) / length(v),
  numeric(1)
)
go_all_prevalence <- go_all_prevalence[go_all_prevalence > 0]

# Sets in plotting order, with the annotation schema each one belongs to
prevalence_sets <- list(
  Homology_all = database_prevalence$Prevalence,
  Homology_sig = homology_sig_prevalence,
  GO_all = go_all_prevalence,
  GO_sig = gene2GO$greaterthanzero[significant_GO_IPR]
)
schema_of_set <- c(
  Homology_all = "Homology", Homology_sig = "Homology",
  GO_all = "GO", GO_sig = "GO"
)

df_prevalence_distribution_all <- do.call(
  rbind,
  lapply(names(prevalence_sets), function(set_label) {
    values <- unname(prevalence_sets[[set_label]])
    data.frame(
      Value = values,
      Database = rep(set_label, length(values)),
      Statistics = rep("Prevalence", length(values)),
      schema = rep(schema_of_set[[set_label]], length(values)),
      stringsAsFactors = FALSE
    )
  })
)
colnames(df_prevalence_distribution_all) <- c(
  "Value", "Database", "Statistics", "Annotation\nschema"
)
rownames(df_prevalence_distribution_all) <- NULL

# Per-row alpha for the optional jitter layer. The significant sets are
# labelled "..._sig"; the former pattern "Sig" never matched them.
df_prevalence_distribution_all$fill <- rep(
  0.05,
  nrow(df_prevalence_distribution_all)
)
df_prevalence_distribution_all$fill[
  grepl("_sig$", df_prevalence_distribution_all$Database)
] <- 0.5
df_prevalence_distribution_all$fill[
  df_prevalence_distribution_all$Database == "Homology_all"
] <- 0.01

df_prevalence_distribution_all$`Annotation\nschema` <-
  as.factor(df_prevalence_distribution_all$`Annotation\nschema`)

###########################################
#Plots prevalence histograms for distinct annotation schemas

# Violin plot of per-term prevalence for each term set, coloured by annotation
# schema, with Wilcoxon tests for the pairs in my_comparisons_all and a
# crossbar at the median.
plot_prevalence_distribution_final <- ggplot(
  df_prevalence_distribution_all[
    df_prevalence_distribution_all$Statistics == "Prevalence",
  ],
  aes(x = Database, y = Value)
) +
  geom_violin(scale = "width", aes(fill = `Annotation\nschema`)) +
  scale_fill_manual(values = c("#E69F00", "#56B4E9")) +
  ylab("Prevalence") +
  stat_compare_means(comparisons = my_comparisons_all, method = "wilcox.test") +
  # fun / fun.min / fun.max replace the deprecated fun.y / fun.ymin / fun.ymax
  stat_summary(
    fun = median, fun.min = median, fun.max = median,
    geom = "crossbar", width = 0.2, col = "black"
  ) +
  theme_light(base_size = 10) +
  theme(axis.title.x = element_blank()) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.margin = margin(5, 1, 1.5, 1.2)
  ) +
  scale_y_continuous(expand = c(0, 0.2))



# Dodged bar chart: fraction of annotation terms per class, one bar per
# annotation schema, faceted by term set (Class).
p_count_per_multi_data <- ggplot(
  df_count,
  aes(x = Group, y = `% annotation terms`, fill = `Annotation\nschema`)
) +
  geom_bar(stat = "identity", position = position_dodge()) +
  ylim(0, 1) +
  facet_grid(~Class) +
  theme_light() +
  ylab("Percentage") +
  theme(
    axis.title.x = element_blank(),
    legend.position = "top",
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.margin = margin(1, 1, 1.5, 1.2)
  ) +
  scale_fill_manual(values = c("#E69F00", "#56B4E9"))


# Bottom row of Figure 2: panels B and C side by side with a shared legend
p_count_final <- ggarrange(
  p_count_per_multi_data,
  plot_prevalence_distribution_final,
  labels = c("B", "C"),
  align = c("v"),
  common.legend = TRUE,
  ncol = 2,
  heights = c(2, 2)
)

# Write Figure 2 (panel A on top, B + C below) to PDF.
# The output folder is created when missing so pdf() cannot fail on the path.
dir.create("results/figures", recursive = TRUE, showWarnings = FALSE)
pdf("results/figures/Figure_2.pdf", width = 10, height = 10)
# print() is explicit because top-level values are not auto-printed when the
# script is run through source(), which would leave the PDF empty.
print(
  ggarrange(
    p_annotation_term_prevalence_per_group_final,
    p_count_final,
    nrow = 2,
    labels = c("A", ""),
    heights = c(1, 1.3)
  )
)
dev.off()
