#Author: Francisco Pereira Lobo (franciscolobo@gmail.com) - 2024
#Description: produces CALANGO outputs for downstream analyses.

#loading libraries
library(CALANGO)

#cleaning the environment: removes every (non-hidden) object returned by ls()
#so the objects created below do not mix with leftovers from earlier work.
rm(list=ls())

#All the results this script generates are already available in "results/".

#If you want to (re)create only CALANGO's output objects from input files,
#set render_report_flag to FALSE.

#Please note that the significant IDs need to be defined manually in that
#case, as CALANGO does not create CALANGO_output$sig_IDs when render.report is
#FALSE.
render_report_flag <- FALSE

#If you want to (re)create the dynamic HTML files too, comment out the line
#above and uncomment the line below. This will take a while :-)
#render_report_flag <- TRUE

#This path should point to the directory produced when you uncompressed the
#file zenodo_reproductibility.tgz. All relative paths used below ("data/...",
#"results/...") are resolved against this directory.
setwd("~/projects/evolution_of_complexity/docs/biorx_evolution_of_complexity/version1/dataFiles/reproductibility_raw_data/zenodo_reproductibility/")


## GO-based annotation (function-based)

#less_H_sapiens - excludes H. sapiens from analysis.
#The report flag is forwarded straight to run_CALANGO(), so a single call
#covers both the "render report" and the "objects only" modes.
gene2GO_less_H_sapiens <- run_CALANGO(
  defs = "data/parameters_CALANGO/parameters_gene2GO_less_H_sapiens.txt",
  basedir = ".",
  cores = 10,
  render.report = render_report_flag
)
if (!render_report_flag) {
  #sig_IDs is defined manually when no report is rendered: keep the names of
  #the entries whose contrasts.corrected value is below 0.01. which() ignores
  #NA comparisons, so missing values cannot end up as NA entries in sig_IDs.
  gene2GO_less_H_sapiens$sig_IDs <- names(which(
    gene2GO_less_H_sapiens$contrasts.corrected < 0.01
  ))
}

#store the result on disk, then release the memory before the next run
save(gene2GO_less_H_sapiens,
     file = "results/RData/gene2GO_less_H_sapiens.RData")
rm(gene2GO_less_H_sapiens)
gc()

#original_NCT - uses original NCT data for H. sapiens.
#The report flag is forwarded straight to run_CALANGO(), so a single call
#covers both the "render report" and the "objects only" modes.
gene2GO_original_NCT <- run_CALANGO(
  defs = "data/parameters_CALANGO/parameters_gene2GO_original_NCT.txt",
  basedir = ".",
  cores = 10,
  render.report = render_report_flag
)
if (!render_report_flag) {
  #sig_IDs is defined manually when no report is rendered: keep the names of
  #the entries whose contrasts.corrected value is below 0.01. which() ignores
  #NA comparisons, so missing values cannot end up as NA entries in sig_IDs.
  gene2GO_original_NCT$sig_IDs <- names(which(
    gene2GO_original_NCT$contrasts.corrected < 0.01
  ))
}

#store the result on disk, then release the memory before the next run
save(gene2GO_original_NCT,
     file = "results/RData/gene2GO_original_NCT.RData")
rm(gene2GO_original_NCT)
gc()

#gene2GO - uses chimp NCT as a proxy for humans (used in downstream analyses).
#The report flag is forwarded straight to run_CALANGO(), so a single call
#covers both the "render report" and the "objects only" modes.
gene2GO <- run_CALANGO(
  defs = "data/parameters_CALANGO/parameters_gene2GO.txt",
  basedir = ".",
  cores = 10,
  render.report = render_report_flag
)
if (!render_report_flag) {
  #sig_IDs is defined manually when no report is rendered: keep the names of
  #the entries whose contrasts.corrected value is below 0.01. which() ignores
  #NA comparisons, so missing values cannot end up as NA entries in sig_IDs.
  gene2GO$sig_IDs <- names(which(gene2GO$contrasts.corrected < 0.01))
}

#store the result on disk, then release the memory before the next run
save(gene2GO, file = "results/RData/gene2GO.RData")
rm(gene2GO)
gc()

##IPR-based annotation (homology-based).

#homologous2IPR - uses chimp NCT as a proxy for humans (used in downstream
#analyses).
#The report flag is forwarded straight to run_CALANGO(), so a single call
#covers both the "render report" and the "objects only" modes.
homologous2IPR <- run_CALANGO(
  defs = "data/parameters_CALANGO/parameters_homologous2IPR.txt",
  basedir = ".",
  cores = 10,
  render.report = render_report_flag
)
if (!render_report_flag) {
  #sig_IDs is defined manually when no report is rendered: keep the names of
  #the entries whose contrasts.corrected value is below 0.01. which() ignores
  #NA comparisons, so missing values cannot end up as NA entries in sig_IDs.
  homologous2IPR$sig_IDs <- names(which(
    homologous2IPR$contrasts.corrected < 0.01
  ))
}

#store the result on disk, then release the memory before the next run
save(homologous2IPR, file = "results/RData/homologous2IPR.RData")
rm(homologous2IPR)
gc()

#excluding H. sapiens
#The report flag is forwarded straight to run_CALANGO(), so a single call
#covers both the "render report" and the "objects only" modes.
homologous2IPR_less_H_sapiens <- run_CALANGO(
  defs = "data/parameters_CALANGO/parameters_homologous2IPR_less_H_sapiens.txt",
  basedir = ".",
  cores = 10,
  render.report = render_report_flag
)
if (!render_report_flag) {
  #sig_IDs is defined manually when no report is rendered: keep the names of
  #the entries whose contrasts.corrected value is below 0.01. which() ignores
  #NA comparisons, so missing values cannot end up as NA entries in sig_IDs.
  homologous2IPR_less_H_sapiens$sig_IDs <- names(which(
    homologous2IPR_less_H_sapiens$contrasts.corrected < 0.01
  ))
}

#store the result on disk, then release the memory before the next run
save(homologous2IPR_less_H_sapiens,
     file = "results/RData/homologous2IPR_less_H_sapiens.RData")
rm(homologous2IPR_less_H_sapiens)
gc()

#using original NCT value for H. sapiens.
#The report flag is forwarded straight to run_CALANGO(), so a single call
#covers both the "render report" and the "objects only" modes.
homologous2IPR_original_NCT <- run_CALANGO(
  defs = "data/parameters_CALANGO/parameters_homologous2IPR_original_NCT.txt",
  basedir = ".",
  cores = 10,
  render.report = render_report_flag
)
if (!render_report_flag) {
  #sig_IDs is defined manually when no report is rendered: keep the names of
  #the entries whose contrasts.corrected value is below 0.01. which() ignores
  #NA comparisons, so missing values cannot end up as NA entries in sig_IDs.
  homologous2IPR_original_NCT$sig_IDs <- names(which(
    homologous2IPR_original_NCT$contrasts.corrected < 0.01
  ))
}

#store the result on disk, then release the memory before the next run
save(homologous2IPR_original_NCT,
     file = "results/RData/homologous2IPR_original_NCT.RData")
rm(homologous2IPR_original_NCT)
gc()

#Each SUPERFAMILY region to its GO term, if available
#The report flag is forwarded straight to run_CALANGO(), so a single call
#covers both the "render report" and the "objects only" modes.
SUPERFAMILY2GO <- run_CALANGO(
  defs = "data/parameters_CALANGO/parameters_SUPERFAMILY2GO.txt",
  basedir = ".",
  cores = 10,
  render.report = render_report_flag
)
if (!render_report_flag) {
  #sig_IDs is defined manually when no report is rendered: keep the names of
  #the entries whose contrasts.corrected value is below 0.01. which() ignores
  #NA comparisons, so missing values cannot end up as NA entries in sig_IDs.
  SUPERFAMILY2GO$sig_IDs <- names(which(
    SUPERFAMILY2GO$contrasts.corrected < 0.01
  ))
}

#store the result on disk, then release the memory before the next run
save(SUPERFAMILY2GO, file = "results/RData/SUPERFAMILY2GO.RData")
rm(SUPERFAMILY2GO)
gc()

#Each SUPERFAMILY region to SUPERFAMILY ID, if available
#The report flag is forwarded straight to run_CALANGO(), so a single call
#covers both the "render report" and the "objects only" modes.
homologous2SUPERFAMILY <- run_CALANGO(
  defs = "data/parameters_CALANGO/parameters_homologous2SUPERFAMILY.txt",
  basedir = ".",
  cores = 10,
  render.report = render_report_flag
)
if (!render_report_flag) {
  #sig_IDs is defined manually when no report is rendered: keep the names of
  #the entries whose contrasts.corrected value is below 0.01. which() ignores
  #NA comparisons, so missing values cannot end up as NA entries in sig_IDs.
  homologous2SUPERFAMILY$sig_IDs <- names(which(
    homologous2SUPERFAMILY$contrasts.corrected < 0.01
  ))
}

#store the result on disk, then release the memory before the next run
save(homologous2SUPERFAMILY,
     file = "results/RData/homologous2SUPERFAMILY.RData")
rm(homologous2SUPERFAMILY)
gc()

#Using "naive" lm (regular Pearson's correlation) to emulate the (amazing) VC work
#The report flag is forwarded straight to run_CALANGO(), so a single call
#covers both the "render report" and the "objects only" modes.
homologous2SUPERFAMILY_VC <- run_CALANGO(
  defs = "data/parameters_CALANGO/parameters_homologous2SUPERFAMILY_VC.txt",
  basedir = ".",
  cores = 10,
  render.report = render_report_flag
)
if (!render_report_flag) {
  #sig_IDs is defined manually when no report is rendered. Here it is based on
  #correlations.pearson instead of contrasts.corrected: entries at or below
  #-0.8 come first, followed by entries at or above 0.8. which() ignores NA
  #comparisons, so missing correlations cannot end up as NA entries in sig_IDs.
  homologous2SUPERFAMILY_VC$sig_IDs <- c(
    names(which(homologous2SUPERFAMILY_VC$correlations.pearson <= -0.8)),
    names(which(homologous2SUPERFAMILY_VC$correlations.pearson >= 0.8))
  )
}

#store the result on disk, then release the memory before the next step
save(homologous2SUPERFAMILY_VC,
     file = "results/RData/homologous2SUPERFAMILY_VC.RData")
rm(homologous2SUPERFAMILY_VC)
gc()

#saving session info for reproducibility: the object returned by sessionInfo()
#is stored under the name sessionInfo_05 next to the other RData results.
sessionInfo_05 <- sessionInfo()
save(list = "sessionInfo_05", file = "results/RData/sessionInfo_05.RData")
