#Author: Francisco Pereira Lobo (franciscolobo@gmail.com) - 2024
#Description: uses the time-calibrated phylogeny of extant groups of eukaryotes
#as a phylogenetic scaffold and:
# 1) trim it to major groups shared with our species list with NCT values and
#genomic data available
# 2) insert missing species using the time-calibrated species tree from
#TimeTreeOfLife
# 3) correct branch lengths taking into account the time as predicted in these
#trees to "naively" correct them
# 4) produce a time-calibrated species tree to be used as phylogenetic scaffold
#for downstream analyses

#cleaning environment
# Removes every visible (non dot-prefixed) object from the current workspace so
# the script starts without leftover variables. It does not detach loaded
# packages, reset options() or change the working directory.
rm(list=ls())

#Configuring environment
library(ape)
library(phytools)
library(RRphylo)
library(ggtree)


#functions
#' Translate an edge-matrix index into the label it refers to.
#'
#' Indices 1..(number of tips) are resolved through `tree$tip.label`; larger
#' indices are resolved through `tree$node.label`.
#'
#' @param element Integer index, or vector of indices, as stored in `tree$edge`.
#' @param tree List-like tree with a `tip.label` vector and, optionally, a
#'   `node.label` vector.
#' @return Character vector the same length as `element`; `NA` where the tree
#'   carries no label for that index (e.g. a tree without node labels).
select.tip.or.node <- function(element, tree) {
  # Tip labels occupy positions 1..Ntip and node labels follow directly after,
  # so a single lookup in the concatenated vector serves both cases. This also
  # avoids ifelse(), which fails when `tree$node.label` is NULL.
  all_labels <- c(as.character(tree$tip.label), as.character(tree$node.label))
  all_labels[element]
}

#' Distance between a tip and the most recent common ancestor it shares with
#' a second tip.
#'
#' @param tree Tree handed to `MRCA()` to locate the common-ancestor node.
#' @param d_matrix Node-by-node distance matrix indexed by node number (this
#'   script builds it with `dist.nodes()` on the same tree).
#' @param n_matrix Edge table with columns `child` (node number) and
#'   `chi.name` (label of that node).
#' @param spp1 Label of the tip the distance is measured from (single string).
#' @param spp2 Label of the second tip defining the common ancestor.
#' @return `d_matrix[<node number of spp1>, <MRCA node>]` as a single number.
#'   Presumably a divergence time when the tree is time-calibrated and
#'   ultrametric — confirm for the tree in use.
return.divergence.time <- function(tree, d_matrix, n_matrix, spp1, spp2) {
  # match() is NA-safe and returns exactly one row, unlike `==` subsetting,
  # which yields a zero-length result for unknown labels and NA entries when
  # the label column itself contains NA.
  row_id <- match(spp1, n_matrix$chi.name)
  if (length(row_id) != 1L || is.na(row_id)) {
    stop("Species not found in the edge table: ",
         paste(spp1, collapse = ", "), call. = FALSE)
  }
  mrca <- MRCA(tree, c(spp1, spp2))
  d_matrix[n_matrix$child[row_id], mrca]
}

#this path should point to the directory produced when you uncompressed file
#reproducibility.tgz
# Every relative path used below is resolved against this directory.
setwd("~/projects/evolution_of_complexity/docs/biorx_evolution_of_complexity/version1/dataFiles/reproductibility_raw_data/reproducibility/")

#divergence times for TToL and scaffold trees
# (the script later reads `divergence_time_data`, presumably restored by this
# file — verify)
load("results/RData/divergence_time_data.RData")

#dictionary of names between the trees
# stringsAsFactors = FALSE keeps every label column as character regardless of
# the R version, so the labels can later be used directly as tip names.
phylogenetic_metadata <- read.table("data/metadata/phylogenetic_metadata.txt",
                                    sep = "\t", header = TRUE,
                                    stringsAsFactors = FALSE)
# The dictionary is kept under its own name so it does not shadow base::names().
# Column V1 is matched against `Species`; column V2 supplies `ShortName`.
name_dictionary <- read.table("data/metadata/TToLIDs2ShortNames2Scaffold.txt",
                              sep = "\t", header = FALSE,
                              stringsAsFactors = FALSE)
phylogenetic_metadata$ShortName <-
  name_dictionary$V2[match(phylogenetic_metadata$Species, name_dictionary$V1)]
rm(name_dictionary)

###

#1) Producing a scaffold Eukarya tree containing major groups also found in our dataset

#reading the input (soon-to-be scaffold) tree
input_tree <- read.tree("data/trees/raw/PBAYES_CATGTR_136o_10genes_AmoOut_ugam_CIs.nwk.tre")

# Figure: input tree with a hard-coded set of node numbers highlighted.
# To display node numbers on the plot (useful when choosing nodes), add
#   geom_text(aes(label=node), hjust=-.5, size=2, color="black")
cairo_pdf("results/figures/Supplementary_Figure_1_C_Tree_scaffold.pdf", width = 5, height = 20)
# Explicit print(): a ggplot object is only drawn when printed, and automatic
# printing does not happen when the script is run through source(), which
# would leave an empty PDF.
print(
  ggtree::ggtree(input_tree) +
    ggtree::geom_hilight(node = c(102, 107, 254, 112, 115, 116, 118, 122, 264,
                                  125, 126, 127, 271, 133, 266, 136, 229, 222,
                                  213, 211, 70, 69, 67, 207, 64, 62, 61, 60,
                                  59, 58, 57, 56, 55, 54, 51, 181, 180, 173,
                                  29, 162, 145),
                         fill = "indianred1") +
    ggtree::geom_tiplab(hjust = -0.1, size = 3) +
    ggplot2::xlim(0, 2600)
)

dev.off()


# select which species to keep based on our species list
spp_include <- c("Op_Drosoph", "Op_Homo_sa", "Op_Mus_mus", "Op_Nematos", "Op_Ciona_i", "Op_Amphime", "Op_Gallus_", "Op_Capsasp", "Op_Neurosp", "Op_Schizos", "Op_Ustilag", "Am_Dictyos", "Ar_Arabido", "Ar_Vitis_v", "Ar_Volvox_", "Ar_Ostreoc", "St_Cafeter")
# NOTE(review): spp_exclude is not referenced anywhere else in the visible
# script; "Op_Nematos" appears in both lists, and "Te_P−2" contains a Unicode
# minus sign rather than an ASCII hyphen — confirm before relying on this list.
spp_exclude <- c("Ap_Nutomon", "Am_Trichos", "Am_Pessone", "Am_Filamoe", "Am_Stereom", "Am_Vannell", "Am_Vexilli", "Am_Cunea_s", "Br_Pygsuia", "Ap_Thecamo", "Op_Nuclear", "Op_Spizell", "Op_Gonapod", "Op_Allomyc", "Op_Cryptoc", "Op_Amoebid", "Op_Salping", "Op_Nematos", "Op_Stegody", "Op_Capitel", "Op_Octopus", "Op_Lottia", "Op_Anas_pl", "Ex_Tsukuba", "Ex_Percolo", "Ex_Naegler", "Ex_Eutrept", "Ex_Euglena", "Ex_Perkins", "Ex_Neobodo", "Ex_Bodonid", "Ex_Trypano", "Ex_Leptomo", "Cr_Palpito", "Cr_Roombia", "Cr_Goniomo", "Cr_Cryptom", "Cr_Chroomo", "Cr_Cryptop", "Cr_Guillar", "Cr_Geminig", "Ar_Colp29", "Ar_Galdier", "Ar_Cyanidi", "Ar_Porphya", "Ar_Chondru", "Ar_Rhodosu", "Ar_Rhodell", "Ar_Compsop", "Ar_Porphyr", "Ar_Erythro", "Ar_Cyanopt", "Ar_Gloeoch", "Ar_Cyanoph", "Ar_Nephros", "Ar_Pyramim", "Ar_Mantoni", "Ar_Pycnoco", "Ar_Prasino", "Ar_Tetrase", "Ar_Coccomy", "Ar_Mesosti", "Ar_Klebsor", "Ar_Coleoch", "Ar_Physcom", "Ar_Pteridi", "Ar_Pinus_t", "Ar_Amborel", "Ha_Ancorac", "Ha_Choanoc", "Ha_Raineri", "Ha_Raphidi", "Ha_Acantho", "Ha_Pavlova", "Ha_Phaeocy", "Ha_Prymnes", "Ha_Emilian", "Ha_Coccoli", "Ha_Calcidi", "Te_Telonem", "Te_P−2", "Rh_Bigelow", "Rh_Euglyph", "Rh_D1", "Rh_Plasmod", "Rh_Gromia", "Rh_Astrolo", "Rh_Reticul", "Rh_Elphidi", "Al_Climaco", "Al_Oxytric", "Al_Tetrahy", "Al_Platyop", "Al_Vitrell", "Al_Hammond", "Al_Cryptos", "Al_Perkins", "Al_Oxyrrhi", "Al_Amoebop", "Al_Noctilu", "Al_Amphidi", "Al_Symbiod", "Al_Heteroc", "Al_Crypthe", "Al_Neocera", "Al_Alexand")


#generating a scaffold tree with taxa of interest
# Fail early, naming the culprits, if a species to keep is absent from the tree.
missing_spp <- setdiff(spp_include, input_tree$tip.label)
if (length(missing_spp) > 0) {
  stop("Species to keep not found in the input tree: ",
       paste(missing_spp, collapse = ", "), call. = FALSE)
}
# Drop every tip that is not in the keep list. setdiff() avoids negative
# indexing with match(), which breaks on unmatched names and drops nothing
# when the keep list is empty.
scaffold_tree <- drop.tip(input_tree, setdiff(input_tree$tip.label, spp_include))
#adding inner node labels ("t1", "t2", ... one per internal node)
scaffold_tree$node.label <- paste0("t", seq_len(scaffold_tree$Nnode))

###

#2) Incorporate missing groups

# The TToL does not provide the most accepted phylogeny for major eukaryotic lineages
# It does provide, however, a correct topology for inner nodes, but with different divergence times, especially for deeper nodes.

#Read donor tree from TToL
donor_tree <- read.tree("data/trees/raw/TToL_spp_list.nwk")

# Flatten the edge matrix of a tree into a data frame with one row per edge:
#   parent / child       node numbers taken from tree$edge
#   par.name / chi.name  the corresponding labels (see select.tip.or.node)
# vapply() guarantees exactly one character label per edge end.
build.edge.table <- function(tree) {
  label_of <- function(ids) {
    vapply(ids, select.tip.or.node, character(1), tree = tree)
  }
  data.frame(
    "parent" = tree$edge[, 1],
    "par.name" = label_of(tree$edge[, 1]),
    "child" = tree$edge[, 2],
    "chi.name" = label_of(tree$edge[, 2]),
    stringsAsFactors = FALSE
  )
}

#transforming the tree structure to a df - donor tree
edge_table_donor <- build.edge.table(donor_tree)

#transforming the tree structure to a df - scaffold tree
edge_table_scaff <- build.edge.table(scaffold_tree)

#getting node distances
dTToL <- dist.nodes(donor_tree)
dScaff <- dist.nodes(scaffold_tree)

# Fungi

#this dataframe describes the species nodes that need to be incorporated, together with the nodes where they should be inserted
# we'll fix the node ages later
dato <- data.frame(bind = c("OP_Asperg",
                            "OP_Saccha",
                            "OP_Kluyve"),
                   reference = c("Op_Neurosp",
                                 "Op_Neurosp-OP_Asperg",
                                 "OP_Saccha"),
                   poly = FALSE)

#getting divergence times from TToL
# Row i pairs sppList1[i] with sppList2[i]; the distance is measured from
# sppList1[i] to the common ancestor of the pair in the donor tree.
sppList1 <- c("Aspergillus_nidulans", "Saccharomyces_cerevisiae", "Kluyveromyces_lactis")
sppList2 <- c("Neurospora_crassa", "Neurospora_crassa", "Saccharomyces_cerevisiae")

dist_TToL <- data.frame(sppList1, sppList2, stringsAsFactors = FALSE)
dist_TToL$dist <- 0

# prints the column names (quick sanity check)
colnames(dist_TToL)

# Ratio stored in divergence_time_data for the N_cra / U_may pair. It is
# computed here but not applied to the distances below.
correction <- divergence_time_data$Ratio[divergence_time_data$Spp1 == "N_cra" & divergence_time_data$Spp2 == "U_may"]

# One distance per row; vapply() fails loudly if a lookup does not return
# exactly one number.
dist_TToL$dist <- vapply(
  seq_len(nrow(dist_TToL)),
  function(i) {
    return.divergence.time(tree = donor_tree, d_matrix = dTToL,
                           n_matrix = edge_table_donor,
                           spp1 = dist_TToL$sppList1[i],
                           spp2 = dist_TToL$sppList2[i])
  },
  numeric(1)
)

# Node ages are named "<FinalName of species 1>-<FinalName of species 2>".
final_1 <- phylogenetic_metadata$FinalName[match(dist_TToL$sppList1, phylogenetic_metadata$Species)]
final_2 <- phylogenetic_metadata$FinalName[match(dist_TToL$sppList2, phylogenetic_metadata$Species)]
# A species without a FinalName would yield a malformed node name, so stop here.
if (length(final_1) != nrow(dist_TToL) || anyNA(final_1) || anyNA(final_2)) {
  stop("Fungi: some species have no FinalName in phylogenetic_metadata",
       call. = FALSE)
}

ages_TToL.names <- paste0(final_1, "-", final_2)
ages_TToL.node <- dist_TToL$dist
names(ages_TToL.node) <- ages_TToL.names

scaffold_tree <- tree.merger(backbone = scaffold_tree, data = dato, node.ages = ages_TToL.node, plot = FALSE)

# Metazoa
# Each `bind` tip is attached next to the tip (or the clade named by a
# "tipA-tipB" pair) given at the same position in `reference`.
# NOTE(review): Op_Trichopl is bound to the Op_Hydra_v-Op_Nematos clade, while
# its divergence time below is measured against Amphimedon_queenslandica —
# confirm this pairing is intended.
dato <- data.frame(bind = c("Op_Hydra_v",
                            "Op_Trichopl",
                            "Op_Anophel",
                            "Op_Apis_m",
                            "Op_Daphnia",
                            "Op_Caenor_b",
                            "Op_Caenor_e",
                            "Op_branchio",
                            "Op_Xenopus",
                            "Op_Danio_r",
                            "Op_Takifug",
                            "Op_Tetraod",
                            "Op_Anolis_c",
                            "Op_Rattus",
                            "Op_Pan_tro",
                            "Op_Tupaia",
                            "Op_Canis",
                            "Op_Felis_c",
                            "Op_Bos_tau"),
                   reference = c("Op_Nematos",
                                 "Op_Hydra_v-Op_Nematos",
                                 "Op_Drosoph",
                                 "Op_Drosoph-Op_Anophel",
                                 "Op_Drosoph-Op_Apis_m",
                                 "Op_Drosoph-Op_Daphnia",
                                 "Op_Caenor_b",
                                 "Op_Ciona_i-Op_Gallus_",
                                 "Op_Gallus_-Op_Mus_mus",
                                 "Op_Gallus_-Op_Xenopus",
                                 "Op_Danio_r",
                                 "Op_Takifug",
                                 "Op_Gallus_",
                                 "Op_Mus_mus",
                                 "Op_Homo_sa",
                                 "Op_Homo_sa-Op_Pan_tro",
                                 "Op_Mus_mus-Op_Homo_sa",
                                 "Op_Canis",
                                 "Op_Canis-Op_Felis_c"),
                   poly = FALSE)


# Row i pairs sppList1[i] with sppList2[i]; the distance is measured from
# sppList1[i] to the common ancestor of the pair in the donor tree.
sppList1 <- c("Trichoplax_adhaerens", "Hydra_vulgaris", "Anopheles_gambiae", "Apis_mellifera", "Daphnia_pulex", "Caenorhabditis_briggsae", "Caenorhabditis_elegans", "Branchiostoma_floridae", "Xenopus_tropicalis", "Danio_rerio", "Takifugu_rubripes", "Tetraodon_nigroviridis", "Anolis_carolinensis", "Rattus_norvegicus", "Pan_troglodytes", "Tupaia_chinensis", "Canis_lupus", "Felis_catus", "Bos_taurus")
sppList2 <- c("Amphimedon_queenslandica", "Nematostella_vectensis", "Drosophila_melanogaster", "Drosophila_melanogaster", "Drosophila_melanogaster", "Drosophila_melanogaster", "Caenorhabditis_briggsae", "Gallus_gallus", "Gallus_gallus", "Gallus_gallus", "Danio_rerio", "Takifugu_rubripes", "Gallus_gallus", "Mus_musculus", "Homo_sapiens", "Homo_sapiens", "Homo_sapiens", "Canis_lupus", "Canis_lupus")

comparisons <- data.frame(sppList1, sppList2, stringsAsFactors = FALSE)
comparisons$dist <- 0

# prints the column names (quick sanity check)
colnames(comparisons)

# One distance per row; vapply() fails loudly if a lookup does not return
# exactly one number.
comparisons$dist <- vapply(
  seq_len(nrow(comparisons)),
  function(i) {
    return.divergence.time(tree = donor_tree, d_matrix = dTToL,
                           n_matrix = edge_table_donor,
                           spp1 = comparisons$sppList1[i],
                           spp2 = comparisons$sppList2[i])
  },
  numeric(1)
)

# Node ages are named "<FinalName of species 1>-<FinalName of species 2>".
final_1 <- phylogenetic_metadata$FinalName[match(comparisons$sppList1, phylogenetic_metadata$Species)]
final_2 <- phylogenetic_metadata$FinalName[match(comparisons$sppList2, phylogenetic_metadata$Species)]
# A species without a FinalName would yield a malformed node name, so stop here.
if (length(final_1) != nrow(comparisons) || anyNA(final_1) || anyNA(final_2)) {
  stop("Metazoa: some species have no FinalName in phylogenetic_metadata",
       call. = FALSE)
}

ages.names <- paste0(final_1, "-", final_2)
ages.node <- comparisons$dist
names(ages.node) <- ages.names

scaffold_tree <- tree.merger(backbone = scaffold_tree, data = dato, node.ages = ages.node, plot = FALSE)

#Archaeplastida
# Each `bind` tip is attached next to the tip (or the clade named by a
# "tipA-tipB" pair) given at the same position in `reference`.
dato <- data.frame(bind = c("Ar_Ostreoc_t",
                            "Ar_Chlamyd",
                            "Ar_Sorghum",
                            "Ar_Oryza",
                            "Ar_Brachy"),
                   reference = c("Ar_Ostreoc",
                                 "Ar_Volvox_",
                                 "Ar_Arabido-Ar_Vitis_v",
                                 "Ar_Sorghum",
                                 "Ar_Oryza"),
                   poly = FALSE)

# Row i pairs sppList1[i] with sppList2[i]; the distance is measured from
# sppList1[i] to the common ancestor of the pair in the donor tree.
sppList1 <- c("Ostreococcus_tauri", "Chlamydomonas_reinhardtii", "Sorghum_bicolor", "Oryza_sativa", "Brachypodium_distachyon")
sppList2 <- c("Ostreococcus_lucimarinus", "Volvox_carteri", "Arabidopsis_thaliana", "Sorghum_bicolor", "Oryza_sativa")

comparisons <- data.frame(sppList1, sppList2, stringsAsFactors = FALSE)
comparisons$dist <- 0

# prints the column names (quick sanity check)
colnames(comparisons)

# One distance per row; vapply() fails loudly if a lookup does not return
# exactly one number.
comparisons$dist <- vapply(
  seq_len(nrow(comparisons)),
  function(i) {
    return.divergence.time(tree = donor_tree, d_matrix = dTToL,
                           n_matrix = edge_table_donor,
                           spp1 = comparisons$sppList1[i],
                           spp2 = comparisons$sppList2[i])
  },
  numeric(1)
)

# Node ages are named "<FinalName of species 1>-<FinalName of species 2>".
final_1 <- phylogenetic_metadata$FinalName[match(comparisons$sppList1, phylogenetic_metadata$Species)]
final_2 <- phylogenetic_metadata$FinalName[match(comparisons$sppList2, phylogenetic_metadata$Species)]
# A species without a FinalName would yield a malformed node name, so stop here.
if (length(final_1) != nrow(comparisons) || anyNA(final_1) || anyNA(final_2)) {
  stop("Archaeplastida: some species have no FinalName in phylogenetic_metadata",
       call. = FALSE)
}

ages.names <- paste0(final_1, "-", final_2)
ages.node <- comparisons$dist
names(ages.node) <- ages.names

scaffold_tree <- tree.merger(backbone = scaffold_tree, data = dato, node.ages = ages.node, plot = FALSE)

# "Protozoa", for the lack of a better name ;-)

#adding one species to the tree and removing anchor species St_cafeter
# St_Phy_ram is first attached next to St_Cafeter (no node ages supplied), then
# the anchor tip is dropped so St_Phy_ram takes its place in the tree.
dato <- data.frame(bind = c("St_Phy_ram"),
                   reference = c("St_Cafeter"),
                   poly = FALSE)
scaffold_tree <- tree.merger(backbone = scaffold_tree, data = dato, plot = FALSE)

#drop anchor tip
scaffold_tree <- drop.tip(scaffold_tree, "St_Cafeter")

#finishing the job
# Each `bind` tip is attached next to the tip (or the clade named by a
# "tipA-tipB" pair) given at the same position in `reference`.
dato <- data.frame(bind = c("St_Phy_soj",
                            "St_Phy_inf",
                            "St_Phy_par",
                            "St_Achlya",
                            "St_Sapro"),
                   reference = c("St_Phy_ram",
                                 "St_Phy_ram-St_Phy_soj",
                                 "St_Phy_inf",
                                 "St_Phy_inf-St_Phy_ram",
                                 "St_Achlya"),
                   poly = FALSE)

# Row i pairs sppList1[i] with sppList2[i]; the distance is measured from
# sppList1[i] to the common ancestor of the pair in the donor tree.
sppList1 <- c("Phytophthora_sojae", "Phytophthora_infestans", "Phytophthora_parasitica", "Achlya_hypogyna", "Saprolegnia_parasitica")
sppList2 <- c("Phytophthora_ramorum", "Phytophthora_ramorum", "Phytophthora_infestans", "Phytophthora_infestans", "Achlya_hypogyna")

comparisons <- data.frame(sppList1, sppList2, stringsAsFactors = FALSE)
comparisons$dist <- 0

# prints the column names (quick sanity check)
colnames(comparisons)

# One distance per row; vapply() fails loudly if a lookup does not return
# exactly one number.
comparisons$dist <- vapply(
  seq_len(nrow(comparisons)),
  function(i) {
    return.divergence.time(tree = donor_tree, d_matrix = dTToL,
                           n_matrix = edge_table_donor,
                           spp1 = comparisons$sppList1[i],
                           spp2 = comparisons$sppList2[i])
  },
  numeric(1)
)

# Node ages are named "<FinalName of species 1>-<FinalName of species 2>".
final_1 <- phylogenetic_metadata$FinalName[match(comparisons$sppList1, phylogenetic_metadata$Species)]
final_2 <- phylogenetic_metadata$FinalName[match(comparisons$sppList2, phylogenetic_metadata$Species)]
# A species without a FinalName would yield a malformed node name, so stop here.
if (length(final_1) != nrow(comparisons) || anyNA(final_1) || anyNA(final_2)) {
  stop("Stramenopiles: some species have no FinalName in phylogenetic_metadata",
       call. = FALSE)
}

ages.names <- paste0(final_1, "-", final_2)
ages.node <- comparisons$dist
names(ages.node) <- ages.names

scaffold_tree <- tree.merger(backbone = scaffold_tree, data = dato, node.ages = ages.node, plot = FALSE)


# Replace the short tip labels (FinalName) with the full species names.
species_labels <- phylogenetic_metadata$Species[match(scaffold_tree$tip.label, phylogenetic_metadata$FinalName)]
# A tip without a metadata entry would silently become an NA label; stop and
# name the offending tips instead.
if (length(species_labels) != length(scaffold_tree$tip.label) || anyNA(species_labels)) {
  stop("Tips without a matching FinalName in phylogenetic_metadata: ",
       paste(scaffold_tree$tip.label[is.na(species_labels)], collapse = ", "),
       call. = FALSE)
}
scaffold_tree$tip.label <- species_labels

#force.ultrametric to deal with rounding issues during tree stitching
final_tree <- force.ultrametric(scaffold_tree, method = "extend")
# Topology-only copy (branch lengths removed), used for the cladogram figure
final_tree_no_branch_length <- final_tree
final_tree_no_branch_length$edge.length <- NULL

# On-screen preview of the dated tree with node numbers printed next to nodes
la <- ggtree(final_tree) +
  theme_tree2() +
  geom_tiplab(align = TRUE, linesize = .5) +
  geom_text(aes(label = node), hjust = -.3)
la <- revts(la)
plot(la)


#save pdf
# Figure: final stitched topology (no branch lengths) with a hard-coded set of
# node numbers highlighted.
# To display node numbers on the plot (useful when choosing nodes), add
#   geom_text(aes(label=node), hjust=-.5, size=2, color="black")
cairo_pdf("results/figures/Supplementary_Figure_1_E_final_tree_stitched.pdf", width = 20, height = 14)
# Explicit print(): a ggplot object is only drawn when printed, and automatic
# printing does not happen when the script is run through source(), which
# would leave an empty PDF.
print(
  ggtree::ggtree(final_tree_no_branch_length, size = 2) +
    ggtree::geom_hilight(node = c(19, 20, 22, 82, 17, 26, 84, 31, 33, 34, 35,
                                  90, 39, 40, 97, 45, 60, 14, 13, 52),
                         fill = "steelblue") +
    ggtree::geom_tiplab(hjust = -0.1, size = 6) +
    ggplot2::xlim(0, 20)
)

dev.off()


#uncomment to generate tree
#write.tree(final_tree, "data/trees/final_tree_branches_corrected.nwk")