## Penalized Likelihood and BioGeoBears script for GBS data of Cristaria
## Tim Böhnert & Federico Luebert 
## Nov. 2019

## ------------ install.packages ------------
install.packages("optimx", dependencies=TRUE, repos="http://cran.rstudio.com")
install.packages("snow")
install.packages("phylobase")
install.packages("devtools")
install.packages("rexpokit") # dependencies=FALSE   ## on arch based linux install 'gcc-fortran' from official repository
install.packages("cladoRcpp")
install.packages("strap")
install.packages("png")

## load devtools to install packages from out site CRAN
library(devtools)
## :::::: Install the updated BioGeoBEARS from GitHub as recommended:
devtools::install_github(repo = "nmatzke/BioGeoBEARS") #, INSTALL_opts = "--byte-compile", dependencies=T)
## :::::: Install Phyloch from fmichonneau GitHub repo; this is the same package as from Christoph Heibls homepage:
devtools::install_github(repo = "fmichonneau/phyloch", INSTALL_opts = "--byte-compile", dependencies=T)
## :::::: Install deeptree from willgearty's GitHub repo
devtools::install_github("willgearty/deeptime")

## ------------ load packages ------------
library(BioGeoBEARS)
library(cladoRcpp)
library(rexpokit)
library(parallel)
library(optimx)
library(ape)
library(strap)    #for plotting geologic time scale
library(phyloch)
library(png)
library(phytools)
library(deeptime)
library(ggtree)
#library(treeio)

## ------------ Penalized Likelihood ------------
## Set working directory on different PCs
setwd(                "F:/Sciebo/02_Dissertation/2_Cristaria_MALVA/Cristaria_Phyl/Analysis/P-Likelihood")  ## Windows T480s
setwd("/home/tim/WinData/Sciebo/02_Dissertation/2_Cristaria_MALVA/Cristaria_Phyl/Analysis/P-Likelihood")  ## Linux T480s
setwd( "/home/tim/WinData/Sciebo/02_Dissertation/2_Cristaria_MALVA/Cristaria_Phyl/Analysis/P-Likelihood")  ## Linux Workstation
setwd("/home/tim/WinData/Sciebo/02_Dissertation/2_Cristaria_MALVA/Cristaria_Phyl/Analysis/BioGeoBEARS_2021-05")

list.files()

## read tree file
Cris_pops30 <- read.tree("Cris_pops30_OUT_RAxML.tre")  ## ape Funktion
class(Cris_pops30)

## is rooted?
is.rooted(Cris_pops30)
rCris_pops30 <- root(Cris_pops30, node = getMRCA(Cris_pops30, c("Lec_het_ED1877", "Lec_ame_ED1878")), resolve.root = T)
is.rooted(rCris_pops30)

## ladderize
lrCris_pops30 <- ladderize(rCris_pops30, right = F)

## plot phylogeny
plot(lrCris_pops30, use.edge.length = FALSE, cex = 0.5)
nodelabels(cex = 0.5)

## vector with calibration points and where to place them in the phylogeny
cal_Cris     <- c(20.7, 7.3) ## only for 'age.min' setting
max_cal_Cris <- c(31.2, 11)
min_cal_Cris <- c(10.6, 3.6)
node_Cris    <- c(getMRCA(lrCris_pops30, c("Lec_het_ED1877", "C_and_ED3106")), getMRCA(lrCris_pops30, c("C_and_ED3106", "C_int_ED1425")))

## To check if branch length is zero at some nodes and replace it by VERY short branch length
lrCris_pops30$edge.length[which(lrCris_pops30$edge.length == 0)] <- 0.000000001

## make cross validation with different values for lambda
l <- 10^(-1:6)
cv1 <- sapply(l,function(x) sum(attr(chronopl(lrCris_pops30, lambda = x, age.min = min_cal_Cris, 
                                              age.max = max_cal_Cris, node = node_Cris, CV = T), "D2")))

## plot results for documentation (appendix!!!)
pdf("CrossValidation_PL_Cristaria.pdf")
plot(log(l), cv1)         
dev.off()

## Run Penalized Likelihood analyses using 'chronpl()' function
chrono_Cris <- chronopl(lrCris_pops30, lambda = 1, age.min = cal_Cris, node = node_Cris)
chrono_CrisMIN_MAX <- chronopl(lrCris_pops30, lambda = 1, age.min = min_cal_Cris, age.max = max_cal_Cris, node = node_Cris)
chrono_CrisMIN <- chronopl(lrCris_pops30, lambda = 1, age.min = cal_Cris, node = node_Cris)

## plot dated tree with time-scale
plot(chrono_Cris, cex = 0.5, label.offset = 0.5)
axisPhylo()
nodelabels(cex = 0.65)

plot(chrono_CrisMIN, cex = 0.5, label.offset = 0.5)
axisPhylo()

## ------------ Full dated tree for Suppl. ------------
## PLOT FULL CRISTARIA DATE TREE FOR SUPPLEMENT FIGURE
plot(chrono_CrisMIN, cex = 0.5, label.offset = 0.5)
axisPhylo()
nodelabels(cex = 0.5)

chrono_CrisMIN$root.time <- 20.7

pdf("Cris_GBS_PL-Tree.pdf", height = 15, width = 10)
geoscalePhylo(tree = chrono_CrisMIN, boxes = "Epoch", units = c("Period", "Epoch"), cex.age = 0.7, cex.ts = 1,
              x.lim = c(-1, 21), y.lim = c(1, 113), width = 1, quat.rm = F, tick.scale = 5, label.offset = 0.25,
              erotate = 270)
dev.off()

write.tree(chrono_Cris, file = "Cristaria_ChronPL_FullTree.tre")
CrisTree <- read.tree(file = "Cristaria_ChronPL_FullTree.tre")
summary.phylo(CrisTree)
typeof(CrisTree)
sort(branching.times(CrisTree), decreasing = T)

clade_age <- c("122", "223", "220", "212", "129", "135", "138", "204", "192", "185", "144")
clade_age

sort(chrono_Cris$edge)

plot.phylo(CrisTree)
nodelabels(CrisTree$node.label, adj = c(1.25, -0.5), frame = "n", cex = 0.8, font = 2)
nodelabels(round(branching.times(CrisTree), digits = 2), adj = c(1.25, -0.5), frame = "n", cex = 0.8, font = 2)
branching.times(CrisTree)
axisPhylo()


## ------------ PL PSEUDO bootstrapping ------------
## make three lists to store a tree in each one
boo <- list()
booc <- list()

## assign the class "multiphyle"
class(boo) <- "multiPhylo"
class(booc) <- "multiPhylo"

## compute 1000x branching times
for (i in 1:1000) {
  boo[[i]] <- compute.brtime(Cris_pops30, method = "coalescent", force.positive = NULL)
  booc[[i]] <- chronopl(boo[[i]], lambda = 1, age.min = min_cal_Cris, age.max = max_cal_Cris, node = node_Cris)
}

## make data frame to store values of PL bootstraping of selected nodes
boot_values <- matrix(ncol = length(booc[[1]]$tip.label)-1, nrow = 1000)
for (i in 1:1000 ) {for (j in 1:length(booc[[1]]$tip.label)-1){
  boot_values[i, j] <- branching.times(booc[[i]])[j]
}}

## calculate length of SD bars
stdevs <- apply(boot_values, 1, sd)
write.tree(chrono_Cris, file = "Cris_Chrono7.tre")
## ------------ PL bootstrapping ------------
## ------------     NOT USED     ------------



## :::::::::::::::::::::::::::::::::::::
## ------------ BioGeoBears ------------

## read back safed tree-file if not loaded anymore
ChronoCrisTree <- read.tree(file = "Cristaria_ChronPL_FullTree.tre")

## GENERAL PREPERATION
## drop tips .... keep.tips

## check tip labels in alphabetic order
sort(ChronoCrisTree$tip.label)

## make list of tips you wanna keep
## outgroup Lecanophora will be excluded for ancestral area reconstruction
KeepList <- c("C_and_ED3103", "C_multiflora_ED3107", "C_gra_ED3458", "C_multifida_W5633", "C_gla_ED1603", 
              "C_arg_ED3001", "C_cya_ED3451", "C_con_ED3022", "C_vir_ED1408", "C_mol_ED3016", "C_int_ED1414")

## make a new phylo object with reduced sampling
chrono_Cris <- keep.tip(ChronoCrisTree, KeepList)

## check tree
plot(chrono_Cris)
axisPhylo()

## check tip labels in alphabetic order
OldNames <- sort(chrono_Cris$tip.label)

## make list of new tipnames sorted the same way as the old labels
NewNames <- sort(c("Cristaria_andicola", "Cristaria_multiflora", "Cristaria_gracilis", "Cristaria_multifida", "Cristaria_glaucophylla", 
              "Cristaria_argyliifolia", "Cristaria_cyanea", "Cristaria_concinna", "Cristaria_viridiluteola", "Cristaria_molinae", "Cristaria_integerrima"))

## make a data frame with the old and the new names
d <- as.data.frame(cbind(OLDlabel = OldNames, NEWlabel = NewNames))
d

## use function sub.taxa.label() to replace old tip lables with new ones
## load function from r-script
source("SubTaxaLabel.R")
tr <- sub.taxa.label(chrono_Cris, d)
plot(tr, label.offset = 0.15)
axisPhylo()

## write time calibrated tree with new tip label into tree-file
write.tree(tr, "BioGeoBears_Cris.tre")


tr <- read.tree(file = "BioGeoBears_Cris.tre")


## ------------ DEC maxareas 2 ------------
## Analysis with 'maxareas' & 'max_range_size' = 2

## get the colors for the number of states and maxareas
## adjust 'maxareas' and 'max_range_size' 
states_list_0based_index = rcpp_areas_list_to_states_list(areas = c("A","B","C","D","E","F"), maxareas = 2) 
colors_matrix = get_colors_for_numareas(6)
colors_list_for_states = mix_colors_for_states(colors_matrix, states_list_0based_index)
colors_list_for_states

# read a table with color lists
coltipsCris <- read.table("BioGeoBEARS_color_Cris.txt", row.names = 1)
coltips <- coltipsCris[tr$tip.label,]

## set parameter for BioGeoBears and run analysis
BioGeoBEARS_model_object = define_BioGeoBEARS_model_object(minval_anagenesis = 1e-15,
                                                           minval_cladogenesis = 1e-03, 
                                                           maxval = 6)



BioGeoBEARS_run_object = define_BioGeoBEARS_run(BioGeoBEARS_model_object = BioGeoBEARS_model_object,
                                                trfn="BioGeoBears_Cris.tre", 
                                                geogfn = "BioGeoBEARS_distr_Cris.txt",
                                                num_cores_to_use = 8, 
                                                max_range_size = 3)
BioGeoBEARS_run_object$geogfn = "BioGeoBEARS_distr_Cris.txt"
CrisAArC = bears_optim_run(BioGeoBEARS_run_object)

## get the states with the highest likelihood at each node, use only 
## columns 2 to 16 because the first column is an order number
## check possible combinations on https://planetcalc.com
nodestates <- get_ML_states_from_relprobs(CrisAArC$ML_marginal_prob_each_state_at_branch_top_AT_node[,2:42],
                                          statenames = c("A","B","C","D","E","F","AB","AC","AD","AE","AF","BC","BD","BE","BF","CD","CE","CF","DE","DF","EF",
                                                         "ABC","ABD","ABE","ABF","ACD","ACE","ACF","ADE","ADF","AEF","BCD","BCE","BCF","BDE","BDF","BEF","CDE","CDF","CEF","DEF"))
## just to see the results
plot_BioGeoBEARS_results(CrisAArC, cornercoords_loc = "/home/tim/R/x86_64-pc-linux-gnu-library/3.4/BioGeoBEARS/extdata/a_scripts/")

## fit tree to the time scale using the crown node age used for dating (PL)
tr$root.time <- 7.325

## read map
Atacama <- readPNG("Ata_BioGeoBears_MAP.png")

## save final plot in PDF file
pdf("Cris_GBS_BioGeoBEARS.pdf", height = 7, width = 10)
geoscalePhylo(tree = tr, boxes = "Epoch", units = c("Period", "Epoch"), cex.age = 0.9, cex.ts = 1.3, 
              cex.tip = 1, x.lim = c(-3.75, 11), y.lim = c(1, 11.15), width = 1.25, 
              quat.rm = T, erotate = 0, tick.scale = 1, label.offset = 1.6, arotate = 0)
tiplabels(pch = 22, bg = as.character(coltips$V2), cex = 2, adj = c(0.65, 0.51), lwd = 0.75)
tiplabels(pch = 22, bg = as.character(coltips$V3), cex = 2, adj = c(0.9,  0.51), lwd = 0.75)
tiplabels(pch = 22, bg = as.character(coltips$V4), cex = 2, adj = c(1.15, 0.51), lwd = 0.75)
tiplabels(pch = 22, bg = as.character(coltips$V5), cex = 2, adj = c(1.4,  0.51), lwd = 0.75)
tiplabels(pch = 22, bg = as.character(coltips$V6), cex = 2, adj = c(1.65, 0.51), lwd = 0.75)
tiplabels(pch = 22, bg = as.character(coltips$V7), cex = 2, adj = c(1.9,  0.51), lwd = 0.75)

mtext(side = 3, line = -0.8, at = c(7.45, 7.7, 7.95, 8.2, 8.45, 8.7), text = c("A", "B", "C", "D", "E", "F"), cex = 0.8, font = 2)
legend(-0.9, 11.6, legend = c("A: Peru","B: N Chilenan Andes","C: Meditteranean Andes","D: S Atacama Pampa","E: S Atacama Coast", "F: N Atacama Coast",
                          "AB","AC","AD","AE","AF","BC","BD","BE","BF","CD","CE","CF","D+E","D+F","EF",
                          "ABC","ABD","ABE","ABF","ACD","ACE","ACF","ADE","ADF","AEF","BCD","BCE","BCF","BDE","BDF","BEF","C+D+E","CDF","CEF","DEF")
       [c(1:6,19,20,38)], pch = 22, pt.bg = colors_list_for_states[c(1:6,19,20,38)], bty = "n", cex = 1, pt.cex = 1.5)
plot_BioGeoBEARS_results(CrisAArC, plotwhat = "pie", tipcex = 0, statecex = 0.55, plotsplits = T, plotlegend = F, legend_cex = 0.5, titlecex = 0, splitcex = 0.4,
                         cornercoords_loc = "/home/tim/R/x86_64-pc-linux-gnu-library/3.4/BioGeoBEARS/extdata/a_scripts/",
                         skiptree = T, tipboxes_TF = F)
nodelabels(nodestates[12:21], adj = c(1.95, -0.85), frame = "none", cex = 0.75)
par(xpd = NA)
rect(-4.3, -2, -1, 11.65, col = "white", border = NA)
segments(-1, -1.051, -1, 0.59)
#rasterImage(Atacama, -5, -1.95, -2, 12)
text(6, 0.05, "Pleistocene"); text(6, -0.775, "Quarternary", cex = 1.25)
dev.off()


## ------------ DEC maxareas 3 ------------
## Analysis with 'maxareas' & 'max_range_size' = 3

## get the colors for the number of states and maxareas
## adjust 'maxareas' and 'max_range_size' 
states_list_0based_index = rcpp_areas_list_to_states_list(areas = c("A","B","C","D","E","F"), maxareas = 3) 
colors_matrix = get_colors_for_numareas(6)
colors_list_for_states = mix_colors_for_states(colors_matrix, states_list_0based_index)
colors_list_for_states

# read a table with color lists
coltipsCris <- read.table("BioGeoBEARS_color_Cris.txt", row.names = 1)
coltips <- coltipsCris[tr$tip.label,]

## set parameter for BioGeoBears and run analysis
BioGeoBEARS_model_object = define_BioGeoBEARS_model_object(minval_anagenesis = 1e-15, minval_cladogenesis = 1e-03, maxval = 6)
BioGeoBEARS_run_object = define_BioGeoBEARS_run(BioGeoBEARS_model_object = BioGeoBEARS_model_object,
                                                trfn="BioGeoBears_Cris.tre", 
                                                geogfn = "BioGeoBEARS_distr_Cris.txt",
                                                num_cores_to_use = 16, max_range_size = 3)
BioGeoBEARS_run_object$geogfn = "BioGeoBEARS_distr_Cris.txt"
CrisAArC = bears_optim_run(BioGeoBEARS_run_object)


numstates_from_numareas(numareas=4, maxareas=4, include_null_range=TRUE)
numstates_from_numareas(numareas=4, maxareas=4, include_null_range=FALSE)
numstates_from_numareas(numareas=4, maxareas=3, include_null_range=TRUE)
numstates_from_numareas(numareas=4, maxareas=2, include_null_range=TRUE)
numstates_from_numareas(numareas=10, maxareas=10, include_null_range=TRUE)
