#!/mnt/shared/scratch/jhackel/apps/conda/bin/Rscript
#SBATCH --partition=medium
#SBATCH --mem=4G

# partition diversification rates per area, both overall and per time period

library(ape)
library(geiger) # for tips()
library(phangorn) # for Descendants()

# go to working directory
setwd("/mnt/shared/scratch/jhackel/russulaceae/6_diversification_R1")

# read areas
areas = read.table('../4_its_metadata_R1/area_codes.tab', sep='\t', header=TRUE, comment.char='',
                   stringsAsFactors=FALSE, colClasses="character")

# read tree (used both for BAMM and corHMM)
tree = read.tree('../3_supertree_R1/supertree_calibrated.nwk')

# get node ages (round to 6 decimal points - some tips don't line up exactly to 1/0)
na = round(1 - node.depth.edgelength(tree), 6)

# load corHMM stochastic maps
load("../5_biogeo_R1/corHMM_ER_stoch_maps.Rdata")

# load BAMM rate shifts posterior distribution
bamm = read.table('./BAMM/event_data.txt', sep=',', header=TRUE,
                 stringsAsFactors=FALSE, comment.char='')
# add time before present
bamm$time_bp = 1 - bamm$abstime

# add node numbers to BAMM: each branch is identified by children of descendant
# (tip branches have no right child)
get_node = function(i) {
  # tip nodes
  if (is.na(bamm$rightchild[i])) {
    nn = which(tree$tip.label==bamm$leftchild[i])
  # internal nodes
  } else {
    nn = getMRCA(tree, tip=c(bamm$leftchild[i], bamm$rightchild[i]))
  }
  return(nn)
}
bamm$node_no = sapply(1:nrow(bamm), get_node)


# pair each stochastic map with random BAMM generations
gen = unique(bamm$generation) # BAMM generations
# remove burnin
burnin = 0.2
gen = gen[-c(1:floor(burnin*length(gen)))]
# number of stochastic maps
nosm = length(stoch_maps) 
# randomly assign a stochastic map to each BAMM generation
set.seed(345)
split_groups = sapply(1:length(gen), function(x) sample(1:nosm, 1) )
# make list of generations per stochastic map
gen = split(gen, f=split_groups)

# time periods - described by right (younger) boundary and period length
# (root age is 1)
per = 0.05
timeperiods = rev(seq(0,1-per,per))

# rate bin for discretizing diversification rates
rate_bin = 0.02

# function to get, for one stochastic map – BAMM rate configuration (generation/posterior sample)
 # pairing, branch segements present in one area and with one BAMM regime
get_segments = function(n, generation) {
  # n = number of stochastic map
  # generation = BAMM generation
  
  # BAMM rates from one posterior sample
  bamm_rates = bamm[bamm$generation==generation,]
  bamm_rates = bamm_rates[order(bamm_rates$abstime),] # order by age, oldest shifts first
  
  # divide the tree into branch segments defined between two events
  # (event = speciation OR dispersal OR diversification rate shift)
  # function to create or update segments data frame
  update_seg = function(seg=NULL, pn, cn, s, e, rg=rep(NA, length(pn))) {
    seg_new = data.frame(pn=pn, cn=cn, s=s, e=e,
                    rg=rg)
    return(rbind(seg, seg_new))
  }
  
  # start by collecting all segments from the corHMM stochastic map
  # -> these are separated by speciation or dispersal
  # number of segments per branch
  nseg = sapply(stoch_maps[[n]]$maps, length)
  # parent and child nodes
  pn = stoch_maps[[n]]$edge[,1]
  cn = stoch_maps[[n]]$edge[,2]
  # get backward cumulative event times per branch (relative to end of branch)
  cumt = lapply(stoch_maps[[n]]$maps, function(x) rev(cumsum(rev(x))) )
  # segment start and end times
  segstart = lapply(1:length(cumt), function(x) na[cn[x]] + cumt[[x]] )
  segend = lapply(1:length(cumt), function(x) segstart[[x]] - stoch_maps[[n]]$maps[[x]] )
  # collect as segment data frame
  seg = update_seg(pn=unlist( lapply(1:length(pn), function(x) rep(pn[x], nseg[x]) ) ),
                   cn=unlist( lapply(1:length(cn), function(x) rep(cn[x], nseg[x]) ) ),
                   s=unlist(segstart),
                   e=unlist(segend),
                   rg=unlist(lapply(stoch_maps[[n]]$maps, names)) )
  
  # function to break up segments in data frame
  break_seg = function(seg, breaks) {
    # breaks = list of 'breakpoints' (time bp) per segment
    # which segments actually have 1 or more breaks?
    w = which(sapply(breaks, length)>0)
    # copy parent nodes, child nodes and ranges for each new segment
    pcr = lapply(c('pn','cn','rg'), function(x)
        unlist(lapply(w, function(y) rep(seg[y,x], length(breaks[[y]]))))
      )
    # end times for new segments (start times = breaks)
    e = unlist(lapply(w, function(x) c(breaks[[x]][-1], seg$e[x])))
    # shorten parent segments that have breaks
    # -> end time is time of first break on branch
    seg$e[w] = sapply(breaks[w], function(x) x[1])
    # add new segments to data frame and return
    seg_new = data.frame(pn=pcr[[1]], cn=pcr[[2]], s=unlist(breaks[w]), e=e, rg=pcr[[3]])
    return(rbind(seg, seg_new))
  }
  
  # subdivide segments by BAMM rate shift events
  rs = lapply(1:nrow(seg), function(x) which(bamm_rates$node_no == seg$cn[x] &
                   bamm_rates$time_bp <= seg$s[x] & bamm_rates$time_bp >= seg$e[x]) )
  seg = break_seg(seg, breaks=lapply(rs, function(x) bamm_rates$time_bp[x]))

  # further discretize segments if they are longer than the specify maximum rate bin
  # this is to approximate the birth rate, which evolves exponentially wit within a rate regime
  segbins = lapply(1:nrow(seg), function(x) if (seg$s[x] - seg$e[x] > rate_bin) {
    rev(seq(seg$e[x] + rate_bin, seg$s[x], rate_bin)) } else (numeric()) )
  seg = break_seg(seg, breaks=segbins)
  
  # assign each segment to a rate regime (row number in BAMM rates table)
  # first assign root regime (1) to each segment...
  seg$r = rep(1, nrow(seg))
  # ... then assign from oldest (after root) to most recent regime
  for (i in 2:nrow(bamm_rates)) {
    # get all descendants of node below rate shift
    desc = Descendants(tree, node=bamm_rates$node_no[i], type='all')
    # assign all descendant segments
    wr = which(seg$cn %in% c(bamm_rates$node_no[i], desc) & seg$s<= bamm_rates$time_bp[i])
    seg$r[wr] = i
  }
  
  # now calculate net diversification rates for all segments
  # extinction rate is constant per regime in BAMM, but speciation rate varies exponentially
    # with time: lambda(t) = lambda(0) * exp(lambda_shift*t) (see Rabosky 2014 PLOS ONE)
  seg$rt = bamm_rates$lambdainit[seg$r] * 
    exp(bamm_rates$lambdashift[seg$r] * (bamm_rates$time_bp[seg$r] - seg$e) ) -
    bamm_rates$muinit[seg$r]
  
  # return segments data frame
  return(seg)

}


# function to get median rate value per area from a single pairing
get_rate_areas = function(seg) {
  # get segment lengths
  seg$len = seg$s-seg$e
  # calculate weighted mean of rates per area
  area_rates = sapply(areas$code,
                      function(x) weighted.mean(seg$rt[seg$rg==x], w=seg$len[seg$rg==x]))
  return(area_rates)
}



# function to get per-area diversification rates for each time period and for
# one stochastic map – BAMM rate configuration (generation) pairing
get_divrates_areas_time = function(seg) {
  # seg = data frame of branch segments
  # calculate average rates per time window and area
  get_av_rates = function(tw) {
    # segment weight: % overlap with time period
    ovl = sapply(1:nrow(seg), function(x) (min(c(seg$s[x],tw+per)) - max(c(seg$e[x],tw)))/per)
    # which segments do actually overlap
    wo = which(ovl>0)
    # segments present in each area
    inar = lapply(areas$code, function(x) grep(x, seg$rg[wo]))
    # now get weighted median for each area
    av_rates = sapply(inar, function(x) if (length(x)>0) {
      weighted.mean(seg$rt[wo][x], w=ovl[wo][x])
      } else { 0 } )
    return(av_rates)
  }
  # apply to all timeperiods, return matrix
  div_rates_area = sapply(timeperiods, get_av_rates)
  # set row and column names
  rownames(div_rates_area) = areas$code
  colnames(div_rates_area) = timeperiods
  return(div_rates_area)
}

# function to get one summary matrix of dispersal rates
get_summ_matrix = function(div_matrix_list, FUN, ...) {
  # FUN: summary statistic function: mean, sd, median, quantile...
  # ...: arguments for that function
  # summary matrix
  mat = matrix(NA, nrow=nrow(div_matrix_list[[1]]), ncol=ncol(div_matrix_list[[1]]))
  rownames(mat) = rownames(div_matrix_list[[1]])
  # fill matrix with summaries of rates per area and time period
  for (i in 1:nrow(mat)) {
    for (j in 1:ncol(mat)) {
      rates_ij = sapply(div_matrix_list, function(x) x[i,j])
      mat[i,j] = FUN(rates_ij, ...)
    }
  }
  # return matrix
  colnames(mat) = timeperiods
  return(mat)
}

# get branch segments for all stoch. maps/BAMM posterior combinations
segments_all_comb = lapply(1:length(gen), function(x)
  lapply(gen[[x]], function(y) get_segments(n=x, generation=y)))
# unlist (up to level 1)
segments_all_comb = unlist(segments_all_comb, recursive=FALSE)

# get overall area rates
area_rates_all_comb = t(sapply(segments_all_comb, get_rate_areas))
area_rates_all_comb = as.data.frame(area_rates_all_comb)

# record no. of stochastic map + posterior sample
area_rates_all_comb$stoch_map = unlist(lapply(1:length(gen),
                                              function(x) rep(x, length(gen[[x]]))))
area_rates_all_comb$post_sample = unlist(gen)

# write to file (one line poer stoch. map/posterior sample combination)
write.table(area_rates_all_comb, file="div_rates_area_overall.txt", sep="\t", row.names=FALSE)




# get rates over time + summaries + write to files
div_rates_all_comb = lapply(segments_all_comb, get_divrates_areas_time)
# summaries: median and quantiles
div_rates_median = get_summ_matrix(div_rates_all_comb, median)
div_rates_025 = get_summ_matrix(div_rates_all_comb, quantile, probs=0.025)
div_rates_975 = get_summ_matrix(div_rates_all_comb, quantile, probs=0.975)
# write to files
write.table(div_rates_median, 'div_rates_per_area_median.tsv', sep='\t')
write.table(div_rates_025, 'div_rates_per_area_025.tsv', sep='\t')
write.table(div_rates_975, 'div_rates_per_area_975.tsv', sep='\t')


