
#' Prune a List of Trees
#' 
#' Keep only selected tips in a list of  \code{\link[tidytree]{treedata-class}}. 
#' Selection is done using tip labels.
#'
#' @param trees -list of \code{\link[tidytree]{treedata-class}} 
#' @param tips2keep - tip labels to keep
#'
#' @return list of \code{\link[tidytree]{treedata-class}} with only selected tips
#'
#' @seealso \code{pruneTree}
pruneTrees <- function(trees, tips2keep){
  lapply(trees, function(tree, tips2keep){pruneTree(tree, tips2keep)}, tips2keep)
}


#' Prune a Tree
#' 
#' Keep only selected tips in a  \code{\link[tidytree]{treedata-class}}. 
#' Selection is done using tip labels. The treedata should have a label
#' that the function \code{splitCafeLabel} could parse.
#'
#' @param tree -\code{\link[tidytree]{treedata-class}} 
#' @param tips2keep - tip labels to keep
#'
#' @return \code{\link[tidytree]{treedata-class}} with only selected tips
#'
pruneTree <- function(tree, tips2keep = NULL){
  # get a specific value from data frame
  getValueFromDF<-function(x, df, getCol, mergeCol="label"){
    if(getCol %in% colnames(df)){
      res<-df[df[mergeCol] == x, getCol]
      #if the rows have exactly the same label, they are identical in values
      # because the values are calculated from label
      if(length(res)!=1) res<-res[1]
    }
    else{
      res <-NULL
    }
    res
  }
  
  treedf <- tidytree::as_tibble(tree)
  ndf <- lapply(treedf$label, splitCafeLabel)
  ndf <- as.data.frame(do.call(rbind, ndf))
  ndf['id'] <- 1:nrow(ndf)
  ndf['label'] <- treedf$label
  if("genePresent" %in% colnames(treedf)){
    ndf['genePresent']<-treedf["genePresent"]
  }
  treedf$name <- ndf$name
  
  if(!is.null(tips2keep)){
    apeKeep <- unlist(treedf[treedf$name %in% tips2keep,"label"])
    tree<- ape::as.phylo(tree)
    tree<- ape::keep.tip(tree, apeKeep)
    treedf <- tidytree::as_tibble(tree)
  }
  
  treedf$data <- treedf$label
  
  treedf$genePresent <- sapply(treedf$label, getValueFromDF,
                               ndf ,"genePresent")
  
  
  treedf$name <-sapply(treedf$label, getValueFromDF, ndf ,"name")
  treedf$n <- sapply(treedf$label, getValueFromDF, ndf ,"n")
  
  if(all(is.null(unlist(treedf$genePresent)))){
    treedf$genePresent <- as.numeric(treedf$n)
  }
  
  tidytree::as.treedata(treedf)
  
}

#' Split Cafe Node Label
#' 
#' Split the node label of an Ensebl Cafe tree according to documentation
#' see: \url{https://rest.ensembl.org/documentation/info/cafe_tree}
#'
#' @param x - string from Ensembl Cafe Newick tree node 
#'
#' @return a named string vector c(name=name, n=n, pval=pval)
#'
#' @examples splitCafeLabel("Oncorhynchus_mykiss_1_0.5124")
splitCafeLabel <- function(x){
  item <- strsplit(x,"_")[[1]]
  nrs <- c()
  for(i in 1:length(item)){
    y <- suppressWarnings(as.numeric(item[i]))
    if(!is.na(y)) nrs <- c(nrs, i)
  }
  name <- paste0(item[setdiff(1:length(item), nrs)], collapse = "_")
  pi <- nrs[length(nrs)]
  if(length(nrs)>1){
    pval <- as.numeric(item[pi])
    n <- as.numeric(item[pi-1]) 
  }
  else{
    pval <- NA
    n <- as.numeric(item[pi]) 
    
  }
  return(c(name=name, n=n, pval=pval))
  
}


#' Get Names of Children for a Tree Node 
#' 
#' Get all of the children (or tips only) of a selected node from a newick tree 
#' downloaded from Ensembl Cafe. 
#'
#' @param tree \code{\link[tidytree]{treedata-class}} with data
#' @param nodeName string of the node name whom children are requested
#' the name string is generated from tip labels using \code{splitCafeLabel}
#' @param tipsOnly boolean should only the tips be returned if false returns
#'  also intermediate nodes
#'
#' @return named list of strings with node children
getNodeChildrenNames <- function(tree, nodeName, tipsOnly = T){
  fish<-names(getNodeChildren(pruneTrees(list(tree), NULL)[[1]],
                              nodeName, tipsOnly))
  
  fish<-sapply(fish, function(x){ splitCafeLabel(x)['name']}
  )
  names(fish) <- sapply(fish, function(x){
    items <- strsplit(x, "_", T)[[1]]
    paste(items[1], items[2])
  })
  fish 
}


#' Get Gene Counts for Tree Nodes 
#' 
#' Get gene counts for each node in a  a newick tree  downloaded from
#' Ensembl Cafe. The function assumes that the data has beenlonded with
#' \code{loadGeneTrees} and pre-proccessed with \code{pruneTrees}
#'
#' @param trees -list of \code{\link[tidytree]{treedata-class}} 
#'
#' @return a data frame like list with gene count data for each species and gene
#'
#' @seealso \code{pruneTree}
getGeneCounts<-function(trees){
  data <- lapply(trees, function(x){x@data})
  data <- Map(cbind, data, ensembl_gene_id = names(data))
  data <- do.call(rbind, data)
  row.names(data) <- 1:nrow(data)
  geneDupliLong <- data
  geneDupliLong$nodeGeneCount <-geneDupliLong$n
  
  geneDupliLong$sp<-sapply(geneDupliLong$name, function(x){
    items <- strsplit(x, "_", T)[[1]]
    if(length(items)>1){
      return(paste0(items[1],"_",items[2]))
    }
    return(NA)
  })
  geneDupliLong$isSpecies <- ifelse(is.na(geneDupliLong$sp), F, T)
  
  geneDupliLong
}



#' Get the Children of a Tree Node
#'
#' @param tree \code{\link[tidytree]{treedata-class}} 
#' @param nodeName  string of the node name whom children are requested
#' @param tipsOnly boolean should only the tips be returned if false returns
#'  also intermediate nodes
#'
#' @return node labels of the child nodes
getNodeChildren <- function(tree, nodeName, tipsOnly = T){
  
  node_lbls <- unique(unlist(tree@data[tree@data$name == nodeName,"data"]))
  children <- sapply(node_lbls, function(lbl, tree, tipsOnly){
    x<- tidytree::nodeid(tree, lbl)
    res <- tidytree::offspring(tree, x, tipsOnly, self_include = T)
    return(res)
  }, tree, tipsOnly)
  children <- as.vector(unname(unlist(children)))
  children <-sapply(children, function(child, three){
    names(child)<-tidytree::nodelab(tree, child)
    child
  }, tree)
  children[!duplicated(children)]
  
}


#' Add a new species to a phylogenetic tree
#'
#' This function adds a new species to a phylogenetic tree at the most recent common ancestor (MRCA) of a specified genus.
#' If there is only one species in the genus, the function splits the edge leading to the single species into two halves, and adds the new species at the midpoint.
#'
#' @param tree A phylo object representing the original tree.
#' @param species2Add A character string specifying the name of the species to add.
#' @param genus2search A character string specifying the name of the genus to search for.
#' @return A phylo object representing the modified tree.
#' @examples
#' # tree <- add_species_to_tree(tree, "Oncorhynchus_gilae", "Oncorhynchus")
#' @export
add_species_to_tree <- function(tree, species2Add, genus2search) {
  # Find the most recent common ancestor (MRCA) of the specified genus
  items <- as.logical(rowSums(unlist(sapply(genus2search, grepl, x= tree$tip.label))))
  mrca <- ape::getMRCA(tree, tree$tip.label[items])
 
  if (is.null(mrca)) {
    existing_species <- tree$tip.label[grepl(genus2search, tree$tip.label)]
    return(split_tip(tree, existing_species, species2Add))
    
  }
    # Extract the subtree of the MRCA
    tree_mrca <- ape::extract.clade(tree, mrca)
    
    # Calculate the average edge length from the MRCA to the tips
    mrcaEdge <- mean(ape::cophenetic.phylo(tree_mrca))
  
    # Add the new species to the tree as a new tip
    tree <- phytools::bind.tip(tree, species2Add, where=mrca, edge.length = mrcaEdge)
    #find the node lable of the added tip
    newNode <- tree$edge[tree$edge[, 2] == match(species2Add, tree$tip.label), 1]
    #replace where the node label is NA with the new node label
    tree$node.label[tree$node.label == "NA"] <- newNode
    #
    
  if(max(table(tree$node.label))>1){
    warning("Duplicated node labels")
  }
  return(tree)
}
split_tip <- function(tree, existing_species, species2Add) {
  mrca <- which(tree$tip.label==existing_species)
  edge_index <- tree$edge[tree$edge[, 2] == match(existing_species, tree$tip.label), 1]
  # Get the length of the edge
  originalEdge <- tree$edge.length[edge_index]
  mrcaEdge <- originalEdge / 2
  tree$edge.length[edge_index] <- mrcaEdge
  tree <- phytools::bind.tip(tree, species2Add, where=mrca, edge.length = mrcaEdge)
  
  # Get the edge index for the newly added species
  updated_edge_index <- tree$edge[tree$edge[, 2] == match(existing_species, tree$tip.label), 1]
  
  #TODO: update the edge matrix and edge indexes so that the new species 
  #has a different edge index than the existing species
  
  tree$edge.length[updated_edge_index] <- originalEdge
  
  return(tree)
}



prune_tree_with_df <- function(tree, dfSumStat) {
  #prune the tree
  dfSumStat$s_p <- gsub(" ","_", dfSumStat$sp)
  toKeep <-  intersect(dfSumStat$s_p, tree$tip.label)
  if(length(toKeep) < length(dfSumStat$sp)){
    warning(length(dfSumStat$s_p) - length(toKeep), " species were removed from the tree.",
            " Theese were: ", paste(setdiff(dfSumStat$s_p, tree$tip.label), collapse = ", "),
            call. = FALSE)
    #remove the species from data as well
    dfSumStat <- dfSumStat[dfSumStat$s_p %in% toKeep,]
  }
  treeReduced <- ape::keep.tip(tree, toKeep)
  treeReduced$tip.label <- sapply(treeReduced$tip.label, sub, pattern="_", replacement=" ")
  
  #first column of data should be matched with tip label of tree
  dfSumStat[,1] <- sapply(dfSumStat$sp, sub, pattern="_", replacement=" ")
  

  return(list(tree=treeReduced, df=dfSumStat))
}












