# Custom 'gene70' function from the genefu R package, with the risk threshold
# changed from 0.3 to 0.4 (the value used by Esserman et al.).
#
# Scores every sample (row of `data`) by the negated cosine similarity between
# its expression values and the "average.good.prognosis.profile" column of
# `sig.gene70`, then dichotomises the score at -0.4.
#
# Arguments:
#   data       Expression matrix, samples in rows and probes/genes in columns.
#   annot      Probe annotation with an "EntrezGene.ID" column and row names;
#              only read when do.mapping = TRUE.
#   do.mapping If TRUE, probes are matched to the signature through Entrez
#              gene ids via geneid.map(); otherwise by column name.
#   mapping    Accepted for signature compatibility; not used in this body.
#   std        Standardisation applied to each column before scoring:
#              "none", "scale" (scale()) or "robust"
#              ((rescale(x, q = 0.05) - 0.5) * 2).
#   verbose    If TRUE, progress messages are emitted.
#
# Returns a list with
#   score    named numeric vector, one negated cosine per sample;
#   risk     1 where score >= -0.4, otherwise 0;
#   mapping  c(mapped = <probes used>, total = <probes in sig.gene70>);
#   probe    probe mapping table when do.mapping = TRUE, otherwise NA.
#
# `sig.gene70`, `geneid.map`, `rescale` and `cosine` are not defined in this
# block and must be available in the calling environment.
`gene70` <-
  function(data, annot, do.mapping=FALSE, mapping, std=c("none", "scale", "robust"), verbose=FALSE) {
    
    std <- match.arg(std)
    gt <- nrow(sig.gene70)
    if(do.mapping) {
      gid1 <- as.numeric(as.character(sig.gene70[ ,"EntrezGene.ID"]))
      names(gid1) <- dimnames(sig.gene70)[[1]]
      gid2 <- as.numeric(as.character(annot[ ,"EntrezGene.ID"]))
      names(gid2) <- dimnames(annot)[[1]]
      ## remove missing and duplicated geneids from the gene list
      rm.ix <- is.na(gid1) | duplicated(gid1)
      gid1 <- gid1[!rm.ix]
      
      rr <- geneid.map(geneid1=gid2, data1=data, geneid2=gid1, verbose=FALSE)
      gm <- length(rr$geneid2)
      if(is.na(rr$geneid1[1])) {
        gm <- 0
        ## no gene ids in common: return NA for every sample
        res <- rep(NA, nrow(data))
        names(res) <- dimnames(data)[[1]]
        gf <- c("mapped"=0, "total"=gt)
        if(verbose) { message(sprintf("probe candidates: 0/%i", gt)) }
        return(list("score"=res, "risk"=res, "mapping"=gf, "probe"=NA))
      }
      gid1 <- rr$geneid2
      gid2 <- rr$geneid1
      data <- rr$data1
      mymapping <- c("mapped"=gm, "total"=gt)
      myprobe <- cbind("probe"=names(gid1), "EntrezGene.ID"=gid1, "new.probe"=names(gid2))
      sig2 <- sig.gene70[names(gid1), , drop=FALSE]
      ## change the names of probes in the data
      dimnames(data)[[2]] <- names(gid2) <- names(gid1)
    } else {
      ## drop=FALSE keeps `data` two-dimensional when a single probe (or a
      ## single sample) remains; otherwise the dimnames lookup below fails
      common <- intersect(dimnames(sig.gene70)[[1]], dimnames(data)[[2]])
      data <- data[ , common, drop=FALSE]
      sig2 <- sig.gene70[dimnames(data)[[2]], , drop=FALSE]
      gm <- nrow(sig2)
      mymapping <- c("mapped"=gm, "total"=gt)
      myprobe <- NA
    }
    
    if(verbose && gm != gt) { message(sprintf("%i/%i probes are used to compute the score", gm, gt)) }
    
    ## scaling
    switch(std,
           "scale"={
             data <- scale(data, center=TRUE, scale=TRUE)
             if(verbose) { message("standardization of the gene expressions") }
           }, 
           "robust"={
             data <- apply(data, 2, function(x) { return((rescale(x, q=0.05, na.rm=TRUE) - 0.5) * 2) })
             if(verbose) { message("robust standardization of the gene expressions") }
           }, 
           "none"={ if(verbose) { message("no standardization of the gene expressions") } })
    
    ## one cosine per sample against the good-prognosis profile
    score <- apply(X=data, MARGIN=1, FUN=function (x, y) {
      cosine(x=x, y=y)
    }, y=sig2[, "average.good.prognosis.profile"])
    ## negate so that a larger score means less similarity to the good profile
    score <- -score
    official.cutoff <- -0.4
    ## NOTE(review): the remark inherited from genefu ("cutoff leaving 59% of
    ## patients in the poor prognosis group in the original dataset") was
    ## presumably written for the original 0.3 threshold -- confirm before
    ## quoting it for 0.4.
    risk <- ifelse(score >= official.cutoff, 1, 0)
    
    names(score) <- names(risk) <- dimnames(data)[[1]]
    
    return(list("score"=score, "risk"=risk, "mapping"=mymapping, "probe"=myprobe))
  }

# Collapse a probe-level table to one row per gene, keeping for each gene the
# row whose values have the largest variance.
#
# eset: any object as.data.table() can convert into a table that has a column
#       named "id" (the gene identifier); every other column is fed to var(),
#       so it is expected to be numeric.
#
# Returns a data.table with "id" as first column followed by the remaining
# input columns, one row per id. Rows whose variance is NA never reach the
# result because aggregate()'s formula interface omits them by default.
keepHighestVariant <- function(eset) {
  
  dt <- as.data.table(eset)
  
  # Row-wise variance over every column except the gene identifier.
  dt$var <- apply(dt[,-c("id")], 1, var, na.rm = TRUE)
  highest_var_per_gene <- aggregate(var ~ id, dt, max)
  # Joining back on (id, var) keeps only rows that reach the per-gene maximum.
  dt <- merge(dt, highest_var_per_gene, by = c("id", "var"))
  # Several rows of one gene can share the maximal variance; keep the first so
  # the result really holds a single row per gene.
  dt <- dt[!duplicated(dt$id), ]
  dt <- dt[,-"var"]
  return(dt)
  
}

# Reduce an expression set to one feature per Entrez gene id, keeping for each
# gene the feature whose expression values have the largest variance.
#
# eset: object exposing `@assayData$exprs`, `@featureData@data`, fData(),
#       rownames() and row subsetting with `[`; its feature data must hold
#       either a "Gene ID" or an "EntrezGene.ID" column.
#
# Returns `eset` subset to the selected features, in the order produced by
# the merge on (gene id, variance). Stops with an error when neither gene id
# column is present.
#
# NOTE(review): the helper columns are called V1/V2/var; if the expression
# matrix has no column names, as.data.table() presumably names the sample
# columns V1, V2, ... as well and they would be overwritten -- confirm that
# inputs always carry sample names.
keepHighestVariantENTREZ <- function(eset) {
  
  dt <- as.data.table(eset@assayData$exprs)
  
  # V1 holds the gene identifier of every feature.
  if("Gene ID" %in% colnames(fData(eset))) {
    dt$V1 <- eset@featureData@data$`Gene ID`
  } else if ("EntrezGene.ID" %in% colnames(fData(eset))) {
    dt$V1 <- unlist(eset@featureData@data$EntrezGene.ID)
  } else {
    stop("column with entrez id not found.")
  }
  
  # V2 holds the feature name, used at the end to subset `eset`.
  dt$V2 <- rownames(eset)
  # Row-wise variance over the expression columns only.
  dt$var <- apply(dt[,-c("V1","V2")], 1, var, na.rm = TRUE)
  highest_var_per_gene <- aggregate(var ~ V1, dt, max)
  # Joining back on (V1, var) keeps only features reaching the per-gene maximum.
  dt <- merge(dt, highest_var_per_gene, by = c("V1", "var"))
  # Features of one gene can tie on the maximal variance; keep the first so
  # that every gene is represented by exactly one feature.
  dt <- dt[!duplicated(dt$V1), ]
  eset <- eset[dt$V2,]
  return(eset)
  
}

#(ncol(duplicated_gene_counts) - 3):ncol(duplicated_gene_counts)

# Collapse a probe-level table to one row per gene by averaging, within each
# gene, every value column.
#
# eset: any object as.data.table() can convert into a table whose LAST column
#       is named "id" (the gene identifier); all preceding columns are
#       averaged per id after as.numeric() conversion, ignoring NAs.
#
# Returns a data.table with the same columns as the input and one row per id.
keepMeanValues <- function(eset) {
  
  dt <- as.data.table(eset)
  
  # Every column but the last ("id") is a value column; seq_len() also copes
  # with a table that has no value column at all.
  value_cols <- seq_len(ncol(dt) - 1)
  # Replace each value by the mean of its gene, column by column.
  dt[, (value_cols) := lapply(.SD, function(x) mean(as.numeric(x), na.rm = TRUE)), by = "id", .SDcols = value_cols]
  # After averaging, all rows of a gene are identical, so keep the first row
  # per id. Keying on "id" rather than on the values keeps distinct genes
  # apart even when their averaged profiles coincide (e.g. all-NaN rows).
  dt <- dt[!duplicated(dt$id), ]
  return(dt)
  
}

# Annotate the samples of `geo_platform` with HER2 status and RCB class taken
# from the I-SPY 1 clinical tables and, optionally, drop samples that do not
# qualify.
#
# Arguments:
#   geo_platform          expression set whose phenotype data hold the
#                         "i-spy id:ch2" and neoadjuvant-chemotherapy columns.
#   ispy_1_clinical       table with columns SUBJECTID and Her2MostPos.
#   ispy_1_surv_clinical  table with columns SUBJECTID and RCBClass.
#   remove_samples        if TRUE, drop samples whose chemotherapy code is 3
#                         (or cannot be read), and samples with missing
#                         Her2MostPos or RCBClass.
#
# Returns the expression set with two extra phenotype columns, `new_her2` and
# `new_rcbclass`. Samples whose I-SPY id is absent from a clinical table get
# NA in the corresponding column (first match wins if an id is repeated).
keepEssermanSamples <- function(geo_platform, ispy_1_clinical, ispy_1_surv_clinical, remove_samples = TRUE) {
  
  # Value of `column` for each id, looked up by SUBJECTID; NA when the id is
  # missing from `clinical`. A vectorised match() always returns one value
  # per sample, so the is.na() filters below see unmatched samples too.
  lookup_by_subject <- function(ids, clinical, column) {
    row_idx <- match(ids, clinical$SUBJECTID, incomparables = NA)
    clinical[[column]][row_idx]
  }
  
  # no trastuzumab
  if (remove_samples) {
    chemo_field <- geo_platform@phenoData@data$`neoadjuvant chemotherapy* (1 = ac only; 2 = ac + t; 3 = ac + t + herceptin; 4 = ac + t + other;):ch2`
    # The code is the second ":"-separated piece of the field; entries without
    # one give NA and are dropped by which().
    chemo_code <- vapply(strsplit(as.character(chemo_field), split = ":"),
                         function(parts) parts[2], character(1))
    gplgeo_platform <- geo_platform[, which(chemo_code != 3)]
  } else {
    gplgeo_platform <- geo_platform
  }
  
  gplgeo_platform$new_her2 <- lookup_by_subject(gplgeo_platform[["i-spy id:ch2"]], ispy_1_clinical, "Her2MostPos")
  if (remove_samples) {
    gplgeo_platform <- gplgeo_platform[, !is.na(gplgeo_platform$new_her2)]
  }
  
  # RCB class in ispy-1 file and microarrays is the same
  gplgeo_platform$new_rcbclass <- lookup_by_subject(gplgeo_platform[["i-spy id:ch2"]], ispy_1_surv_clinical, "RCBClass")
  if (remove_samples) {
    gplgeo_platform <- gplgeo_platform[, !is.na(gplgeo_platform$new_rcbclass)]
  }
  
  return(gplgeo_platform)
}

# Build a matrix of background-subtracted channel ratios for all samples of
# one platform in a GEO series.
#
# gse:        object whose samples are returned by GSMList(gse).
# platformID: platform id; only samples whose Meta()$platform_id equals it
#             are used.
#
# Returns a matrix with one column per selected sample and one row per table
# row, holding (CH2_MEAN - CH2_BKD_MEDIAN) / (CH1_MEAN - CH2BN_MEDIAN).
# No log transformation is applied here.
#
# NOTE(review): the denominator subtracts "CH2BN_MEDIAN" from the channel-1
# mean; a channel-1 background column may have been intended -- confirm
# against the platform's column definitions.
extract_background_sub_ratio_gse <- function(gse, platformID) {
  on_platform <- function(sample) {
    Meta(sample)$platform_id == platformID
  }
  samples <- Filter(on_platform, GSMList(gse))
  
  ratio_for_sample <- function(sample) {
    tab <- Table(sample)
    numerator <- tab[, "CH2_MEAN"] - tab[, "CH2_BKD_MEDIAN"]
    denominator <- tab[, "CH1_MEAN"] - tab[, "CH2BN_MEDIAN"]
    numerator / denominator
  }
  per_sample <- lapply(samples, ratio_for_sample)
  
  # Stack samples as rows, then transpose so that samples become columns.
  stacked <- do.call(rbind, per_sample)
  t(stacked)
}

# Build a matrix of channel-2 / channel-1 mean-intensity ratios for all
# samples of one platform in a GEO series.
#
# gse:        object whose samples are returned by GSMList(gse).
# platformID: platform id; only samples whose Meta()$platform_id equals it
#             are used. It also selects which table columns form the ratio:
#               "GPL180"                 CH2I_MEAN / (CH1D_MEAN + CH1B_MEAN)
#               "GPL2776/2777/2778"      CH2I_MEAN / CH1I_MEAN
#               anything else            CH2_MEAN  / CH1_MEAN
#
# Returns a matrix with one column per selected sample and one row per table
# row. No log transformation is applied here.
extract_mean_ratio_gse <- function(gse, platformID) {
  gsmList <- Filter(function(gsm) {Meta(gsm)$platform_id==platformID}, GSMList(gse))
  
  if (platformID == "GPL180") {
    # The sum must be parenthesised: without it the expression evaluates as
    # (CH2I / CH1D) + CH1B, which adds an intensity to a ratio.
    # Presumably CH1D_MEAN is the background-subtracted channel-1 intensity,
    # so CH1D + CH1B rebuilds the raw intensity used by the other branches --
    # confirm against the GPL180 column definitions.
    ratio_from_table <- function(tab) {
      tab[,"CH2I_MEAN"] / (tab[,"CH1D_MEAN"] + tab[,"CH1B_MEAN"])
    }
  } else if (platformID %in% c("GPL2776","GPL2777","GPL2778")) {
    ratio_from_table <- function(tab) {
      tab[,"CH2I_MEAN"] / tab[,"CH1I_MEAN"]
    }
  } else {
    ratio_from_table <- function(tab) {
      tab[,"CH2_MEAN"] / tab[,"CH1_MEAN"]
    }
  }
  
  mean_ratios <- lapply(gsmList, function(x) ratio_from_table(Table(x)))
  # Stack samples as rows, then transpose so that samples become columns.
  mean_ratios <- do.call(rbind, mean_ratios)
  mean_ratios <- t(mean_ratios)
  
  return(mean_ratios)
}

# Expand a vector of comma-separated strings into a long table with one row
# per (element, entry) pair.
#
# gb_list: character vector; each element may hold several entries separated
#          by ",".
#
# Returns a data frame with two columns: the first is the position of the
# element in `gb_list` (melt's first index column), the second, `value`, is
# one entry of that element. Empty pieces and duplicated rows are removed and
# unused factor levels dropped. Nothing in the body is specific to p53
# despite the function name.
keep_only_p53 <- function(gb_list) {
  # Split into enough pieces for the longest element. A fixed width of 10
  # would leave everything after the ninth comma glued together in the last
  # piece; never going below 10 keeps the result identical for shorter input.
  n_pieces <- max(10, str_count(gb_list, fixed(",")) + 1, na.rm = TRUE)
  gb_list_fixed <- str_split_fixed(gb_list, fixed(","), n_pieces)
  # Long format: one row per matrix cell (row index, column index, value).
  gb_list_fixed <- reshape2::melt(gb_list_fixed)
  # Drop the padding cells and the piece-position column.
  gb_list_fixed <- gb_list_fixed[gb_list_fixed$value != "",c(1,3)]
  gb_list_fixed <- unique(gb_list_fixed)
  
  gb_list_fixed <- droplevels(gb_list_fixed)
  return(gb_list_fixed)
}

# Classify one expression profile as "wt" or "mut" by comparing its Spearman
# correlation with two reference profiles.
#
# x_52:        numeric vector, the profile to classify.
# average_wt:  numeric vector of the same length, reference for "wt".
# average_mut: numeric vector of the same length, reference for "mut".
#
# Returns a character vector of length 4:
#   correlation with average_wt, correlation with average_mut, their
#   difference (wt - mut), and the status. The status is "wt" when the wt
#   correlation is strictly larger, "mut" otherwise, and NA when either
#   correlation cannot be computed (e.g. a constant profile or no complete
#   pairs). The numeric entries are coerced to character by c().
p53_classify <- function(x_52, average_wt, average_mut) {
  
  wt_corr <- cor(x_52, average_wt, method = "spearman", use = "pairwise.complete.obs")
  mut_corr <- cor(x_52, average_mut, method = "spearman", use = "pairwise.complete.obs")
  
  if (is.na(wt_corr) || is.na(mut_corr)) {
    # An undefined correlation cannot be ranked; report an unknown status
    # instead of failing inside `if`.
    status <- NA_character_
  } else if (wt_corr > mut_corr) {
    status <- "wt"
  } else {
    status <- "mut"
  }
  
  return(c(wt_corr, mut_corr, (wt_corr - mut_corr), status))
}