library(AnnotationDbi)
library(hgug4112a.db)
library(stringr)
library(GEOquery)
library(data.table)
library(sva)
library(dplyr)
library(genefu)
library(lsa)
data("sig.gene70")
source("custom_funcs.R")

# The next two commands avoid errors with GEOquery.
readr::local_edition(1)
Sys.setenv(VROOM_CONNECTION_SIZE = 500072)

# Download the GSE22226 series matrices together with their GPL annotation.
# [[1]] is GPL1708, [[2]] is GPL4133
# NOTE(review): the platform order above is the original author's observation;
# it is not verified here -- confirm if GEO ever changes the file order.
gse22226 <- getGEO("GSE22226", GSEMatrix = TRUE, getGPL = TRUE, AnnotGPL = TRUE)
gpl1708 <- gse22226[[1]]
gpl4133 <- gse22226[[2]]

# I-SPY 1 clinical tables read from the working directory.
# The "NULL" column classes drop columns 4-6 of the tab-separated file
# (assumes a six-column file; colClasses is recycled otherwise).
ispy_1_clinical <- read.csv(
  "I-SPY 1 All Patient Clinical and Outcome Data.txt",
  sep = "\t",
  colClasses = c(NA, NA, NA, "NULL", "NULL", "NULL")
)
ispy_1_surv_clinical <- read.table(
  "ispy1_surv_clinical_data.csv",
  header = TRUE,
  sep = ","
)

# Remove the 130th GPL1708 sample: its clinical information is completely missing.
# NOTE(review): this is a positional index, so it silently removes a different
# sample if the sample order of the downloaded matrix changes.
gpl1708 <- gpl1708[, -130]

# Whether to go for the recreation of the Esserman et al. analysis.
# The flag is forwarded to keepEssermanSamples() and also selects which output
# files are written at the end of the script.
remove_samples_for_esserman_recreation <- FALSE

gpl1708 <- keepEssermanSamples(
  gpl1708,
  ispy_1_clinical,
  ispy_1_surv_clinical,
  remove_samples = remove_samples_for_esserman_recreation
)
gpl4133 <- keepEssermanSamples(
  gpl4133,
  ispy_1_clinical,
  ispy_1_surv_clinical,
  remove_samples = remove_samples_for_esserman_recreation
)

# Esserman et al. gene expression data acquisition: fetch the full GSE record
# (GSEMatrix = FALSE) so the raw intensities can be reprocessed.
gse <- getGEO("GSE22226", GSEMatrix = FALSE, getGPL = TRUE, AnnotGPL = FALSE)

# Recalculate the raw Esserman et al. intensity values per platform; after the
# log10 below they become log10 median subtracted intensity values
# (to match the MammaPrint values).
background_sub_ratios1708 <- extract_background_sub_ratio_gse(gse, platformID = "GPL1708")
background_sub_ratios4133 <- extract_background_sub_ratio_gse(gse, platformID = "GPL4133")
# The GSE record is not needed after the two extractions above.
rm(gse)

# Align the recalculated ratios to the probes (rows) and samples (columns) kept
# in each ExpressionSet.
# NOTE(review): if the feature ID column is numeric, the row lookup is
# positional rather than by row name -- confirm against
# extract_background_sub_ratio_gse().
new_expr1708 <- background_sub_ratios1708[gpl1708@featureData@data$ID, gpl1708$geo_accession]
new_expr4133 <- background_sub_ratios4133[gpl4133@featureData@data$ID, gpl4133$geo_accession]

# GPL1708: store the log10 values, then set Inf/NA/NaN entries to zero so the
# similarity calculation can run (a 0 value has no effect on cosine similarity).
exprs(gpl1708) <- log10(new_expr1708)
exprs(gpl1708) <- replace(exprs(gpl1708), !is.finite(exprs(gpl1708)), 0)

# GPL4133: same treatment.
exprs(gpl4133) <- log10(new_expr4133)
exprs(gpl4133) <- replace(exprs(gpl4133), !is.finite(exprs(gpl4133)), 0)

# Probe -> Entrez ID annotation of the Esserman et al. microarrays taken from
# hgug4112a.db. It is only consumed by the commented-out alternative below.
gse22226_annotation <- AnnotationDbi::select(
  hgug4112a.db,
  keys = gpl1708@featureData@data$Platform_SPOTID,
  columns = c("PROBEID", "ENTREZID")
)

# genefu expects the annotation columns 'probe' and 'EntrezGene.ID', so the GEO
# columns 'ID' and 'Gene ID' are renamed accordingly.
#
# Choice of annotation source (original author's findings):
#   * with hgug4112a.db:    14 (low-risk) / 104 (high-risk) MammaPrint classification
#   * without hgug4112a.db: 12 (low-risk) / 106 (high-risk) MammaPrint classification
# The discrepancy appears when transforming the GPL1708 GEO annotation to
# hgug4112a.db, which leaves probe 20700 (MTDH gene) out of the final
# annotation because it has no ENTREZ ID information (NA). The rest of the
# annotation is the same, so the original GPL1708 & GPL4133 annotation is used.
gpl1708@featureData@data <- dplyr::rename(
  gpl1708@featureData@data,
  probe = ID,
  EntrezGene.ID = `Gene ID`
)
# Next command is testing the hgug4112a.db annotation # NOT THE ONE USED
#gpl1708@featureData@data$EntrezGene.ID <- sapply(gpl1708@featureData@data$Platform_SPOTID, function(probe_id) unique(gse22226_annotation[which(gse22226_annotation$PROBEID == probe_id), "ENTREZID"]))

gpl4133@featureData@data <- dplyr::rename(
  gpl4133@featureData@data,
  probe = ID,
  EntrezGene.ID = `Gene ID`
)
# Next command is testing the hgug4112a.db annotation # NOT THE ONE USED
#gpl4133@featureData@data$EntrezGene.ID <- sapply(gpl4133@featureData@data$Platform_SPOTID, function(probe_id) unique(gse22226_annotation[which(gse22226_annotation$PROBEID == probe_id), "ENTREZID"]))

# Score every Esserman et al. sample with genefu::gene70() (MammaPrint
# signature), separately per platform. Samples go in rows, hence t(exprs(.)).
# std = "none": the expression values are passed without further
# standardisation; do.mapping = TRUE maps probes through EntrezGene.ID.
mammaprint_1708 <- gene70(
  data = t(exprs(gpl1708)), # named argument; `data <- ...` here leaked a global `data`
  annot = gpl1708@featureData@data,
  do.mapping = TRUE,
  std = "none",
  verbose = TRUE
)
mammaprint_4133 <- gene70(
  data = t(exprs(gpl4133)),
  annot = gpl4133@featureData@data,
  do.mapping = TRUE,
  std = "none",
  verbose = TRUE
)

# Clinical columns attached to the scores ("geo_accession" is the merge key).
pheno_columns <- c(
  "geo_accession",
  "i-spy id:ch2",
  "pathological complete response (pcr):ch2",
  "new_rcbclass"
)

# One row per sample: GSM accession, gene70 score and risk call, then the
# clinical columns merged in by GSM accession.
mammaprint_1708_scores <- data.frame(
  GSM = names(mammaprint_1708$score),
  MammaPrint.low.risk.correlation = mammaprint_1708$score,
  MammaPrint.risk.category = mammaprint_1708$risk
)
mammaprint_1708_scores <- merge(
  mammaprint_1708_scores,
  gpl1708@phenoData@data[, pheno_columns],
  by.x = "GSM",
  by.y = "geo_accession"
)

mammaprint_4133_scores <- data.frame(
  GSM = names(mammaprint_4133$score),
  MammaPrint.low.risk.correlation = mammaprint_4133$score,
  MammaPrint.risk.category = mammaprint_4133$risk
)
mammaprint_4133_scores <- merge(
  mammaprint_4133_scores,
  gpl4133@phenoData@data[, pheno_columns],
  by.x = "GSM",
  by.y = "geo_accession"
)

# Combine both platforms and order by I-SPY patient id.
mammaprint_both_platforms <- dplyr::union(mammaprint_1708_scores, mammaprint_4133_scores)
mammaprint_both_platforms <- dplyr::arrange(mammaprint_both_platforms, `i-spy id:ch2`)

# Drop the GSM column and put the I-SPY id first. Columns are selected by name
# (same columns and order as the former positional c(4, 2, 3, 5, 6)) so the
# selection cannot silently pick the wrong column.
output_columns <- c(
  "i-spy id:ch2",
  "MammaPrint.low.risk.correlation",
  "MammaPrint.risk.category",
  "pathological complete response (pcr):ch2",
  "new_rcbclass"
)
mammaprint_both_platforms <- mammaprint_both_platforms[, output_columns]

# Write the results.
#   * Recreation run (remove_samples_for_esserman_recreation is TRUE): append
#     mapping rates, risk-category counts and pCR / RCB ratios to the
#     statistics file, then write the per-sample table.
#   * Otherwise: only write the per-sample table.
if (remove_samples_for_esserman_recreation) {
  recreation_dir <- "gene_signatures_pre_publication/recreation_study_esserman_et_al_files"
  stats_file <- file.path(recreation_dir, "mammaprint_esserman_statistics.txt")

  # Appends one entry to the statistics file (the file is never truncated here,
  # so repeated runs accumulate).
  append_stat <- function(x) {
    write(x, stats_file, append = TRUE)
  }

  # Counts the TRUE entries of a logical mask. NA entries (missing pCR, RCB or
  # risk call) are ignored; subsetting the data frame with such a mask and
  # calling nrow() would count every NA as an extra row and inflate the ratios.
  count_true <- function(mask) {
    sum(mask, na.rm = TRUE)
  }

  risk <- mammaprint_both_platforms$MammaPrint.risk.category
  pcr_positive <- mammaprint_both_platforms$`pathological complete response (pcr):ch2` == "Yes"
  rcb_0_or_1 <- mammaprint_both_platforms$new_rcbclass == 0 |
    mammaprint_both_platforms$new_rcbclass == 1
  is_low_risk <- risk == 0
  is_high_risk <- risk == 1
  n_low_risk <- count_true(is_low_risk)
  n_high_risk <- count_true(is_high_risk)
  risk_table <- table(risk)

  # Probe mapping rate reported by gene70() for each platform.
  append_stat(paste0("GPL1708 mapping rate: ", mammaprint_1708$mapping[1], "/", mammaprint_1708$mapping[2]))
  append_stat(paste0("GPL4133 mapping rate: ", mammaprint_4133$mapping[1], "/", mammaprint_4133$mapping[2]))

  # Risk-category header (category labels) followed by the counts.
  append_stat(paste0("Number of samples per risk category (0: Low risk, 1: High risk):"))
  append_stat(paste0(names(risk_table)[1], "\t", names(risk_table)[2]))
  append_stat(risk_table)

  # pCR and RCB (class 0 or I) ratios within each risk category.
  append_stat(paste0("Low risk with PCR (positive): ", count_true(pcr_positive & is_low_risk), "/", n_low_risk))
  append_stat(paste0("High risk with PCR (positive): ", count_true(pcr_positive & is_high_risk), "/", n_high_risk))
  append_stat(paste0("Low risk with RCB (0 or I class): ", count_true(rcb_0_or_1 & is_low_risk), "/", n_low_risk))
  append_stat(paste0("High risk with RCB (0 or I class): ", count_true(rcb_0_or_1 & is_high_risk), "/", n_high_risk))

  # NOTE(review): the doubled ".tsv.tsv" extension looks unintended; it is kept
  # unchanged because other scripts may read this exact file name.
  write.table(
    mammaprint_both_platforms,
    file.path(recreation_dir, "mammaPrint_signature_scores_and_classes_with_recreation_samples.tsv.tsv"),
    sep = "\t",
    row.names = FALSE,
    col.names = TRUE
  )
} else {
  write.table(
    mammaprint_both_platforms,
    "gene_signatures_pre_publication/mammaPrint_signature_scores_and_classes.tsv",
    sep = "\t",
    row.names = FALSE,
    col.names = TRUE
  )
}