library(AnnotationDbi)
library(hgug4112a.db)
library(stringr)
library(GEOquery)
library(data.table)
library(sva)
library(dplyr)
library(genefu)
library(lsa)
source("custom_funcs.R")

# Session settings applied before any GEOquery download. The original author
# notes that these two settings avoid errors raised while GEOquery reads the
# series files: readr is switched to its first edition and the connection
# buffer size is raised via VROOM_CONNECTION_SIZE.
Sys.setenv(VROOM_CONNECTION_SIZE = 500072)
readr::local_edition(1)

# Download the Esserman et al. series (GSE22226) as series-matrix objects with
# annotated platform (GPL) information attached.
gse22226 <- getGEO(
  "GSE22226",
  GSEMatrix = TRUE,
  getGPL = TRUE,
  AnnotGPL = TRUE
)

# Split the download by platform.
# NOTE(review): this relies on getGEO() returning GPL1708 first and GPL4133
# second -- confirm with names(gse22226) if the download layout ever changes.
gpl4133 <- gse22226[[2]]
gpl1708 <- gse22226[[1]]

# I-SPY 1 patient clinical information.
# The first file is tab-separated; only its first three columns are read
# (colClasses "NULL" drops columns four to six).
ispy_1_clinical <- read.csv(
  "I-SPY 1 All Patient Clinical and Outcome Data.txt",
  sep = "\t",
  colClasses = c(NA, NA, NA, "NULL", "NULL", "NULL")
)
# The survival/clinical file is comma-separated with a header row.
ispy_1_surv_clinical <- read.table(
  "ispy1_surv_clinical_data.csv",
  header = TRUE,
  sep = ","
)

# Drop sample (column) 130 of the GPL1708 set: its clinical information is
# completely missing.
# NOTE(review): the sample is identified by position, so this depends on the
# column order delivered by GEO -- confirm if the series is ever re-uploaded.
gpl1708 <- gpl1708[, -130]

# Switch for recreating the Esserman et al. analysis: when TRUE, the sample
# filter below is asked to remove samples (remove_samples = TRUE) and the
# results are written to the recreation-study output files.
remove_samples_for_esserman_recreation <- FALSE

# Apply the project-specific sample filter (defined in custom_funcs.R) to both
# platforms using the same clinical tables and the same switch.
gpl1708 <- keepEssermanSamples(
  gpl1708,
  ispy_1_clinical,
  ispy_1_surv_clinical,
  remove_samples = remove_samples_for_esserman_recreation
)
gpl4133 <- keepEssermanSamples(
  gpl4133,
  ispy_1_clinical,
  ispy_1_surv_clinical,
  remove_samples = remove_samples_for_esserman_recreation
)

# Second download of the Esserman et al. series, this time with
# GSEMatrix = FALSE so the per-sample records are available to the helper
# below.
gse <- getGEO("GSE22226", GSEMatrix = FALSE, getGPL = TRUE, AnnotGPL = FALSE)

# Recalculate the raw Esserman et al. intensity values to log2 without
# subtraction intensity values (so that the ROR-S values match).
# extract_mean_ratio_gse() comes from custom_funcs.R and is called once per
# platform.
mean_ratios1708 <- extract_mean_ratio_gse(gse, platformID = "GPL1708")
mean_ratios4133 <- extract_mean_ratio_gse(gse, platformID = "GPL4133")

# The full GSE object is not needed past this point.
rm(gse)

# Align each ratio matrix with its expression set: rows are selected by the
# feature IDs, columns by the GEO sample accessions.
new_expr1708 <- mean_ratios1708[fData(gpl1708)$ID, pData(gpl1708)$geo_accession]
new_expr4133 <- mean_ratios4133[fData(gpl4133)$ID, pData(gpl4133)$geo_accession]

# Replace the stored expression values with the log2-transformed ratios.
exprs(gpl1708) <- log2(new_expr1708)
exprs(gpl4133) <- log2(new_expr4133)

# Fetch the probe -> Entrez ID annotation from hgug4112a.db, keyed by the
# Platform_SPOTID values of the GPL1708 feature table. The same annotation
# table is reused for GPL4133 below.
gse22226_annotation <- AnnotationDbi::select(
  hgug4112a.db,
  keys = gpl1708@featureData@data$Platform_SPOTID,
  columns = c("PROBEID", "ENTREZID")
)

# Look up the unique Entrez IDs annotated to each spot ID.
# NOTE(review): sapply() only simplifies to a plain vector when every spot
# yields exactly one value; spots with no or several Entrez IDs make the result
# a list. Left as-is because the downstream mapping was validated this way.
entrez_ids_for_spots <- function(spot_ids) {
  sapply(spot_ids, function(probe_id) {
    matching_rows <- which(gse22226_annotation$PROBEID == probe_id)
    unique(gse22226_annotation[matching_rows, "ENTREZID"])
  })
}

# Annotate the Esserman et al. microarray probes according to the hgug4112a.db
# annotation (the original author reports 50/50 PAM50 genes mapped this way).
# The feature tables get a `probe` column (renamed from `ID`) and an
# `EntrezGene.ID` column.
gpl1708@featureData@data <- dplyr::rename(gpl1708@featureData@data, probe = ID)
gpl1708@featureData@data$EntrezGene.ID <- entrez_ids_for_spots(
  gpl1708@featureData@data$Platform_SPOTID
)

gpl4133@featureData@data <- dplyr::rename(gpl4133@featureData@data, probe = ID)
gpl4133@featureData@data$EntrezGene.ID <- entrez_ids_for_spots(
  gpl4133@featureData@data$Platform_SPOTID
)

# Calculate the ROR-S risk score per Esserman et al. sample, separately for
# each platform, then combine both platforms into one table.

# Phenotype columns carried along with the scores; `geo_accession` is the join
# key against the sample names of the score vector.
rors_pheno_columns <- c(
  "geo_accession",
  "i-spy id:ch2",
  "pathological complete response (pcr):ch2",
  "new_rcbclass"
)

# Build the per-sample score table for one platform: score and risk category
# from a rorS() result, joined to the selected phenotype columns of `eset`.
rors_score_table <- function(rors, eset) {
  scores <- data.frame(
    GSM = names(rors$score),
    RORS.low.risk.correlation = rors$score,
    RORS.risk.category = rors$risk
  )
  merge(
    scores,
    eset@phenoData@data[, rors_pheno_columns],
    by.x = "GSM",
    by.y = "geo_accession"
  )
}

# rorS() expects samples in rows, hence the transpose of the expression matrix.
rors1708 <- rorS(
  data = t(exprs(gpl1708)),
  annot = gpl1708@featureData@data,
  do.mapping = TRUE,
  verbose = TRUE
)
rors1708_scores <- rors_score_table(rors1708, gpl1708)

rors4133 <- rorS(
  data = t(exprs(gpl4133)),
  annot = gpl4133@featureData@data,
  do.mapping = TRUE,
  verbose = TRUE
)
rors4133_scores <- rors_score_table(rors4133, gpl4133)

# Combine both platforms and order by I-SPY patient ID.
rors_both_platforms <- dplyr::union(rors1708_scores, rors4133_scores)
rors_both_platforms <- dplyr::arrange(rors_both_platforms, `i-spy id:ch2`)

# Drop the GSM column and put the patient ID first. Columns are selected by
# name (rather than by position) so the result does not silently change if the
# merged column order ever does.
rors_both_platforms <- rors_both_platforms[, c(
  "i-spy id:ch2",
  "RORS.low.risk.correlation",
  "RORS.risk.category",
  "pathological complete response (pcr):ch2",
  "new_rcbclass"
)]

# Write the ROR-S results. In recreation mode (remove_samples = TRUE upstream)
# summary statistics are additionally appended to a text report: mapping rates,
# number of samples per risk category, and PCR / RCB ratios per risk category.
if (remove_samples_for_esserman_recreation) {
  stats_file <- "gene_signatures_pre_publication/recreation_study_esserman_et_al_files/rors_esserman_statistics.txt"

  # Append one entry to the statistics report.
  append_stat <- function(x) {
    write(x, stats_file, append = TRUE)
  }

  risk_category <- rors_both_platforms$RORS.risk.category
  risk_counts <- table(risk_category)
  pcr_positive <-
    rors_both_platforms$`pathological complete response (pcr):ch2` == "Yes"
  rcb_0_or_1 <-
    rors_both_platforms$new_rcbclass == 0 | rors_both_platforms$new_rcbclass == 1

  # "<responders in category>/<samples in category>".
  # Counting with sum(..., na.rm = TRUE) instead of nrow(df[mask, ]) matters:
  # an NA in a logical row index yields an all-NA row, so samples with a
  # missing response (or missing risk category) would otherwise be counted.
  responder_ratio <- function(responder, category) {
    in_category <- risk_category == category
    paste0(
      sum(responder & in_category, na.rm = TRUE),
      "/",
      sum(in_category, na.rm = TRUE)
    )
  }

  append_stat(paste0("GPL1708 mapping rate: ", rors1708$mapping[1], "/", rors1708$mapping[2]))
  append_stat(paste0("GPL4133 mapping rate: ", rors4133$mapping[1], "/", rors4133$mapping[2]))

  append_stat(paste0("Number of samples per risk category:"))
  append_stat(paste0(names(risk_counts)[1], " ", names(risk_counts)[2], " ", names(risk_counts)[3]))
  append_stat(risk_counts)

  for (category in c("Low", "Intermediate", "High")) {
    append_stat(paste0(
      category, " risk with PCR (positive): ",
      responder_ratio(pcr_positive, category)
    ))
  }
  for (category in c("Low", "Intermediate", "High")) {
    append_stat(paste0(
      category, " risk with RCB (0 or I class): ",
      responder_ratio(rcb_0_or_1, category)
    ))
  }

  write.table(
    rors_both_platforms,
    "gene_signatures_pre_publication/recreation_study_esserman_et_al_files/rors_signature_scores_and_classes_with_recreation_samples.tsv",
    sep = "\t",
    row.names = FALSE,
    col.names = TRUE,
    quote = FALSE
  )
} else {
  write.table(
    rors_both_platforms,
    "gene_signatures_pre_publication/rors_signature_scores_and_classes.tsv",
    sep = "\t",
    row.names = FALSE,
    col.names = TRUE,
    quote = FALSE
  )
}