library(AnnotationDbi)
library(hgug4112a.db)
library(stringr)
library(GEOquery)
library(data.table)
library(sva)
library(dplyr)
library(genefu)
library(lsa)
source("custom_funcs.R")

# GEOquery workarounds (the original note says these two calls avoid errors
# with GEOquery): pin readr to its first edition and enlarge the vroom
# connection buffer before any getGEO() call.
readr::local_edition(1)
Sys.setenv(VROOM_CONNECTION_SIZE = 500072)

# Esserman et al. gene expression data (GEO series GSE22226).
# The series comes back as a list with one entry per platform.
gse22226 <- getGEO(
  "GSE22226",
  GSEMatrix = TRUE,
  getGPL = TRUE,
  AnnotGPL = TRUE
)
gpl1708 <- gse22226[[1]] # GPL1708
gpl4133 <- gse22226[[2]] # GPL4133

# I-SPY 1 patient clinical information.
# colClasses keeps the first three columns and skips the last three ("NULL").
ispy_1_clinical <- read.delim(
  "I-SPY 1 All Patient Clinical and Outcome Data.txt",
  colClasses = c(NA, NA, NA, "NULL", "NULL", "NULL")
)
ispy_1_surv_clinical <- read.table(
  "ispy1_surv_clinical_data.csv",
  header = TRUE,
  sep = ","
)

# Sample 130 of GPL1708 has no clinical information at all, so it is dropped.
# NOTE(review): the position is hard-coded; it silently changes meaning if GEO
# ever reorders the series matrix.
gpl1708 <- gpl1708[, -130]

# Switch: TRUE restricts the samples to those needed to recreate the
# Esserman et al. analysis, FALSE keeps the wider sample set.
remove_samples_for_esserman_recreation <- FALSE

gpl1708 <- keepEssermanSamples(
  gpl1708,
  ispy_1_clinical,
  ispy_1_surv_clinical,
  remove_samples = remove_samples_for_esserman_recreation
)
gpl4133 <- keepEssermanSamples(
  gpl4133,
  ispy_1_clinical,
  ispy_1_surv_clinical,
  remove_samples = remove_samples_for_esserman_recreation
)

##### Generate p53 gene signature #####

# Sorlie et al. gene expression data acquisition (training data for the p53
# centroids). The series is returned as a list of ExpressionSets, one per
# microarray platform; four of them are used below.
sorlie_dataset_2001 <- getGEO(
  "GSE3193",
  GSEMatrix = TRUE,
  getGPL = TRUE,
  AnnotGPL = FALSE
)

# p53 mutation status (wildtype: 0, mutated: 1) of the Sorlie et al. samples
troester_et_al_sample_labels <- read.table(
  "sorlie_et_al_2001_sample_labels_used_in_Troester_training.txt",
  sep = "\t",
  header = TRUE
)

# Gene expression data are divided into 4 microarray platforms
sorlie_2001_1 <- sorlie_dataset_2001[[1]]
sorlie_2001_2 <- sorlie_dataset_2001[[2]]
sorlie_2001_3 <- sorlie_dataset_2001[[3]]
sorlie_2001_4 <- sorlie_dataset_2001[[4]]

# Keep only samples found in the Troester et al. training set used for the
# p53 centroids. After each platform the labels it claimed are removed from
# the pool, so a sample title is taken from the first platform that has it.
#
# The removal uses a logical mask rather than `-which(...)`: when a platform
# matches no label, `-which(...)` is `-integer(0)`, which selects zero rows
# and would wipe the remaining label pool for all later platforms.
sorlie_2001_1 <- sorlie_2001_1[
  ,
  sorlie_2001_1$title %in% troester_et_al_sample_labels$Troester_et_al_id
]
troester_et_al_sample_labels <- troester_et_al_sample_labels[
  !(troester_et_al_sample_labels$Troester_et_al_id %in% sorlie_2001_1$title),
]

sorlie_2001_2 <- sorlie_2001_2[
  ,
  sorlie_2001_2$title %in% troester_et_al_sample_labels$Troester_et_al_id
]
troester_et_al_sample_labels <- troester_et_al_sample_labels[
  !(troester_et_al_sample_labels$Troester_et_al_id %in% sorlie_2001_2$title),
]

sorlie_2001_3 <- sorlie_2001_3[
  ,
  sorlie_2001_3$title %in% troester_et_al_sample_labels$Troester_et_al_id
]
troester_et_al_sample_labels <- troester_et_al_sample_labels[
  !(troester_et_al_sample_labels$Troester_et_al_id %in% sorlie_2001_3$title),
]

sorlie_2001_4 <- sorlie_2001_4[
  ,
  sorlie_2001_4$title %in% troester_et_al_sample_labels$Troester_et_al_id
]
troester_et_al_sample_labels <- troester_et_al_sample_labels[
  !(troester_et_al_sample_labels$Troester_et_al_id %in% sorlie_2001_4$title),
]

# Description and GenBank IDs of the 52-gene p53 signature
# (as in the Troester et al. 2006 study)
p53_annotation <- read.table(
  "p53_signature_gene_annotation.tsv",
  sep = "\t",
  header = TRUE,
  stringsAsFactors = FALSE
)

# For each microarray dataset, keep only the probes of the 52 signature genes.
# When a gene has more than one probe, the most variable one is picked
# (done by keepHighestVariant(), defined outside this section).

# GPL180: values are log10, so they are converted to log2
sorlie_2001_1 <- sorlie_2001_1[
  fData(sorlie_2001_1)$GB_ACC %in% p53_annotation$Troester_et_al_genbank_id,
]
p53_expressions_2001_1 <- log2(10^exprs(sorlie_2001_1))
rownames(p53_expressions_2001_1) <- fData(sorlie_2001_1)$GB_ACC
# Append the GenBank accession as an extra column; cbind() leaves that
# column's name empty, so the empty name is then replaced by "id".
p53_expressions_2001_1 <- cbind(
  p53_expressions_2001_1,
  fData(sorlie_2001_1)$GB_ACC
)
colnames(p53_expressions_2001_1)[colnames(p53_expressions_2001_1) == ""] <- "id"
p53_expressions_2001_1 <- keepHighestVariant(p53_expressions_2001_1)

# GPL2776
# GB_LIST is searched with one alternation pattern built from all signature
# GenBank IDs, so a probe is kept when any signature ID occurs in its entry.
sorlie_2001_2 <- sorlie_2001_2[
  grep(
    paste(p53_annotation$Troester_et_al_genbank_id, collapse = "|"),
    fData(sorlie_2001_2)$GB_LIST
  ),
]
p53_expressions_2001_2 <- exprs(sorlie_2001_2)
# keep_only_p53() is defined outside this section; its result is used here as
# a table with a row index (Var1) and a GenBank ID (value).
# NOTE(review): presumably one row per (probe, GenBank ID) pair - confirm.
sorlie_2001_2_p53 <- keep_only_p53(fData(sorlie_2001_2)$GB_LIST)
sorlie_2001_2_p53 <- sorlie_2001_2_p53[
  sorlie_2001_2_p53$value %in% p53_annotation$Troester_et_al_genbank_id,
]
# Label the matched rows with their signature GenBank ID, keep only those
# rows, and append the ID as an "id" column.
rownames(p53_expressions_2001_2)[sorlie_2001_2_p53$Var1] <- sorlie_2001_2_p53$value
p53_expressions_2001_2 <- cbind(
  p53_expressions_2001_2[sorlie_2001_2_p53$Var1, ],
  sorlie_2001_2_p53$value
)
colnames(p53_expressions_2001_2)[colnames(p53_expressions_2001_2) == ""] <- "id"
p53_expressions_2001_2 <- keepHighestVariant(p53_expressions_2001_2)

# GPL2777
sorlie_2001_3 <- sorlie_2001_3[
  grep(
    paste(p53_annotation$Troester_et_al_genbank_id, collapse = "|"),
    fData(sorlie_2001_3)$GB_LIST
  ),
]
p53_expressions_2001_3 <- exprs(sorlie_2001_3)
# keep_only_p53() is defined outside this section; its result is used here as
# a table with a row index (Var1) and a GenBank ID (value).
sorlie_2001_3_p53 <- keep_only_p53(fData(sorlie_2001_3)$GB_LIST)
sorlie_2001_3_p53 <- sorlie_2001_3_p53[
  sorlie_2001_3_p53$value %in% p53_annotation$Troester_et_al_genbank_id,
]
rownames(p53_expressions_2001_3)[sorlie_2001_3_p53$Var1] <- sorlie_2001_3_p53$value
# Save the sample names first: the vector-style index below (no comma) drops
# the matrix dimnames, so the column names are restored by hand afterwards.
# NOTE(review): indexing without a comma only equals row selection because
# this platform is said to hold a single sample - confirm before reusing.
p53_expressions_2001_3_samples <- colnames(p53_expressions_2001_3)
p53_expressions_2001_3 <- cbind(
  p53_expressions_2001_3[sorlie_2001_3_p53$Var1],
  sorlie_2001_3_p53$value
)
colnames(p53_expressions_2001_3) <- c(p53_expressions_2001_3_samples, "")
colnames(p53_expressions_2001_3)[colnames(p53_expressions_2001_3) == ""] <- "id"
# This platform contains only one sample, thus no variance can be computed.
# The per-gene mean is taken instead (with one sample, mean == the value).
p53_expressions_2001_3 <- keepMeanValues(p53_expressions_2001_3)

# GPL2778
# Same procedure as for GPL2776: pattern-match the signature GenBank IDs
# against GB_LIST, then reduce to one probe per gene.
sorlie_2001_4 <- sorlie_2001_4[
  grep(
    paste(p53_annotation$Troester_et_al_genbank_id, collapse = "|"),
    fData(sorlie_2001_4)$GB_LIST
  ),
]
p53_expressions_2001_4 <- exprs(sorlie_2001_4)
# keep_only_p53() is defined outside this section; its result is used here as
# a table with a row index (Var1) and a GenBank ID (value).
sorlie_2001_4_p53 <- keep_only_p53(fData(sorlie_2001_4)$GB_LIST)
sorlie_2001_4_p53 <- sorlie_2001_4_p53[
  sorlie_2001_4_p53$value %in% p53_annotation$Troester_et_al_genbank_id,
]
rownames(p53_expressions_2001_4)[sorlie_2001_4_p53$Var1] <- sorlie_2001_4_p53$value
p53_expressions_2001_4 <- cbind(
  p53_expressions_2001_4[sorlie_2001_4_p53$Var1, ],
  sorlie_2001_4_p53$value
)
colnames(p53_expressions_2001_4)[colnames(p53_expressions_2001_4) == ""] <- "id"
p53_expressions_2001_4 <- keepHighestVariant(p53_expressions_2001_4)

# Combine the 4 platforms in one table, joined on the GenBank "id" column.
# all = TRUE makes it a full outer join, so a gene absent from a platform is
# kept with NA for that platform's samples.
all_datasets_52_genes <- Reduce(
  function(x, y) merge(x, y, by = "id", all = TRUE),
  list(
    p53_expressions_2001_1,
    p53_expressions_2001_2,
    p53_expressions_2001_3,
    p53_expressions_2001_4
  )
)

# Match GenBank IDs to Entrez IDs
sorlie_genbank <- setNames(as.data.frame(all_datasets_52_genes$id), "id")

# Manually curated GenBank:Entrez matching of the p53 signature genes from
# Troester et al. 2006; entries without an Entrez ID are left out.
p53_annotation_wo_na <- p53_annotation[
  !is.na(p53_annotation$Esserman_et_al_entrez_id),
]
sorlie_entrez_and_genbank <- merge(
  sorlie_genbank,
  p53_annotation_wo_na,
  by.x = "id",
  by.y = "Troester_et_al_genbank_id",
  sort = FALSE
)

# Probe:Entrez ID annotation of the Esserman et al. microarrays
gse22226_annotation <- AnnotationDbi::select(
  hgug4112a.db,
  keys = fData(gpl1708)$Platform_SPOTID,
  columns = c("PROBEID", "ENTREZID")
)

# Drop rows whose Entrez ID is missing or empty
sorlie_entrez_and_genbank <- sorlie_entrez_and_genbank[
  !is.na(sorlie_entrez_and_genbank$Esserman_et_al_entrez_id) &
    sorlie_entrez_and_genbank$Esserman_et_al_entrez_id != "",
]
# Keep only genes common between Sorlie et al. and Esserman et al. microarrays
sorlie_entrez_and_genbank <- sorlie_entrez_and_genbank[
  sorlie_entrez_and_genbank$Esserman_et_al_entrez_id %in%
    gse22226_annotation$ENTREZID,
]
sorlie_entrez_and_genbank <- sorlie_entrez_and_genbank[
  order(sorlie_entrez_and_genbank$id),
]

# Match Sorlie et al. GenBank IDs to Esserman et al. Entrez IDs: attach the
# Entrez ID, drop the GenBank "id" column, and let the Entrez ID take over
# the "id" name.
all_datasets_52_genes <- merge(
  all_datasets_52_genes,
  sorlie_entrez_and_genbank[, c("id", "Esserman_et_al_entrez_id")]
)
all_datasets_52_genes <- all_datasets_52_genes[, -"id"]
colnames(all_datasets_52_genes)[
  colnames(all_datasets_52_genes) == "Esserman_et_al_entrez_id"
] <- "id"
# Order by Entrez ID
all_datasets_52_genes <- all_datasets_52_genes[order(all_datasets_52_genes$id), ]

# Esserman et al. microarrays

# Keep only the p53 signature probes and, for genes with several probes,
# reduce to a single probe (keepHighestVariantENTREZ(), defined outside this
# section). Probes are annotated through hgug4112a.db; the original note
# reports that 51/52 p53 genes map this way and that this is the same as
# keeping the original GPL1708 & GPL4133 annotations.
probes <- gse22226_annotation[
  gse22226_annotation$ENTREZID %in%
    sorlie_entrez_and_genbank$Esserman_et_al_entrez_id,
]

# GPL1708: restrict to signature probes and overwrite "Gene ID" with the
# Entrez ID looked up in `probes`
gpl1708 <- gpl1708[fData(gpl1708)$Platform_SPOTID %in% probes$PROBEID, ]
indices_1708 <- match(fData(gpl1708)$Platform_SPOTID, probes$PROBEID)
gpl1708@featureData@data$`Gene ID` <- probes[indices_1708, "ENTREZID"]
gpl1708 <- keepHighestVariantENTREZ(gpl1708)

# GPL4133: same treatment
gpl4133 <- gpl4133[fData(gpl4133)$Platform_SPOTID %in% probes$PROBEID, ]
indices_4133 <- match(fData(gpl4133)$Platform_SPOTID, probes$PROBEID)
gpl4133@featureData@data$`Gene ID` <- probes[indices_4133, "ENTREZID"]
gpl4133 <- keepHighestVariantENTREZ(gpl4133)

# Order by Entrez ID (numerically)
gpl1708 <- gpl1708[order(as.numeric(fData(gpl1708)$`Gene ID`)), ]
gpl4133 <- gpl4133[order(as.numeric(fData(gpl4133)$`Gene ID`)), ]



# Back to p53 signature
# Create the p53 gene signature centroids

# Sample title / GEO accession pairs of the retained Sorlie et al. samples,
# stacked across the four platforms
sorlie_p53_samples <- do.call(
  rbind,
  lapply(
    list(sorlie_2001_1, sorlie_2001_2, sorlie_2001_3, sorlie_2001_4),
    function(platform) pData(platform)[, c("title", "geo_accession")]
  )
)

# Re-read the p53 mutation status file (the copy loaded earlier was consumed
# while selecting the training samples per platform)
troester_et_al_sample_labels <- read.table(
  "sorlie_et_al_2001_sample_labels_used_in_Troester_training.txt",
  sep = "\t",
  header = TRUE
)

# Rename the Sorlie et al. headers to match the Troester et al. status file
colnames(sorlie_p53_samples) <- c("Troester_et_al_id", "geo_accession")

# Match Sorlie et al. sample IDs to their Troester et al. p53 mutation label
p53_samples <- merge(troester_et_al_sample_labels, sorlie_p53_samples)

# GEO accessions per mutation group (wildtype: 0, mutated: 1)
p53_wildtype <- p53_samples[p53_samples$Mutation_status == 0, "geo_accession"]
p53_mutated <- p53_samples[p53_samples$Mutation_status == 1, "geo_accession"]

# Separate p53 wildtype vs p53 mutated samples
select_columns <- which(colnames(all_datasets_52_genes) %in% p53_wildtype)
p53_wildtype_expressions <- all_datasets_52_genes[, ..select_columns]

select_columns <- which(colnames(all_datasets_52_genes) %in% p53_mutated)
p53_mutated_expressions <- all_datasets_52_genes[, ..select_columns]

# Centroids: average expression of every signature gene within each mutation
# group, ignoring missing values
average_wildtype <- rowMeans(
  sapply(p53_wildtype_expressions, as.numeric),
  na.rm = TRUE
)
average_mutated <- rowMeans(
  sapply(p53_mutated_expressions, as.numeric),
  na.rm = TRUE
)

# Pick Entrez IDs and order again: column 1 is the centroid value, column 2
# the Entrez ID used as sort key
sorlie_entrez_and_genbank <- sorlie_entrez_and_genbank[
  order(sorlie_entrez_and_genbank$Esserman_et_al_entrez_id),
]
average_wildtype <- cbind(
  average_wildtype,
  sorlie_entrez_and_genbank$Esserman_et_al_entrez_id
)
average_mutated <- cbind(
  average_mutated,
  sorlie_entrez_and_genbank$Esserman_et_al_entrez_id
)
average_wildtype <- average_wildtype[order(average_wildtype[, 2]), ]
average_mutated <- average_mutated[order(average_mutated[, 2]), ]

# Calculate correlation per Esserman et al. sample to the p53 centroids.
# p53_classify() is defined outside this section; its per-sample result is
# laid out below as four values: WT_corr, Mut_corr, p53_delta, category.

# GPL1708 samples
p53_1708 <- sapply(
  seq_len(ncol(gpl1708)),
  function(x) {
    p53_classify(
      as.numeric(exprs(gpl1708[, x])),
      as.numeric(average_wildtype[, 1]),
      as.numeric(average_mutated[, 1])
    )
  }
)
p53_1708 <- t(p53_1708) # one row per sample
p53_1708 <- cbind(gpl1708$`i-spy id:ch2`, p53_1708)
colnames(p53_1708) <- c("ISPY.Case", "WT_corr", "Mut_corr", "p53_delta", "category")

# GPL4133 samples. The expression values must come from gpl4133 itself;
# reading them from gpl1708 (as an earlier revision did) pairs GPL4133
# patient IDs with scores computed from unrelated GPL1708 samples.
p53_4133 <- sapply(
  seq_len(ncol(gpl4133)),
  function(x) {
    p53_classify(
      as.numeric(exprs(gpl4133[, x])),
      as.numeric(average_wildtype[, 1]),
      as.numeric(average_mutated[, 1])
    )
  }
)
p53_4133 <- t(p53_4133) # one row per sample
p53_4133 <- cbind(gpl4133$`i-spy id:ch2`, p53_4133)
colnames(p53_4133) <- c("ISPY.Case", "WT_corr", "Mut_corr", "p53_delta", "category")

# Attach the pCR and RCB class of every patient to the scores
p53_1708 <- merge(
  p53_1708,
  gpl1708@phenoData@data[, c("i-spy id:ch2", "pathological complete response (pcr):ch2", "new_rcbclass")],
  by.x = "ISPY.Case",
  by.y = "i-spy id:ch2"
)
p53_4133 <- merge(
  p53_4133,
  gpl4133@phenoData@data[, c("i-spy id:ch2", "pathological complete response (pcr):ch2", "new_rcbclass")],
  by.x = "ISPY.Case",
  by.y = "i-spy id:ch2"
)

# Stack both platforms (union() also drops exact duplicate rows) and sort by
# patient ID
p53_both_platforms <- dplyr::union(p53_1708, p53_4133)
p53_both_platforms <- dplyr::arrange(p53_both_platforms, `ISPY.Case`)

if (remove_samples_for_esserman_recreation) {
  # Recreation run: append mapping rates, risk-category counts and pCR / RCB
  # ratios to the statistics file, then write the per-sample scores.
  stats_file <- "gene_signatures_pre_publication/recreation_study_esserman_et_al_files/p53_esserman_statistics.txt"
  append_stat <- function(x) write(x, stats_file, append = TRUE)

  # Logical masks built with %in%, which never yields NA. With `==`, a sample
  # whose pCR or RCB value is missing produced an NA row in the subset and was
  # counted as if it matched, inflating the reported numerators.
  is_wt <- p53_both_platforms$category %in% "wt"
  is_mut <- p53_both_platforms$category %in% "mut"
  has_pcr <- p53_both_platforms$`pathological complete response (pcr):ch2` %in% "Yes"
  low_rcb <- p53_both_platforms$new_rcbclass %in% c(0, 1)
  category_counts <- table(p53_both_platforms$category)

  append_stat(paste0("GPL1708 mapping rate: ", dim(gpl1708)[1], "/52"))
  append_stat(paste0("GPL4133 mapping rate: ", dim(gpl4133)[1], "/52"))
  append_stat(paste0("Number of samples per risk category:"))
  append_stat(paste0(names(category_counts)[1], " ", names(category_counts)[2]))
  append_stat(category_counts)
  append_stat(paste0("p53 wildtype with PCR (positive): ", sum(has_pcr & is_wt), "/", sum(is_wt)))
  append_stat(paste0("p53 mutated risk with PCR (positive): ", sum(has_pcr & is_mut), "/", sum(is_mut)))
  append_stat(paste0("p53 wildtype risk with RCB (0 or I class): ", sum(low_rcb & is_wt), "/", sum(is_wt)))
  append_stat(paste0("p53 mutated risk with RCB (0 or I class): ", sum(low_rcb & is_mut), "/", sum(is_mut)))

  # NOTE(review): the doubled ".tsv.tsv" extension looks unintentional; it is
  # kept as is because other scripts may read this exact file name.
  write.table(
    p53_both_platforms,
    "gene_signatures_pre_publication/recreation_study_esserman_et_al_files/p53_signature_scores_and_classes_with_recreation_samples.tsv.tsv",
    sep = "\t",
    row.names = FALSE,
    col.names = TRUE
  )
} else {
  # Regular run: only the per-sample scores and classes are written.
  write.table(
    p53_both_platforms,
    "gene_signatures_pre_publication/p53_signature_scores_and_classes.tsv",
    sep = "\t",
    row.names = FALSE,
    col.names = TRUE
  )
}