## Load the libraries

library(robustHD)
library(gplots)
library(ggplot2)
library("readxl")
library(heatmap3)
library(sigclust)
library(ConsensusClusterPlus)
library(colorRamps)
library(plotrix)
library(gmodels)
library(vioplot)
library(scales)
library(plyr)
library(lattice)
library(cluster)
library(fpc)
library(survC1)     # c-statistic
library("tidyverse")  # (Includes tibble, magrittr, tidyr, ggplot2, others)
library("survival")
library("survminer")  # survival curves in ggplot
library("survC1")
library(survival)
library(survMisc)
library("lmtest")



## Load the data files
allData= read_excel("Data.xlsx", sheet="Discovery_cohort", col_names= T)

allData= data.frame(allData)

featureData<- (data.frame(allData[ ,(c(2:7))]))

clinData=data.frame(allData[, c(8:44)])
########

#featureData_reorder<-featureData[, c(ppg_heatmap$rowInd)]

#-------------------------------------------------cluster cases-----------------------------------------------------------
caseCluster= as.dist(dist(featureData, method= "euclidean"));
#cluster distance based on Ward's method
colCluster= hclust((caseCluster), method= "ward.D2");

#------------------------------------------------#cluster features---------------------------------------------------------

featureCluster= t(featureData); #features are now rows

# calcaute distance between features (rows)
featureCluster= as.dist(dist(featureCluster, method= "manhattan"));

#cluster features based on Ward's method
rowCluster= hclust(featureCluster, method= "ward.D")




#####################################################################################

ispy_cluster <-cutree(colCluster,2) # This number was determined using Sig Clust (only2 phenotypes were significant)

#setting cluster colors

ColumnCluster_col<-colByValue(as.matrix(ispy_cluster),
                              col=c("blue", "red"))
####################################################



ER_col <- colByValue(as.matrix(clinData$ERpos),
                     col<- c("lightgreen",  "darkgreen"))

HR_col<-colByValue(as.matrix(clinData$HR.Pos),
                   col= c("lightpink","red"))

HER2_col<- colByValue(as.matrix(clinData$Her2MostPos),
                      col= c("lightblue", "mediumblue"))


Trip_neg_col<-colByValue(as.matrix(clinData$Triple.Negative),
                         col=c("violet", "purple4"))


Rec_col<-colByValue(as.matrix(clinData$rfsind),
                    col=c("lightgreen",  "darkgreen"))

PCR_col <- colByValue(as.matrix(clinData$PCR),
                      col=c("yellow", "orange"))

#allData$Size..largest.diameter..mm <- scale(allData$Size..largest.diameter..mm)
FTV_col <- colByValue(as.matrix(clinData$FTV_Volume_T2),
                          col<-colorRampPalette(c("violet",  "purple3", "purple4"))(50))


col_combos.train<-cbind(FTV_col,PCR_col, Rec_col,Trip_neg_col, HER2_col, HR_col, ER_col, ColumnCluster_col )
colnames(col_combos.train)<-c("Functional Tumor Volume", "PCR", "Recurrence", "Triple Negative", "HER2 Status", "Hormone Receptor Status", "Estrogen Receptor", "Phenotypes")

###################################################################

# 
featureData[featureData >3] <- 3
featureData[featureData < -3] <- -3



break_vals=c(-4,-3,-2.5, -2, -1.5, -1,-0.9, -0.85, -0.8,-0.75, -0.7,-0.65, -0.6, -0.55, -0.5, -0.45, -0.4, -0.3, -0.2,-0.1, -0.05, 0, 0.05, 0.1, 0.13, 0.15, 0.2, 0.22, 0.25,  0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.90, 0.95, 1, 1.25, 1.5, 1.75, 2, 2.25, 2.5, 2.75, 3, 3.25, 3.5, 4, 4.25, 4.5,5)
colors.train<-colorpanel(length(break_vals)-1, 'blue', 'white', 'red4')


ispy_heatmap<-heatmap3(t(as.matrix(featureData)),margin=c(6,6),
                       balanceColor=TRUE,
                       Rowv=as.dendrogram(rowCluster),
                       Colv=as.dendrogram(colCluster),
                      # col=colors.train,
                       scale="none",
                       labCol="",
                       ColSideColors=col_combos.train,
                       RowSideLabs="",
                       revC=T,
                       #breaks= break_vals,
                       cexRow=0.8,
                       cexCol=0.3)

## SAM analysis


SAM_data <- t(featureData)
SAM_output <- SAM(SAM_data, ispy_cluster, resp.type = "Two class unpaired" )



## Two phenotype test

sig1<-sigclust((featureData),1000, nrep=1, labflag=1, label= ispy_cluster,icovest=3)
plot(sig1,arg="all")


######################### Phenotype significance #####################################
SurvObject <- Surv(time = (clinData$rfs)/365, event = clinData$rfsind)
SurvfitGraph1 <- survfit(SurvObject ~ ispy_cluster)
colVector=  c("blue", "red")
plot(SurvfitGraph1, mark.time=FALSE, col=colVector,  xlab= "Time (years)", ylab= "Survival Probability")
title("Kaplan-Meier Curves for patient groups defined by \n heterogeneity phenotype")
summary(SurvfitGraph1)
LogRankTest= survdiff(SurvObject~ispy_cluster,  rho=0)



### Creating Kaplan Meier curves with table

p <-ggsurvplot(fit= survfit(SurvObject ~ ispy_cluster), data= allData, conf.int=FALSE, 
               censor=FALSE, alpha=1, size=1.23, title="Kaplan-Meier curves for womens defined by \n heterogeneity phenotype",
               risk.table= "absolute", 
               xlab="Time (years)", 
               xlim=c(0,7),
               tables.height=0.25, 
               risk.table.title="Number at risk", 
               risk.table.col="grey25", 
               risk.table.fontsize=rel(4), 
               risk.table.y.text=FALSE, 
               risk.table.type="absolute",
               palette= c("blue", "red"))

q <- p
theme_bwEAC01 <-  theme_bw() + 
  theme(panel.border=element_rect(color="lightgray", size=0.5),
        axis.ticks=element_line(color="lightgray"), 
        axis.text.x = element_text(vjust=1),
        plot.title = element_text(vjust=1))
q$plot <- q$plot + 
  theme_bwEAC01 + 
  theme(panel.border = element_blank(),
        axis.line = element_line(color="lightgrey", size=0.5), 
        legend.position="top", 
        legend.justification = 0, 
        plot.title=element_text(size = rel(1.5)), 
        plot.subtitle=element_text(size = rel(1.25)), 
        axis.title=element_text(size = rel(1.0))
  )
q$table <- p$table + 
  theme(panel.border = element_rect(color = "lightgrey", fill=NA),
        plot.title=element_text(size = rel(1.0), color="grey25"), 
        axis.line=element_line(color="lightgrey", size=0.5)
  )
print(q)








## Cox Regression Models




featureData_temp<- featureData[- c(which(is.na(clinData$Her2MostPos))),]

ispy_cluster_temp<- ispy_cluster[- c(which(is.na(clinData$Her2MostPos)))]
clinDataSurv<-clinData[-c(which(is.na(clinData$Her2MostPos))),]
SurvObject <- Surv(time = (clinDataSurv$rfs), event = (clinDataSurv$rfsind ==1))
## For the modelling, change it so that 1- low heterogeneity, 2= high heterogeneity 
temp<- clinDataSurv$Cluster_assignment
temp[which(clinDataSurv$Cluster_assignment==1)]=2
temp[which(clinDataSurv$Cluster_assignment==2)]=1
clinDataSurv$Cluster_assignment<-temp


model_1 <-coxph(data=clinDataSurv,formula=SurvObject ~   clinDataSurv$ROR_score)
model_2 <-coxph(data=clinDataSurv,formula=SurvObject ~   clinDataSurv$mammaprint_refined_correlation)
model_3 <-coxph(data=clinDataSurv,formula=SurvObject ~   clinDataSurv$p53_binary)
baseline_full <- coxph(data= clinDataSurv, formula= SurvObject ~ clinDataSurv$HR.Pos + clinDataSurv$Her2MostPos + clinDataSurv$age)
baseline_ftv <- coxph(data= clinDataSurv, formula= SurvObject ~ clinDataSurv$HR.Pos + clinDataSurv$Her2MostPos + clinDataSurv$age + (clinDataSurv$FTV_Volume_T2))

result<-lrtest(baseline_full, baseline_ftv)
result



model_4 <-coxph(data=clinDataSurv,formula=SurvObject ~   clinDataSurv$HR.Pos + clinDataSurv$Her2MostPos + clinDataSurv$age+ clinDataSurv$FTV_Volume_T2+ clinDataSurv$ROR_score+ clinDataSurv$mammaprint_refined_correlation+ clinDataSurv$p53_continuous)
result<-lrtest(baseline_ftv, model_4)
result

model_5 <-coxph(data=clinDataSurv,formula=SurvObject ~   clinDataSurv$HR.Pos + clinDataSurv$Her2MostPos + clinDataSurv$age+ clinDataSurv$FTV_Volume_T2+ clinDataSurv$Cluster_assignment)
result<-lrtest(baseline_ftv, model_5)
result

model_6 <-coxph(data=clinDataSurv,formula=SurvObject ~   clinDataSurv$HR.Pos + clinDataSurv$Her2MostPos + clinDataSurv$age+ clinDataSurv$FTV_Volume_T2+  clinDataSurv$ROR_score+ clinDataSurv$mammaprint_refined_correlation+ clinDataSurv$p53_continuous+  clinDataSurv$Cluster_assignment)
result<-lrtest(model_4, model_6)
result

SurvObject_test <- Surv(time = (clinData_test$rfs), event = (clinData_test$rfsind ==1))
ftv_test <- coxph(data= clinData_test, formula= SurvObject_test ~ HR.Pos + Her2MostPos + age + FTV_Volume_T2 )

pred_test<- predict( baseline_ftv, newdata= (clinDataSurv))
dd_full_model <- cbind(clinDataSurv$rfs, clinDataSurv$rfsind, pred_test)
cs_full_model_phenotype <- Est.Cval(mydata=dd_full_model, tau=3*365.25, nofit=TRUE)
cs_full_model_phenotype$Dhat

result<-lrtest(model_4, model_6)
result

## Calculate the CI of c-statistics
# Generate data frames 

baseline_CV_model=cbind(clinDataSurv$rfs, clinDataSurv$rfsind, clinDataSurv$age, clinDataSurv$HR.Pos, clinDataSurv$Her2MostPos)
baseline= cvC(baseline_CV_model, tau= 365.25*3, 3, 100)



########### Regression models if the Functional tumor volume was log transformed (base 2)

baseline_ftv <- coxph(data= clinDataSurv, 
                      formula= SurvObject ~ clinDataSurv$HR.Pos + 
                        clinDataSurv$Her2MostPos + 
                        clinDataSurv$age + 
                        log(clinDataSurv$FTV_Volume_T2,2))

model_4 <-coxph(data=clinDataSurv,
                formula=SurvObject ~   clinDataSurv$HR.Pos + 
                  clinDataSurv$Her2MostPos +
                  clinDataSurv$age+ 
                  log(clinDataSurv$FTV_Volume_T2,2)+ 
                  clinDataSurv$ROR_score+ 
                  clinDataSurv$mammaprint_refined_correlation+ 
                  clinDataSurv$p53_continuous)

model_5 <-coxph(data=clinDataSurv,
                formula=SurvObject ~   clinDataSurv$HR.Pos + 
                  clinDataSurv$Her2MostPos +
                  clinDataSurv$age+ 
                  log(clinDataSurv$FTV_Volume_T2,2)+ 
                  clinDataSurv$Cluster_assignment)
model_6 <-coxph(data=clinDataSurv,
                formula=SurvObject ~   clinDataSurv$HR.Pos + 
                  clinDataSurv$Her2MostPos +
                  clinDataSurv$age+
                  log(clinDataSurv$FTV_Volume_T2,2)+ 
                  clinDataSurv$mammaprint_refined_correlation+ 
                  clinDataSurv$ROR_score+
                  clinDataSurv$p53_continuous+  
                  clinDataSurv$Cluster_assignment)


SurvObject_test <- Surv(time = (clinData_test$rfs), event = (clinData_test$rfsind ==1))
ftv_test <- coxph(data= clinData_test, formula= SurvObject_test ~ HR.Pos + Her2MostPos + age + FTV_Volume_T2 )

pred_test<- predict( baseline_ftv, newdata= (clinDataSurv))
dd_full_model <- cbind(clinDataSurv$rfs, clinDataSurv$rfsind, pred_test)
cs_full_model_phenotype <- Est.Cval(mydata=dd_full_model, tau=3*365.25, nofit=TRUE)
cs_full_model_phenotype$Dhat


#####################################
# Calculate 3-fold cross validated c-scores with 100 replicates

cvModelData<-clinDataSurv


c_score_mod1 = vector()
c_score_mod2= vector()
c_score_mod3= vector()
c_score_mod4= vector()
c_score_mod5= vector()
c_score_mod6= vector()
c_score_mod7= vector()
c_score_mod8= vector()
c_score_mod9= vector()
for (i in 1:100){
  ## 3-fold CV  prediction score
  c_score_cv_mod1= vector()
  c_score_cv_mod2= vector()
  c_score_cv_mod3=vector()
  c_score_cv_mod4=vector()
  c_score_cv_mod5=vector()
  c_score_cv_mod6=vector()
  c_score_cv_mod7=vector()
  c_score_cv_mod8=vector()
  c_score_cv_mod9=vector()  
  flds<-sample(rep(1:3, length.out=98))
  for (k in c(1:3)){
    test_i<- which(flds ==k)
    train_data<-(cvModelData[-test_i,])
    test_data<-cvModelData[test_i,]
    train_data<- as.data.frame(train_data)
    
    # Train survival object
    SurvObject_CV <- Surv(time=(cvModelData$rfs[-test_i]), event=(cvModelData$rfsind[-test_i]))
    
    model_1<-coxph(data=train_data,formula=SurvObject_CV ~  ROR_score) 
    model_2<-coxph(data=train_data,formula=SurvObject_CV ~  p53_continuous) 
    model_3<-coxph(data=train_data,formula=SurvObject_CV ~  mammaprint_refined_correlation ) 
    model_4<-coxph(data=train_data,formula=SurvObject_CV ~  ROR_score+p53_continuous+mammaprint_refined_correlation + HR.Pos + Her2MostPos + age + FTV_Volume_T2 ) #Model 3 in paper
    model_5<-coxph(data=clinDataSurv,formula=SurvObject  ~  HR.Pos + Her2MostPos + age ) #Baseline - Modle 1 in paper
    model_6<-coxph(data=clinDataSurv,formula=SurvObject  ~  HR.Pos + Her2MostPos + age + FTV_Volume_T2) # Baseline + FTV2 - Model 2 in paper
    model_8<-coxph(data=clinDataSurv,formula=SurvObject  ~  HR.Pos + Her2MostPos + age + FTV_Volume_T2+ Cluster_assignment) #Model 4 in paper
    model_9<-coxph(data=clinDataSurv,formula=SurvObject  ~  ROR_score+p53_continuous+mammaprint_refined_correlation + HR.Pos + Her2MostPos + age + FTV_Volume_T2+ Cluster_assignment) #Full
    
    
    
    pred_test_1<- predict( model_1, newdata= (test_data))
    pred_test_2<- predict( model_2, newdata= (test_data))
    pred_test_3<- predict( model_3, newdata= (test_data))
    pred_test_4<- predict( model_4, newdata= (test_data))
    pred_test_5<- predict( model_5, newdata= (test_data))
    pred_test_6<- predict( model_6, newdata= (test_data))
    pred_test_8<- predict( model_8, newdata= (test_data))
    pred_test_9<- predict( model_9, newdata= (test_data))    
    #Model 1
    dd_CV_1 <- cbind(cvModelData$rfs[test_i], cvModelData$rfsind[test_i], pred_test_1)
    cs_CV_1 <- Est.Cval(mydata=dd_CV_1, tau=3*365.25, nofit=TRUE)
    c_score_cv_mod1<-c(c_score_cv_mod1,cs_CV_1$Dhat)
    #Model 2
    dd_CV_2 <- cbind(cvModelData$rfs[test_i], cvModelData$rfsind[test_i], pred_test_2)
    cs_CV_2<- Est.Cval(mydata=dd_CV_2, tau=3*365.25, nofit=TRUE)
    c_score_cv_mod2<-c(c_score_cv_mod2,cs_CV_2$Dhat)    
    #Model 3
    dd_CV_3 <- cbind(cvModelData$rfs[test_i], cvModelData$rfsind[test_i], pred_test_3)
    cs_CV_3<- Est.Cval(mydata=dd_CV_3, tau=3*365.25, nofit=TRUE)
    c_score_cv_mod3<-c(c_score_cv_mod3,cs_CV_3$Dhat)       
    #Model 4
    dd_CV_4 <- cbind(cvModelData$rfs[test_i], cvModelData$rfsind[test_i], pred_test_4)
    cs_CV_4<- Est.Cval(mydata=dd_CV_4, tau=3*365.25, nofit=TRUE)
    c_score_cv_mod4<-c(c_score_cv_mod4,cs_CV_4$Dhat)   
    #Model 5
    dd_CV_5 <- cbind(cvModelData$rfs[test_i], cvModelData$rfsind[test_i], pred_test_5)
    cs_CV_5<- Est.Cval(mydata=dd_CV_5, tau=3*365.25, nofit=TRUE)
    c_score_cv_mod5<-c(c_score_cv_mod5,cs_CV_5$Dhat)   
    #Model 6
    dd_CV_6 <- cbind(cvModelData$rfs[test_i], cvModelData$rfsind[test_i], pred_test_6)
    cs_CV_6<- Est.Cval(mydata=dd_CV_6, tau=3*365.25, nofit=TRUE)
    c_score_cv_mod6<-c(c_score_cv_mod6,cs_CV_6$Dhat)      
     #Model 8
    dd_CV_8 <- cbind(cvModelData$rfs[test_i], cvModelData$rfsind[test_i], pred_test_8)
    cs_CV_8<- Est.Cval(mydata=dd_CV_8, tau=3*398.28, nofit=TRUE)
    c_score_cv_mod8<-c(c_score_cv_mod8,cs_CV_8$Dhat)   
    #Model 9
    dd_CV_9 <- cbind(cvModelData$rfs[test_i], cvModelData$rfsind[test_i], pred_test_9)
    cs_CV_9<- Est.Cval(mydata=dd_CV_9, tau=3*398.28, nofit=TRUE)
    c_score_cv_mod9<-c(c_score_cv_mod9,cs_CV_9$Dhat)                 
  }
  
  c_score_mod1= c(c_score_mod1, mean(c_score_cv_mod1))
  c_score_mod2= c(c_score_mod2, mean(c_score_cv_mod2))
  c_score_mod3= c(c_score_mod3, mean(c_score_cv_mod3))
  c_score_mod4= c(c_score_mod4, mean(c_score_cv_mod4))
  c_score_mod5= c(c_score_mod5, mean(c_score_cv_mod5))
  c_score_mod6= c(c_score_mod6, mean(c_score_cv_mod6))
  c_score_mod7= c(c_score_mod7, mean(c_score_cv_mod7))
  c_score_mod8= c(c_score_mod8, mean(c_score_cv_mod8))
  c_score_mod9= c(c_score_mod9, mean(c_score_cv_mod9))
  
}

mean(c_score_mod1)
mean(c_score_mod2)
mean(c_score_mod3)
mean(c_score_mod4)
mean(c_score_mod5)
mean(c_score_mod6)
mean(c_score_mod7)
mean(c_score_mod8)
mean(c_score_mod9)



sd(c_score_mod1)
sd(c_score_mod2)
sd(c_score_mod3)
sd(c_score_mod4)
sd(c_score_mod5)
sd(c_score_mod6)
sd(c_score_mod7)
sd(c_score_mod8)
sd(c_score_mod9)


SE <- function(x) sd(x)/sqrt(length(x))


# Calculate 95% CI

# 95% confidence intervals
alpha = 0.95
p = ((1.0-alpha)/2.0) 
lower = max(0.0, quantile(c_score_mod1, p))

p = (alpha+((1.0-alpha)/2.0))
upper = min(1.0, quantile(c_score_mod1, p))
lower
upper


#How does this compare to mean +/- 1.96*SE
upper_1=mean(c_score_mod1) + (1.96* SE(c_score_mod1))
lower_1=mean(c_score_mod1) - (1.96* SE(c_score_mod1))

upper_2=mean(c_score_mod2) + (1.96* SE(c_score_mod2))
lower_2=mean(c_score_mod2) - (1.96* SE(c_score_mod2))

upper_3=mean(c_score_mod3) + (1.96* SE(c_score_mod3))
lower_3=mean(c_score_mod3) - (1.96* SE(c_score_mod3))

upper_4=mean(c_score_mod4) + (1.96* SE(c_score_mod4))
lower_4=mean(c_score_mod4) - (1.96* SE(c_score_mod4))

upper_5=mean(c_score_mod5) + (1.96* SE(c_score_mod5))
lower_5=mean(c_score_mod5) - (1.96* SE(c_score_mod5))

upper_6=mean(c_score_mod6) + (1.96* SE(c_score_mod6))
lower_6=mean(c_score_mod6) - (1.96* SE(c_score_mod6))

upper_8=mean(c_score_mod8) + (1.96* SE(c_score_mod8))
lower_8=mean(c_score_mod8) - (1.96* SE(c_score_mod8))

upper_9=mean(c_score_mod9) + (1.96* SE(c_score_mod9))
lower_9=mean(c_score_mod9) - (1.96* SE(c_score_mod9))

############################################################################################
################# Kaplan Meier Risk prediction graphs ######################################
###########################################################################################
# Non- cross validated risk prediction curves

featureData_temp<- featureData[- c(which(is.na(clinData$Her2MostPos))),]

ispy_cluster_temp<- ispy_cluster[- c(which(is.na(clinData$Her2MostPos)))]
clinDataSurv<-clinData[-c(which(is.na(clinData$Her2MostPos))),]
SurvObject <- Surv(time = (clinDataSurv$rfs), event = (clinDataSurv$rfsind ==1))

# Baseline Model
model_1<-coxph(data=clinDataSurv,formula=SurvObject ~  HR.Pos + Her2MostPos + age )
pred_test<- predict( model_1, newdata= (clinDataSurv))
dd_CV <- cbind(clinDataSurv$rfs, clinDataSurv$rfsind ==1, pred_test)
cs_CV <- Est.Cval(mydata=dd_CV, tau=3*365.25, nofit=TRUE)
cs_CV$Dhat

predictionScore_FTV<- pred_test
predictionScore_FTV[which((pred_test > median(pred_test)))]=2 # High Risk
predictionScore_FTV[which((pred_test <= median(pred_test)))]=1 #Low Risk
SurvObject <- Surv(time = (clinDataSurv$rfs)/365, event = clinDataSurv$rfsind)
FTV_surv <- survfit(SurvObject ~ predictionScore_FTV)
colVector= c("gray", "black")
plot(FTV_surv, mark.time=TRUE, col=colVector,  xlab= "Time (years)", ylab= "Survival Probability", lwd=4, cex.axis = 2, cex.lab=1 )
LogRankTest= survdiff(SurvObject~predictionScore_FTV,  rho=0)
title('Survival curves for patients stratified by median \n baseline risk score')



# Baseline + FTV Model
model_1<-coxph(data=clinDataSurv,formula=SurvObject ~  HR.Pos + Her2MostPos + age + log(clinDataSurv$FTV_Volume_T2,2) )
pred_test<- predict( model_1, newdata= (clinDataSurv))
dd_CV <- cbind(clinDataSurv$rfs, clinDataSurv$rfsind ==1, pred_test)
cs_CV <- Est.Cval(mydata=dd_CV, tau=3*365.25, nofit=TRUE)
cs_CV$Dhat
predictionScore_FTV<- pred_test
predictionScore_FTV[which((pred_test > median(pred_test)))]=2 # High Risk
predictionScore_FTV[which((pred_test <= median(pred_test)))]=1 #Low Risk
SurvObject <- Surv(time = (clinDataSurv$rfs)/365, event = clinDataSurv$rfsind)
FTV_surv <- survfit(SurvObject ~ predictionScore_FTV)
colVector= c("gray", "black")
plot(FTV_surv, mark.time=TRUE, col=colVector,  xlab= "Time (years)", ylab= "Survival Probability", lwd=4, cex.axis = 2, cex.lab=1 )
LogRankTest= survdiff(SurvObject~predictionScore_FTV,  rho=0)
title('Survival curves for patients stratified by median \n baseline + FTV T2 risk score')


# Baseline + FTV Model + Cluster assignment
model_1<-coxph(data=clinDataSurv,formula=SurvObject ~  HR.Pos + Her2MostPos + age + log(clinDataSurv$FTV_Volume_T2,2) + Cluster_assignment )
pred_test<- predict( model_1, newdata= (clinDataSurv))
dd_CV <- cbind(clinDataSurv$rfs, clinDataSurv$rfsind ==1, pred_test)
cs_CV <- Est.Cval(mydata=dd_CV, tau=3*365.25, nofit=TRUE)
cs_CV$Dhat
predictionScore_FTV<- pred_test
predictionScore_FTV[which((pred_test > median(pred_test)))]=2 # High Risk
predictionScore_FTV[which((pred_test <= median(pred_test)))]=1 #Low Risk
SurvObject <- Surv(time = (clinDataSurv$rfs)/365, event = clinDataSurv$rfsind)
FTV_surv <- survfit(SurvObject ~ predictionScore_FTV)
colVector= c("gray", "black")
plot(FTV_surv, mark.time=TRUE, col=colVector,  xlab= "Time (years)", ylab= "Survival Probability", lwd=4, cex.axis = 2, cex.lab=1 )
LogRankTest= survdiff(SurvObject~predictionScore_FTV,  rho=0)
title('Survival curves for patients stratified by median \n baseline, FTV T2, Delta phenotype assignment risk score')







## 3-fold CV  prediction score
f_vector = vector()
c_score_cv= vector()
clinDataSurv<-clinData[-c(which(is.na(clinData$Her2MostPos))),]
flds<-sample(rep(1:3, length.out=98))

for (k in c(1:3)){
  test_i<- which(flds ==k)
  train_data<-(cvModelData[-test_i,])
  test_data<-cvModelData[test_i,]
  train_data<- as.data.frame(train_data)
  
  # Train survival object
  
  SurvObject_CV <- Surv(time=(cvModelData$rfs[-test_i]), event=(cvModelData$rfsind[-test_i]))
  
  ################################################
  ## Cox proportional-hazards regression
  ################################################
  
  model_cv<-coxph(data=train_data,formula=SurvObject_CV ~  HR.Pos + Her2MostPos + age ) # + ispy_cluster_temp)
  
  pred_test<- predict( model_cv, newdata= (test_data))
  
  dd_CV <- cbind(cvModelData$rfs[test_i], cvModelData$rfsind[test_i], pred_test)
  cs_CV <- Est.Cval(mydata=dd_CV, tau=3*365.25, nofit=TRUE)
  c_score_cv<-c(c_score_cv,cs_CV$Dhat)
  vector <- pred_test
  f_vector <- c(f_vector, vector)
}

f_vector_ordered= f_vector[c(order(as.numeric(names(f_vector))))]

predictionScore_FTV<- f_vector_ordered
predictionScore_FTV[which((f_vector_ordered > median(f_vector_ordered)))]=2 # High Risk
predictionScore_FTV[which((f_vector_ordered <= median(f_vector_ordered)))]=1 #Low Risk
SurvObject <- Surv(time = (clinDataSurv$rfs)/365, event = clinDataSurv$rfsind)
FTV_surv <- survfit(SurvObject ~ predictionScore_FTV)
colVector= c("green", "red")
plot(FTV_surv, mark.time=TRUE, col=colVector,  xlab= "Time (years)", ylab= "Survival Probability", lwd=4, cex.axis = 2, cex.lab=1 )
LogRankTest= survdiff(SurvObject~predictionScore_FTV,  rho=0)
title('Survival curves for patients stratified by median \n baseline risk score')
print(mean(c_score_cv))


# Baseline + FTV
f_vector = vector()
c_score_cv= vector()
for (k in c(1:3)){
  test_i<- which(flds ==k)
  train_data<-(cvModelData[-test_i,])
  test_data<-cvModelData[test_i,]
  train_data<- as.data.frame(train_data)
  
  # Train survival object
  
  SurvObject_CV <- Surv(time=(cvModelData$rfs[-test_i]), event=(cvModelData$rfsind[-test_i]))
  
  ################################################
  ## Cox proportional-hazards regression
  ################################################
  
  model_cv<-coxph(data=train_data,formula=SurvObject_CV ~  HR.Pos + Her2MostPos + age +  log(FTV_Volume_T2,2) ) 
  
  pred_test<- predict( model_cv, newdata= (test_data))
  
  dd_CV <- cbind(cvModelData$rfs[test_i], cvModelData$rfsind[test_i], pred_test)
  cs_CV <- Est.Cval(mydata=dd_CV, tau=3*365.25, nofit=TRUE)
  c_score_cv<-c(c_score_cv,cs_CV$Dhat)
  vector <- pred_test
  f_vector <- c(f_vector, vector)
}

f_vector_ordered= f_vector[c(order(as.numeric(names(f_vector))))]

predictionScore_FTV<- f_vector_ordered
predictionScore_FTV[which((f_vector_ordered > median(f_vector_ordered)))]=2 # High Risk
predictionScore_FTV[which((f_vector_ordered <= median(f_vector_ordered)))]=1 #Low Risk
SurvObject <- Surv(time = (clinDataSurv$rfs)/365, event = clinDataSurv$rfsind)
FTV_surv <- survfit(SurvObject ~ predictionScore_FTV)
colVector= c("green", "red")
plot(FTV_surv, mark.time=TRUE, col=colVector,  xlab= "Time (years)", ylab= "Survival Probability", lwd=4, cex.axis = 2, cex.lab=1 )
LogRankTest= survdiff(SurvObject~predictionScore_FTV,  rho=0)
title('Survival curves for patients stratified by median \n baseline and FTV 2 risk score')
print(mean(c_score_cv))


## Baseline + FTV T2+ phenotype
f_vector = vector()
c_score_cv= vector()
for (k in c(1:3)){
  test_i<- which(flds ==k)
  train_data<-(cvModelData[-test_i,])
  test_data<-cvModelData[test_i,]
  train_data<- as.data.frame(train_data)
  
  # Train survival object
  
  SurvObject_CV <- Surv(time=(cvModelData$rfs[-test_i]), event=(cvModelData$rfsind[-test_i]))
  
  ################################################
  ## Cox proportional-hazards regression
  ################################################
  
  model_cv<-coxph(data=train_data,formula=SurvObject_CV ~  HR.Pos + Her2MostPos + age+  log(FTV_Volume_T2,2) + Cluster_assignment)
  
  pred_test<- predict( model_cv, newdata= (test_data))
  
  dd_CV <- cbind(cvModelData$rfs[test_i], cvModelData$rfsind[test_i], pred_test)
  cs_CV <- Est.Cval(mydata=dd_CV, tau=3*365.25, nofit=TRUE)
  c_score_cv<-c(c_score_cv,cs_CV$Dhat)
  vector <- pred_test
  f_vector <- c(f_vector, vector)
}

f_vector_ordered= f_vector[c(order(as.numeric(names(f_vector))))]

predictionScore_FTV<- f_vector_ordered
predictionScore_FTV[which((f_vector_ordered > median(f_vector_ordered)))]=2 # High Risk
predictionScore_FTV[which((f_vector_ordered <= median(f_vector_ordered)))]=1 #Low Risk
SurvObject <- Surv(time = (clinDataSurv$rfs)/365, event = clinDataSurv$rfsind)
FTV_surv <- survfit(SurvObject ~ predictionScore_FTV)
colVector= c("green", "red")
plot(FTV_surv, mark.time=TRUE, col=colVector,  xlab= "Time (years)", ylab= "Survival Probability", lwd=4, cex.axis = 2, cex.lab=1 )
LogRankTest= survdiff(SurvObject~predictionScore_FTV,  rho=0)
title('Survival curves for patients stratified by median \n baseline, FTV 2, and delta phenotype risk score')
print(mean(c_score_cv))







## Consensus Clustering

data_vals<- t(featureData)
result=ConsensusClusterPlus(data_vals, maxK=8, reps=50, pItem= 0.8, clusterAlg= "hc", distance="euclidean", innerLinkage= 'ward.D', finalLinkage = 'ward.D')

# SigClust 
sig1<-sigclust((featureData),100,labflag=1, label= ispy_cluster,icovest=3)
plot(sig1,arg="all")

####### Clinical Covariate Distribution


## FTV diferences 
#Low
Phenotype_1= (as.matrix(clinData$FTV_percent_change[which(ispy_cluster==1)]))
#High
Phenotype_2= (as.matrix(clinData$FTV_percent_change[which(ispy_cluster==2)]))

vioplot(Phenotype_1,  Phenotype_2,  names= c("Phenotype 1", "Phenotype 2"), col=c("red", "green"))
title("FTV across heterogeneity phenotypes")


# ER status sig test
hormone= table(clinData$ERpos, ispy_cluster);
print(hormone)
chisq.test(hormone,simulate.p.value = TRUE)


hormone= table(clinData$PgRpos, ispy_cluster);
print(hormone)
chisq.test(hormone,simulate.p.value = TRUE)


hormone= table(clinData$Her2MostPos, ispy_cluster);
print(hormone)
chisq.test(hormone,simulate.p.value = TRUE)



hormone= table(allData$Triple.Negative, ispy_cluster);
print(hormone)
chisq.test(hormone,simulate.p.value = TRUE)

hormone= table(allData$rfsind, ispy_cluster);
print(hormone)
chisq.test(hormone,simulate.p.value = TRUE)


hormone= table(allData$PCR, ispy_cluster);
print(hormone)
chisq.test(hormone,simulate.p.value = TRUE)



bp <- barplot(pcr_table*100, beside=TRUE, axes=FALSE, xlab="pCR across heterogeneity phenotypes", names=c("Phenotype 1", "Phenotype 2"), col=c("grey19", "grey90"), ylab="Frequency (%)", ylim=c(0,100))
axis(2, at=seq(0,100,10))
legend("topright", legend=c("No response", "pCR"), bty="n", fill=c("grey19", "grey90"))





kruskal.test(clinData$FTV_Volume_T2~ ispy_cluster)


##  Heterogeniety index boxplot
#High
Phenotype_1= rowMeans(as.matrix(featureData[which(ispy_cluster==1),]))
#Low
Phenotype_2=rowMeans(as.matrix(featureData[which(ispy_cluster==2),]))

vioplot(Phenotype_1, Phenotype_2, names= c("Phenotype 1", "Phenotype 2" ), col=c("red", "green"),ylim=c(-0.6, 0.6))
title("Heterogeneity index across phenotypes")

kruskal.test(rowMeans(as.matrix(featureData)), ispy_cluster)

bp <- barplot(meds, beside=TRUE, axes=FALSE, xlab="Anti-depression meds use", names=c("No", "Yes"), col=c("azure3", "azure"), ylab="Frequency (%)", ylim=c(0,100))
axis(2, at=seq(0,100,10))
legend("topright", legend=c("LOW", "HIGH"), bty="n", fill=c("azure3", "azure"))
text(bp, 0, round(medtimerx, 1), cex=1, pos=3)



## Differences in molecular signatures across het phenotpyes
# make sure to use the clinDataSurv$Cluster_assignment that has been flipped such that 1= LOW 2= HIGH
boxplot(clinDataSurv$mammaprint_refined_correlation~ Cluster_assignment,ylab='MammaPrint Correlation', data=clinDataSurv, col=c("blue", "red"), names=c("Phenotype 1", "Phenotype 2"))
boxplot(clinDataSurv$ROR_score~ Cluster_assignment,ylab='ROR-s', data=clinDataSurv, col=c("blue", "red"), names=c("Phenotype 1", "Phenotype 2"))
boxplot(clinDataSurv$p53_continuous~ Cluster_assignment,ylab='p53 mutation score', data=clinDataSurv, col=c("blue", "red"), names=c("Phenotype 1", "Phenotype 2"))

kruskal.test(clinDataSurv$mammaprint_refined_correlation~ clinDataSurv$Cluster_assignment)
kruskal.test(clinDataSurv$ROR_score~ clinDataSurv$Cluster_assignment)
kruskal.test(clinDataSurv$p53_continuous~ clinDataSurv$Cluster_assignment)





############################# Functional tumor volume anlaysis ######################################
#


## FTV curve split


featureData_temp<- featureData[- c(which(is.na(clinData$Her2MostPos))),]
ispy_cluster_temp<- ispy_cluster[- c(which(is.na(clinData$Her2MostPos)))]
clinDataSurv<-clinData[-c(which(is.na(clinData$Her2MostPos))),]
SurvObject <- Surv(time = (clinDataSurv$rfs)/365, event = clinDataSurv$rfsind)

split_FTV <- clinDataSurv$FTV_Volume_T2
split_FTV[which((clinDataSurv$FTV_Volume_T2 > median(clinDataSurv$FTV_Volume_T2)))]=2 # High Risk
split_FTV[which((clinDataSurv$FTV_Volume_T2 <= median(clinDataSurv$FTV_Volume_T2)))]=1 #Low Risk
colVector=  c("dodgerblue", "navy")
FTV_surv <- survfit(SurvObject ~ split_FTV, data=clinData)
plot(FTV_surv, mark.time=FALSE, col=colVector,  xlab= "Time (years)", ylab= "Survival Probability", lwd=3)
title("Kaplan-Meier Curves for patient groups stratified by \n median FTV T2")
summary(FTV_surv)
LogRankTest= survdiff(SurvObject~split_FTV,  rho=0)

p <-ggsurvplot(fit= FTV_surv,  conf.int=FALSE, 
               censor=FALSE, alpha=1, size=1.23, title="Kaplan-Meier curves for womens defined by \n HR+ phenotype",
               risk.table= "absolute", 
               xlab="Time (years)", 
               xlim=c(0,7),
               tables.height=0.25, 
               risk.table.title="Number at risk", 
               risk.table.col="grey25", 
               risk.table.fontsize=rel(6), 
               risk.table.y.text=FALSE, 
               risk.table.type="absolute",
               palette= c("gray5", "darkgray"))

q <- p
theme_bwEAC01 <-  theme_bw() + 
  theme(panel.border=element_rect(color="lightgray", size=0.5),
        axis.ticks=element_line(color="lightgray"), 
        axis.text.x = element_text(vjust=1),
        plot.title = element_text(vjust=1))
q$plot <- q$plot + 
  theme_bwEAC01 + 
  theme(panel.border = element_blank(),
        axis.line = element_line(color="lightgrey", size=0.5), 
        legend.position="top", 
        legend.justification = 0, 
        plot.title=element_text(size = rel(1.5)), 
        plot.subtitle=element_text(size = rel(1.25)), 
        axis.title=element_text(size = rel(1.0))
  )
q$table <- p$table + 
  theme(panel.border = element_rect(color = "lightgrey", fill=NA),
        plot.title=element_text(size = rel(1.0), color="grey25"), 
        axis.line=element_line(color="lightgrey", size=0.5)
  )
print(q)







# Survival curve for above median


above_med <- survfit(Surv((clinDataSurv$rfs[which(split_FTV==2)]/365), clinDataSurv$rfsind[which(split_FTV==2)]) ~ clinDataSurv$Cluster_assignment[which(split_FTV==2)])
below_med <- survfit(Surv((clinDataSurv$rfs[which(split_FTV==1)]/365), clinDataSurv$rfsind[which(split_FTV==1)]) ~ clinDataSurv$Cluster_assignment[which(split_FTV==1)])
FTV_surv <- survfit(SurvObject ~ split_FTV)

fit<- list(above_median_ftv = above_med, below_median_ftv= below_med, FTV= FTV_surv)

p<-  ggsurvplot_combine(fit, data= clinDataSurv, combine = TRUE, # Combine curves
           risk.table = TRUE,                  # Add risk table
           conf.int = FALSE,                    # Add confidence interval
           conf.int.style = "ribbon",            # CI style, use "step" or "ribbon"
           censor = FALSE,                     # Remove censor points
           pval="0.05",
           linetype=c("solid", "solid",  "solid", "solid", "dashed", "dashed")

          ) 

cols <-  c("blue4", "blue", "red4", "red" , "navyblue", "dodgerblue")


p$plot <- p$plot + 
  scale_color_manual(values= cols)
p$plot



##################################################
## Above median FTV survival curves
##################################################

above_phen_fit <- survfit(Surv((clinDataSurv$rfs[which(split_FTV==2)]/365), clinDataSurv$rfsind[which(split_FTV==2)]) ~ clinDataSurv$Cluster_assignment[which(split_FTV==2)])
above_fit <- list(above_median_ftv = above_phen_fit,FTV= survfit(Surv((clinDataSurv$rfs[which(split_FTV==2)]/365), clinDataSurv$rfsind[which(split_FTV==2)]) ~ 1))


p_above<-  ggsurvplot_combine(above_fit,
                       data= clinDataSurv, combine = TRUE, # Combine curves
                        risk.table = TRUE,                  # Add risk table
                        conf.int = FALSE,                    # Add confidence interval
                        conf.int.style = "ribbon",            # CI style, use "step" or "ribbon"
                        censor = FALSE,                     # Remove censor points
                        pval=TRUE,
                        linetype=c("solid", "solid", "dashed")
) 

LogRankTest= survdiff(Surv(clinDataSurv$rfs[which(split_FTV==2)], clinDataSurv$rfsind[which(split_FTV==2)]) ~ clinDataSurv$Cluster_assignment[which(split_FTV==2)],  rho=0)

cols <-  c("red", "green",  "navyblue")
p_above$plot <- p_above$plot + 
  scale_color_manual(values= cols)
p_above$plot


##################################################
## Below median FTV survival curves
##################################################

below_fit<-survfit(Surv((clinDataSurv$rfs[which(split_FTV==1)]/365), clinDataSurv$rfsind[which(split_FTV==1)]) ~ clinDataSurv$Cluster_assignment[which(split_FTV==1)])
below_fits<- list(below_median= below_fit, FTV_split= survfit(Surv((clinDataSurv$rfs[which(split_FTV==1)]/365), clinDataSurv$rfsind[which(split_FTV==1)]) ~ 1))
p_below<-  ggsurvplot_combine(below_fits,
                      data= clinDataSurv, combine = TRUE, # Combine curves
                      risk.table = TRUE,                  # Add risk table
                      conf.int = FALSE,                    # Add confidence interval
                      conf.int.style = "ribbon",            # CI style, use "step" or "ribbon"
                      censor = FALSE,                     # Remove censor points
                      pval=TRUE,
                      linetype=c("solid", "solid", "dashed"),
                      break.y.by= 0.2,
                      size= 1.25

)



cols <-  c( "red", "green", "dodgerblue")
p_below$plot <- p_below$plot + 
  scale_color_manual(values= cols)
p_below$plot

###################################################
## Survival curves for covariate subtypes


## HR+ /HER2 - 

subtype_1<- which(clinDataSurv$HR.Pos==1 & clinDataSurv$Her2MostPos == 0)
SurvObject_1 <- Surv(time = (clinDataSurv$rfs[subtype_1]/365), event = clinDataSurv$rfsind[subtype_1])
Cluster_assignment_HR=clinDataSurv$Cluster_assignment[subtype_1]
fit_1 <- survfit(data= clinDataSurv[c(subtype_1),], SurvObject_1 ~ clinDataSurv$Cluster_assignment[subtype_1])
data_new= clinDataSurv[c(subtype_1),]
plot(survfit(data=clinDataSurv[c(subtype_1),], SurvObject_1 ~ Cluster_assignment_HR), mark.time=FALSE, col=c( "red", "blue"),  xlab= "Time (years)", ylab= "Survival Probability", lwd= 2)

p <-ggsurvplot(fit= survfit(data=clinDataSurv[c(subtype_1),], SurvObject_1 ~ Cluster_assignment_HR), data= data_new, conf.int=FALSE, 
               censor=FALSE, alpha=1, size=1.23, title="Kaplan-Meier curves for womens defined by \n HR+ phenotype",
               risk.table= "absolute", 
               xlab="Time (years)", 
               xlim=c(0,7),
               tables.height=0.25, 
               risk.table.title="Number at risk", 
               risk.table.col="grey25", 
               risk.table.fontsize=rel(6), 
               risk.table.y.text=FALSE, 
               risk.table.type="absolute",
               palette= c("red", "blue"))

q <- p
theme_bwEAC01 <-  theme_bw() + 
  theme(panel.border=element_rect(color="lightgray", size=0.5),
        axis.ticks=element_line(color="lightgray"), 
        axis.text.x = element_text(vjust=1),
        plot.title = element_text(vjust=1))
q$plot <- q$plot + 
  theme_bwEAC01 + 
  theme(panel.border = element_blank(),
      #  axis.line = element_line(color="lightgrey", size=0.5), 
        legend.position="top", 
        legend.justification = 0, 
        plot.title=element_text(size = rel(1.5)), 
        plot.subtitle=element_text(size = rel(1.25)), 
        axis.title=element_text(size = rel(1.0))
  )
q$table <- p$table + 
  theme(panel.border = element_rect(color = "lightgrey", fill=NA),
        plot.title=element_text(size = rel(1.0), color="grey25"), 
        axis.line=element_line(color="lightgrey", size=0.5)
  )
print(q)







## HR- /HER2 + 

subtype_2<- which( clinDataSurv$Her2MostPos == 1)
SurvObject_2 <- Surv(time = (clinDataSurv$rfs[subtype_2]/365), event = clinDataSurv$rfsind[subtype_2])
fit_2 <- survfit(data= clinDataSurv[c(subtype_2),], SurvObject_2 ~ clinDataSurv$Cluster_assignment[subtype_2])
Cluster_assignment_HER=clinDataSurv$Cluster_assignment[subtype_2]

plot(fit_2, mark.time=FALSE, col=c( "red", "blue"),  xlab= "Time (years)", ylab= "Survival Probability", lwd= 2)
LogRankTest= survdiff(SurvObject_2 ~ clinDataSurv$Cluster_assignment[subtype_2],  rho=0)
title("Survival Curves for HER2+ stratified by \n phenotype")

p <-ggsurvplot(fit= survfit(data=clinDataSurv[c(subtype_2),], SurvObject_2 ~ Cluster_assignment_HER),  conf.int=FALSE, 
               censor=FALSE, alpha=1, size=1.23, title="Kaplan-Meier curves for womens defined by \n HR+ phenotype",
               risk.table= "absolute", 
               xlab="Time (years)", 
               xlim=c(0,7),
               tables.height=0.25, 
               risk.table.title="Number at risk", 
               risk.table.col="grey25", 
               risk.table.fontsize=rel(6), 
               risk.table.y.text=FALSE, 
               risk.table.type="absolute",
               palette= c("red", "blue"))

q <- p
theme_bwEAC01 <-  theme_bw() + 
  theme(panel.border=element_rect(color="lightgray", size=0.5),
        axis.ticks=element_line(color="lightgray"), 
        axis.text.x = element_text(vjust=1),
        plot.title = element_text(vjust=1))
q$plot <- q$plot + 
  theme_bwEAC01 + 
  theme(panel.border = element_blank(),
        axis.line = element_line(color="lightgrey", size=0.5), 
        legend.position="top", 
        legend.justification = 0, 
        plot.title=element_text(size = rel(1.5)), 
        plot.subtitle=element_text(size = rel(1.25)), 
        axis.title=element_text(size = rel(1.0))
  )
q$table <- p$table + 
  theme(panel.border = element_rect(color = "lightgrey", fill=NA),
        plot.title=element_text(size = rel(1.0), color="grey25"), 
        axis.line=element_line(color="lightgrey", size=0.5)
  )
print(q)



## HR- /HER2 - (Triple Negative) 

subtype_3<- which( clinDataSurv$Triple.Negative == 1)
SurvObject_3 <- Surv(time = (clinDataSurv$rfs[subtype_3]/365), event = clinDataSurv$rfsind[subtype_3])
fit_2 <- survfit(data= clinDataSurv[c(subtype_3),], SurvObject_3 ~ clinDataSurv$Cluster_assignment[subtype_3])
Cluster_assignment_TN=clinDataSurv$Cluster_assignment[subtype_3]

plot(fit_2, mark.time=FALSE, col=c( "red", "blue"),  xlab= "Time (years)", ylab= "Survival Probability", lwd = 2)
LogRankTest= survdiff(SurvObject_3 ~ clinDataSurv$Cluster_assignment[subtype_3],  rho=0)
title("Survival Curves for HR-/HER2- (Triple Negative) stratified by \n phenotype")
p <-ggsurvplot(fit= survfit(data=clinDataSurv[c(subtype_3),], SurvObject_3 ~ Cluster_assignment_TN),  conf.int=FALSE, 
               censor=FALSE, alpha=1, size=1.23, title="Kaplan-Meier curves for womens defined by \n HR+ phenotype",
               risk.table= "absolute", 
               xlab="Time (years)", 
               xlim=c(0,7),
               tables.height=0.25, 
               risk.table.title="Number at risk", 
               risk.table.col="grey25", 
               risk.table.fontsize=rel(6), 
               risk.table.y.text=FALSE, 
               risk.table.type="absolute",
               palette= c("red", "blue"))

q <- p
theme_bwEAC01 <-  theme_bw() + 
  theme(panel.border=element_rect(color="lightgray", size=0.5),
        axis.ticks=element_line(color="lightgray"), 
        axis.text.x = element_text(vjust=1),
        plot.title = element_text(vjust=1))
q$plot <- q$plot + 
  theme_bwEAC01 + 
  theme(panel.border = element_blank(),
        axis.line = element_line(color="lightgrey", size=0.5), 
        legend.position="top", 
        legend.justification = 0, 
        plot.title=element_text(size = rel(1.5)), 
        plot.subtitle=element_text(size = rel(1.25)), 
        axis.title=element_text(size = rel(1.0))
  )
q$table <- p$table + 
  theme(panel.border = element_rect(color = "lightgrey", fill=NA),
        plot.title=element_text(size = rel(1.0), color="grey25"), 
        axis.line=element_line(color="lightgrey", size=0.5)
  )
print(q)
#############################################################################

### Logistic regression to predict pCR

#############################################################################

model_1_pcr <- glm(PCR ~ HR.Pos + Her2MostPos + age, data=clinData_PCR, family=binomial(link="logit"))
predicted <- predict(model_1_pcr, clinData_PCR, type="response")  # predicted scores
plot(roc(predictor = predicted, response = clinData_PCR$PCR, auc=TRUE), print.auc= T)

model_2_pcr <- glm(PCR ~ HR.Pos + Her2MostPos + age+ scale(FTV_Volume_T2) , data=clinData_PCR, family=binomial(link="logit"))
predicted <- predict(model_2_pcr, clinData_PCR)  # predicted scores
plot(roc(predictor = predicted, response = clinData_PCR$PCR, auc=TRUE), print.auc= T)


model_3_pcr <- glm(PCR ~ HR.Pos + Her2MostPos + age+scale(FTV_Volume_T2) + Cluster_assignment, data=clinData_PCR, family=binomial(link="logit"))
predicted <- predict(model_3_pcr, clinData_PCR, type="response")  # predicted scores
plot(roc(predictor = predicted, response = clinData_PCR$PCR, auc=TRUE), print.auc= T)

## Cross validation
library(caret)

# define training control

clinData_PCR <- clinDataSurv[ -c(which(is.na(clinDataSurv$PCR))) , ]
clinData_PCR$PCR = as.factor(clinData_PCR$PCR)
levels(clinData_PCR$PCR) <- c("NR", "CR")

flds<-sample(rep(1:3, length.out=95))
trainIndex<- list(c(which(flds==1 | flds ==2)), c(which(flds==1 | flds ==3)), c(which(flds==2 | flds==3)))
testIndex <-list(c(which(flds==3)), c(which(flds==2)), c(which(flds==1)))

train_control <- trainControl(method = "CV", number = 3, savePredictions = T, classProbs= T, index=trainIndex, indexOut = testIndex)


# train the model on training set
model <- train(PCR ~ HR.Pos + Her2MostPos + age,
               data = clinData_PCR,
               trControl = train_control,
               method = "glm",
               
               family=binomial())

model1_results <- model$results
plot(roc(predictor = model$pred$CR, response = model$pred$obs, auc=TRUE), print.auc= T)



# train the model on training set
model_2 <- train(PCR ~ HR.Pos + Her2MostPos + age + scale(FTV_Volume_T2),
               data = clinData_PCR,
               trControl = train_control,
               method = "glm",
               
               family=binomial())

model_2_results <- model_2$results
plot(roc(predictor = model_2$pred$CR, response = model_2$pred$obs, auc=TRUE), print.auc= T)



# train the model on training set
model_3 <- train(PCR ~ HR.Pos + Her2MostPos + age + scale(FTV_Volume_T2) + Cluster_assignment,
                 data = clinData_PCR,
                 trControl = train_control,
                 method = "glm",
                 
                 family=binomial())

plot(roc(predictor = model_3$pred$CR, response = model_3$pred$obs, auc=TRUE), print.auc= T)





# train the model on training set
model_3 <- train(PCR ~Cluster_assignment,
                 data = clinData_PCR,
                 trControl = train_control,
                 method = "glm",
                 
                 family=binomial())

plot(roc(predictor = model_3$pred$CR, response = model_3$pred$obs, auc=TRUE), print.auc= T)


###################  PCR using molecular signatures


model_1_pcr <- glm(PCR ~ mammaprint_refined_correlation, data=clinData_PCR, family=binomial(link="logit"))
predicted <- predict(model_1_pcr, clinData_PCR, type="response")  # predicted scores
plot(roc(predictor = predicted, response = clinData_PCR$PCR, auc=TRUE), print.auc= T)

model_2_pcr <- glm(PCR ~ ROR_score , data=clinData_PCR, family=binomial(link="logit"))
predicted <- predict(model_2_pcr, clinData_PCR)  # predicted scores
plot(roc(predictor = predicted, response = clinData_PCR$PCR, auc=TRUE), print.auc= T)


model_3_pcr <- glm(PCR ~ p53_continuous, data=clinData_PCR, family=binomial(link="logit"))
predicted <- predict(model_3_pcr, clinData_PCR, type="response")  # predicted scores
plot(roc(predictor = predicted, response = clinData_PCR$PCR, auc=TRUE), print.auc= T)

model_4_pcr <- glm(PCR ~ mammaprint_refined_correlation+ p53_continuous + ROR_score, data=clinData_PCR, family=binomial(link="logit"))
predicted <- predict(model_4_pcr, clinData_PCR, type="response")  # predicted scores
plot(roc(predictor = predicted, response = clinData_PCR$PCR, auc=TRUE), print.auc= T)

# 3-fold cross validated
clinData_PCR$PCR = as.factor(clinData_PCR$PCR)
levels(clinData_PCR$PCR) <- c("NR", "CR")

flds<-sample(rep(1:3, length.out=95))
trainIndex<- list(c(which(flds==1 | flds ==2)), c(which(flds==1 | flds ==3)), c(which(flds==2 | flds==3)))
testIndex <-list(c(which(flds==3)), c(which(flds==2)), c(which(flds==1)))

train_control <- trainControl(method = "CV", number = 3, savePredictions = T, classProbs= T, index=trainIndex, indexOut = testIndex)


# train the model on training set
model <- train(PCR ~ mammaprint_refined_correlation,
               data = clinData_PCR,
               trControl = train_control,
               method = "glm",
               family=binomial())
plot(roc(predictor = model$pred$CR, response = model$pred$obs, auc=TRUE), print.auc= T)



# train the model on training set
model_2 <- train(PCR ~ p53_continuous,
                 data = clinData_PCR,
                 trControl = train_control,
                 method = "glm",
                 
                 family=binomial())
plot(roc(predictor = model_2$pred$CR, response = model_2$pred$obs, auc=TRUE), print.auc= T)



# train the model on training set
model_3 <- train(PCR ~ ROR_score,
                 data = clinData_PCR,
                 trControl = train_control,
                 method = "glm",
                 
                 family=binomial())

plot(roc(predictor = model_3$pred$CR, response = model_3$pred$obs, auc=TRUE), print.auc= T)


# train the model on training set
model_4 <- train(PCR ~ mammaprint_refined_correlation+p53_continuous+ ROR_score,
                 data = clinData_PCR,
                 trControl = train_control,
                 method = "glm",
                 
                 family=binomial())
plot(roc(predictor = model_4$pred$CR, response = model_4$pred$obs, auc=TRUE), print.auc= T)


# train the model on training set
model_3 <- train(PCR ~Cluster_assignment,
                 data = clinData_PCR,
                 trControl = train_control,
                 method = "glm",
                 
                 family=binomial())

plot(roc(predictor = model_3$pred$CR, response = model_3$pred$obs, auc=TRUE), print.auc= T)

############################
### Generate boxplots for representative figures

# Box plot of case FTV values. Rep image from phen 1 (low het) is 1146. Rep image from phen 2(high het) is 1203
phen_1= which(clinData$ISPY_cases_disc==1146)
phen_2= which(clinData$ISPY_cases_disc==1203)
barplot(c(clinData$FTV_Volume_T2[phen_1], clinData$FTV_Volume_T2[phen_2]), ylim = c(0,6000),
        col= c("blue", "red"), names.arg = c("Phenotype 1", "Phenotype 2"))
title(main= "FTV T2 values for representative \n image from each phenotype")


# Feature C1- PC1
barplot(c(featureData$C1_PC1[phen_1], featureData$C1_PC1[phen_2]), ylim=c(-3, 8),
        col= c("blue", "red"), names.arg = c("Phenotype 1", "Phenotype 2"))
title(main= "C1- PC1 values for representative \n image from each phenotype")



# Feature C4- PC1
barplot(c(featureData$C4_PC1[phen_1], featureData$C4_PC1[phen_2]), ylim=c(-3, 8),
        col= c("blue", "red"), names.arg = c("Phenotype 1", "Phenotype 2"))
title(main= "C4- PC1 values for representative \n image from each phenotype")


# Feature C3- PC1
barplot(c(featureData$C3_PC1[phen_1], featureData$C3_PC1[phen_2]), ylim=c(-3, 8),
        col= c("blue", "red"), names.arg = c("Phenotype 1", "Phenotype 2"))
title(main= "C4- PC1 values for representative \n image from each phenotype")


##### Boxplots of significant features for all cases across both phenotypes
temp<- clinData$Cluster_assignment
temp[which(clinData$Cluster_assignment==1)]=2
temp[which(clinData$Cluster_assignment==2)]=1
clinData$Cluster_assignment<-temp

 
boxplot((featureData$C1_PC1 ~ clinData$Cluster_assignment), col= c("blue", "red"))
kruskal.test(fea)
boxplot((featureData$C4_PC1 ~ clinData$Cluster_assignment), col= c("blue", "red"))
boxplot((featureData$C3_PC1 ~ clinData$Cluster_assignment), col= c("blue", "red"))


# Get individual delta features

## Load the data files
deltaFeatures= read_excel("new_discovery_set_data.xlsx", sheet="discovery_delta_features_2", col_names= T)
deltaFeatures<- (data.frame(deltaFeatures[ ,(c(36:77))]))
########
barplot(c(deltaFeatures$X.SER.T1_E_Morphologic_EllipseDiameter_Axis.2.[phen_1], deltaFeatures$X.SER.T1_E_Morphologic_EllipseDiameter_Axis.2.[phen_2]), ylim = c(-2,1),
        col= c("blue", "red"), names.arg = c("Phenotype 1", "Phenotype 2"))
title(main= "SER Ellipse diameter 1 for representative \n image from each phenotype")




barplot(c(deltaFeatures$X.WOS.T1_E_GLRLM_Bins.128_Radius.3_ShortRunHighGreyLevelEmphasis.[phen_1], deltaFeatures$X.WOS.T1_E_GLRLM_Bins.128_Radius.3_ShortRunHighGreyLevelEmphasis.[phen_2]), ylim = c(0,3),
        col= c("blue", "red"), names.arg = c("Phenotype 1", "Phenotype 2"))
title(main= "WOS Short run high grey level emphasis for representative \n image from each phenotype")



barplot(c(deltaFeatures$X.SER.T1_E_Morphologic_Flatness.[phen_1], deltaFeatures$X.SER.T1_E_Morphologic_Flatness.[phen_2]), ylim = c(-3,1),
        col= c("blue", "red"), names.arg = c("Phenotype 1", "Phenotype 2"))
title(main= "SER morphologic flatness for representative \n image from each phenotype")


## Treatment Chi-square tests

## Load the data files
allData1= read_excel("new_discovery_set_data.xlsx", sheet="Discovery_treatment", col_names= T)

allData1= data.frame(allData1[,c(1:8)])



hormone= table(allData1$ChemoCat, allData1$rfsind);
print(hormone)
chisq.test(hormone,simulate.p.value = TRUE)

#Herceptin
HR_cases= allData1[which(clinData$Her2MostPos==1), ]

hormone= table(HR_cases$Herceptin, HR_cases$rfsind);
print(hormone)
chisq.test(hormone,simulate.p.value = TRUE)


hormone= table(allData1$RtTherapy, allData1$rfsind);
print(hormone)
chisq.test(hormone,simulate.p.value = TRUE)



hormone= table(allData1$ChemoCat, allData1$Cluster_assignment_flipped);
print(hormone)
chisq.test(hormone,simulate.p.value = TRUE)



hormone= table(allData1$Herceptin, allData1$Cluster_assignment_flipped);
print(hormone)
chisq.test(hormone,simulate.p.value = TRUE)
