## ISPY NACT Delta phenotype analysis
## Validation analysis
# 8/7/2020

library(robustHD)
library(gplots)
library(ggplot2)
library("readxl")
library(heatmap3)
library(sigclust)
library(ConsensusClusterPlus)
library(colorRamps)
library(plotrix)
library(gmodels)
library(vioplot)
library(scales)
library(plyr)
library(lattice)
library(cluster)
library(fpc)
library(survC1)     # c-statistic
library("tidyverse")  # (Includes tibble, magrittr, tidyr, ggplot2, others)
library("survival")
library("survminer")  # survival curves in ggplot
library("survC1")
library(survival)
library(survMisc)
library("lmtest")
library(caret)
library(pROC)

## Load the validation cohort sheet.
# The original path ended in a stray space ("...xlsx "). That does not match
# the file name used for the discovery sheet later in this script, and it fails
# to resolve on file systems where a trailing space is significant.
allData <- read_excel(
  "new_discovery_set_data.xlsx",
  sheet = "validation_pca_princomp_2",
  col_names = TRUE
)
# NOTE(review): an earlier comment here read "Remove case with missing FTV2",
# but no rows are dropped at this point.

# Columns 2-7 of the sheet form the feature matrix used by the heat map and
# the sigclust test.
featureData <- data.frame(allData[, 2:7])

#clinData=data.frame(allData[, c(12:43)])
########

#featureData_reorder<-featureData[, c(ppg_heatmap$rowInd)]


# Annotation colour bars for the heat map.
# Each colByValue() call maps one clinical column of allData onto a vector of
# colours; the calls are kept in their original order.

# Phenotype (cluster) membership
ColumnCluster_col <- colByValue(allData$Cluster_assignment,
                                col = c("red", "blue"))
####################################################

# `col` is passed by name in every call below. The original ER and FTV calls
# wrote `col <- ...` inside the argument list, which only worked because the
# value happened to land in the second positional slot, and which left a stray
# global variable named `col` behind.
ER_col <- colByValue(as.matrix(allData$`ER positive`),
                     col = c("lightgreen", "darkgreen"))

PR_col <- colByValue(as.matrix(allData$`PR Positive`),
                     col = c("lightpink", "red"))

HER2_col <- colByValue(as.matrix(allData$Her2MostPos),
                       col = c("lightblue", "mediumblue"))

Trip_neg_col <- colByValue(as.matrix(allData$Triple_Negative),
                           col = c("violet", "purple4"))

Rec_col <- colByValue(as.matrix(allData$RFSind),
                      col = c("lightgreen", "darkgreen"))

#allData$Size..largest.diameter..mm <- scale(allData$Size..largest.diameter..mm)
#FTV_col <- colByValue(as.matrix(clinData$FTV1),
                     # col<-colorRampPalette(c("violet",  "purple3", "purple4"))(50))
# Continuous FTV bar: 50-step ramp from violet to purple4
FTV_col <- colByValue(as.matrix(allData$FTV_Volume_T2),
                      col = colorRampPalette(c("violet", "purple3", "purple4"))(50))

# One column per annotation bar, labelled for the heat map side bar
col_combos.val <- cbind(FTV_col, Rec_col, Trip_neg_col, HER2_col, PR_col,
                        ER_col, ColumnCluster_col)
colnames(col_combos.val) <- c("FTV 2", "Recurrence", "Triple negative",
                              "HER2 Status", "Progesterone Receptor Status",
                              "Estrogen Receptor", "Phenotypes")

#col_combos.val<- (ColumnCluster_col)
#colnames(col_combos.val) <- ("cluster")


###################################################################

# Clip feature values to [-5, 5]. featureData is modified in place, so every
# later use of it (including the sigclust test) sees the clipped values.
featureData[featureData > 5] <- 5
featureData[featureData < -5] <- -5

# Custom break points and matching palette. They are only consumed by the
# `breaks` / `col` arguments that are commented out in the heatmap3() call.
break_vals <- c(-4, -3, -2.5, -2, -1.5, -1, -0.9, -0.85, -0.8, -0.75, -0.7,
                -0.65, -0.6, -0.55, -0.5, -0.45, -0.4, -0.3, -0.2, -0.1, -0.05,
                0, 0.05, 0.1, 0.13, 0.15, 0.2, 0.22, 0.25, 0.3, 0.35, 0.4,
                0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.90, 0.95,
                1, 1.25, 1.5, 1.75, 2, 2.25, 2.5, 2.75, 3, 3.25, 3.5, 4, 4.25,
                4.5, 5)
colors.train <- colorpanel(length(break_vals) - 1, "blue", "white", "red4")

# `temp` (the feature/column order) is not defined anywhere in this script; it
# is presumably left in the workspace by the discovery-cohort analysis - TODO
# confirm. Fall back to the sheet's own column order, with a warning, instead
# of stopping with "object 'temp' not found".
if (!exists("temp")) {
  warning("`temp` (feature order) is not defined; using the original column order.",
          call. = FALSE)
  temp <- seq_len(ncol(featureData))
}
valFeatures <- featureData[, c(temp)]

# Features as rows, cases as columns; no clustering or scaling is applied
# (Rowv/Colv = NA, scale = "none"), and the clinical bars sit on the column side.
val_heatmap <- heatmap3(t(as.matrix(valFeatures)), margin = c(6, 6),
                        balanceColor = TRUE,
                        Rowv = NA,
                        Colv = NA,
                        # col=colors.train,
                        scale = "none",
                        labCol = "",
                        ColSideColors = col_combos.val,
                        RowSideLabs = "",
                        revC = TRUE,
                        #breaks= break_vals,
                        cexRow = 0.8,
                        cexCol = 0.3)



################################## Phenotype significance ##############################################
# Survival object from DFS(days) (converted to years) and the RFSind event
# indicator, stratified by phenotype.
SurvObject <- Surv(time = allData$`DFS(days)` / 365, event = allData$RFSind)
SurvfitGraph1 <- survfit(SurvObject ~ allData$Cluster_assignment)
colVector <- c("red", "blue")
plot(
  SurvfitGraph1,
  mark.time = FALSE,
  col = colVector,
  xlab = "Time (years)",
  ylab = "Survival Probability"
)
title("Kaplan-Meier Curves for patient groups defined by \n heterogeneity phenotype")
summary(SurvfitGraph1)
# survdiff() with rho = 0; printed at top level
LogRankTest <- survdiff(SurvObject ~ allData$Cluster_assignment, rho = 0)
LogRankTest



### Creating Kaplan Meier curves with table

# Same stratified fit, drawn by survminer with a number-at-risk table.
p <- ggsurvplot(
  fit = survfit(SurvObject ~ allData$Cluster_assignment),
  data = allData,
  title = "Kaplan-Meier curves for womens defined by \n heterogeneity phenotype",
  xlab = "Time (years)",
  xlim = c(0, 7),
  palette = c("red", "blue"),
  size = 1.23,
  alpha = 1,
  conf.int = FALSE,
  censor = FALSE,
  risk.table = "absolute",
  risk.table.type = "absolute",
  risk.table.title = "Number at risk",
  risk.table.col = "grey25",
  risk.table.fontsize = rel(6),
  risk.table.y.text = FALSE,
  tables.height = 0.25
)

# Work on a copy so the untouched survminer object stays available as `p`.
q <- p

# Base theme shared by the curve panel
theme_bwEAC01 <- theme_bw() +
  theme(
    panel.border = element_rect(color = "lightgray", size = 0.5),
    axis.ticks = element_line(color = "lightgray"),
    axis.text.x = element_text(vjust = 1),
    plot.title = element_text(vjust = 1)
  )

# Curve panel: base theme plus borderless layout with the legend on top
q$plot <- q$plot +
  theme_bwEAC01 +
  theme(
    panel.border = element_blank(),
    axis.line = element_line(color = "lightgrey", size = 0.5),
    legend.position = "top",
    legend.justification = 0,
    plot.title = element_text(size = rel(1.5)),
    plot.subtitle = element_text(size = rel(1.25)),
    axis.title = element_text(size = rel(1.0))
  )

# Risk table panel: light grey frame and axis line
q$table <- p$table +
  theme(
    panel.border = element_rect(color = "lightgrey", fill = NA),
    plot.title = element_text(size = rel(1.0), color = "grey25"),
    axis.line = element_line(color = "lightgrey", size = 0.5)
  )
print(q)




## Two phenotype test

# sigclust on the (clipped) feature matrix, using the stored phenotype
# assignment as the cluster labels (labflag = 1) and 1000 simulations.
sig1 <- sigclust(
  x = featureData,
  nsim = 1000,
  nrep = 1,
  labflag = 1,
  label = allData$Cluster_assignment,
  icovest = 3
)
plot(sig1, arg = "all")


## Get variables with no missing data

# Keep validation cases with a known HER2 status.
# Logical indexing replaces the original `x[-which(is.na(...)), ]`, which
# returns ZERO rows when nothing is missing (negative indexing with an empty
# index vector selects nothing).
valData <- allData[!is.na(allData$Her2MostPos), ]
# NOTE(review): row 62 is dropped by position with no stated reason, and the
# log-transform section further down drops row 46 instead - confirm which case
# is meant to be excluded.
valData <- valData[-c(62), ]
valData <- data.frame(valData)
## Validation risk prediction curves


## Load the data files (discovery cohort sheet)
discData <- read_excel("new_discovery_set_data.xlsx",
                       sheet = "discovery_pca_princomp_2", col_names = TRUE)
discData <- data.frame(discData)
cvModelData <- discData[!is.na(discData$Her2MostPos), ]


## Scale the validation FTV data based on the discovery cohort FTV mean and standard deviation

val_ftv_scale <- scale(valData$FTV_Volume_T2,
                       mean(cvModelData$FTV_Volume_T2),
                       sd(cvModelData$FTV_Volume_T2))
# add it back into the validation data
valData$FTV_Volume_T2 <- val_ftv_scale
# Then z-score the discovery functional tumor volume
FTV_Volume_T2 <- as.matrix(scale(cvModelData$FTV_Volume_T2))
cvModelData$FTV_Volume_T2 <- FTV_Volume_T2
colnames(cvModelData$FTV_Volume_T2) <- "FTV_Volume_T2"

## 3-fold resampling of the discovery cohort.
# Each iteration fits the three Cox models on the discovery rows outside fold k
# and scores the (whole) validation cohort with them.
n_folds <- 3

# One fold label per discovery row. The original hard-coded length.out = 100,
# which only lines up with the data when cvModelData has exactly 100 rows.
# No seed is set, so the fold assignment differs between runs.
flds <- sample(rep(seq_len(n_folds), length.out = nrow(cvModelData)))

# Per-fold results are stored in pre-sized containers instead of being grown
# with c()/cbind() inside the loop.
c_score_cv   <- numeric(n_folds)  # baseline
c_score_cv_2 <- numeric(n_folds)  # baseline + FTV 2
c_score_cv_3 <- numeric(n_folds)  # baseline + FTV 2 + phenotype
fold_pred   <- vector("list", n_folds)
fold_pred_2 <- vector("list", n_folds)
fold_pred_3 <- vector("list", n_folds)

for (k in seq_len(n_folds)) {
  in_train <- flds != k
  train_data <- cvModelData[in_train, ]

  # Train survival object (same rows as train_data)
  SurvObject_CV <- Surv(time = cvModelData$rfs[in_train],
                        event = cvModelData$rfsind[in_train])

  # Baseline model
  model_cv <- coxph(data = train_data,
                    formula = SurvObject_CV ~ HR.Pos + Her2MostPos + age)
  pred_test <- predict(model_cv, newdata = valData)
  # Est.Cval input: time, event indicator, risk score
  dd_CV <- cbind(valData$DFS.days., valData$RFSind, pred_test)
  cs_CV <- Est.Cval(mydata = dd_CV, tau = 3 * 365.25, nofit = TRUE)
  c_score_cv[k] <- cs_CV$Dhat
  fold_pred[[k]] <- pred_test

  # Baseline + FTV 2
  model2 <- coxph(data = train_data,
                  formula = SurvObject_CV ~ HR.Pos + Her2MostPos + age +
                    (FTV_Volume_T2))
  pred_test_2 <- predict(model2, newdata = valData)
  dd_CV_2 <- cbind(valData$DFS.days., valData$RFSind, pred_test_2)
  cs_CV_2 <- Est.Cval(mydata = dd_CV_2, tau = 3 * 365.25, nofit = TRUE)
  c_score_cv_2[k] <- cs_CV_2$Dhat
  fold_pred_2[[k]] <- pred_test_2

  # Baseline + FTV 2 + phenotype
  model3 <- coxph(data = train_data,
                  formula = SurvObject_CV ~ HR.Pos + Her2MostPos + age +
                    (FTV_Volume_T2) + Cluster_assignment)
  pred_test_3 <- predict(model3, newdata = valData)
  dd_CV_3 <- cbind(valData$DFS.days., valData$RFSind, pred_test_3)
  cs_CV_3 <- Est.Cval(mydata = dd_CV_3, tau = 3 * 365.25, nofit = TRUE)
  c_score_cv_3[k] <- cs_CV_3$Dhat
  fold_pred_3[[k]] <- pred_test_3
}

# One column of validation risk scores per fold (rows = validation cases)
f_vector   <- do.call(cbind, fold_pred)
f_vector_2 <- do.call(cbind, fold_pred_2)
f_vector_3 <- do.call(cbind, fold_pred_3)


# Mean validation c-statistic of each model across the folds
print(mean(c_score_cv))
print(mean(c_score_cv_2))
print(mean(c_score_cv_3))




## Baseline model: split validation cases at the median fold-averaged score
f_vector_ordered <- rowMeans(f_vector)
score_median <- median(f_vector_ordered)
predictionScore_FTV <- f_vector_ordered
predictionScore_FTV[which(f_vector_ordered <= score_median)] <- 1  # Low Risk
predictionScore_FTV[which(f_vector_ordered > score_median)] <- 2   # High Risk
SurvObject <- Surv(time = valData$DFS.days. / 365, event = valData$RFSind)
FTV_surv <- survfit(SurvObject ~ predictionScore_FTV)
colVector <- c("green", "red")
plot(
  FTV_surv,
  mark.time = TRUE,
  col = colVector,
  xlab = "Time (years)",
  ylab = "Survival Probability",
  lwd = 4,
  cex.axis = 2,
  cex.lab = 1
)
LogRankTest <- survdiff(SurvObject ~ predictionScore_FTV, rho = 0)
title('Survival curves for patients stratified by median \n baseline risk score')
print(mean(c_score_cv))



## Baseline + FTV 2 model: same median split
f_vector_ordered_2 <- rowMeans(f_vector_2)
score_median_2 <- median(f_vector_ordered_2)
predictionScore_2 <- f_vector_ordered_2
predictionScore_2[which(f_vector_ordered_2 <= score_median_2)] <- 1  # Low Risk
predictionScore_2[which(f_vector_ordered_2 > score_median_2)] <- 2   # High Risk
SurvObject <- Surv(time = valData$DFS.days. / 365, event = valData$RFSind)
FTV_surv <- survfit(SurvObject ~ predictionScore_2)
colVector <- c("green", "red")
plot(
  FTV_surv,
  mark.time = TRUE,
  col = colVector,
  xlab = "Time (years)",
  ylab = "Survival Probability",
  lwd = 4,
  cex.axis = 2,
  cex.lab = 1
)
LogRankTest <- survdiff(SurvObject ~ predictionScore_2, rho = 0)
title('Survival curves for patients stratified by median \n baseline and FTV 2 risk score')
print(mean(c_score_cv_2))



## Baseline + FTV 2 + phenotype model: same median split
f_vector_ordered_3 <- rowMeans(f_vector_3)
score_median_3 <- median(f_vector_ordered_3)
predictionScore_3 <- f_vector_ordered_3
predictionScore_3[which(f_vector_ordered_3 <= score_median_3)] <- 1  # Low Risk
predictionScore_3[which(f_vector_ordered_3 > score_median_3)] <- 2   # High Risk
SurvObject <- Surv(time = valData$DFS.days. / 365, event = valData$RFSind)
FTV_surv <- survfit(SurvObject ~ predictionScore_3)
colVector <- c("green", "red")
plot(
  FTV_surv,
  mark.time = TRUE,
  col = colVector,
  xlab = "Time (years)",
  ylab = "Survival Probability",
  lwd = 4,
  cex.axis = 2,
  cex.lab = 1
)
LogRankTest <- survdiff(SurvObject ~ predictionScore_3, rho = 0)
title('Survival curves for patients stratified by median \n baseline, FTV 2, and phenotype risk score')
print(mean(c_score_cv_3))




# Try log Transforming the data before putting in the model

## Get variables with no missing data
# Logical indexing replaces `x[-which(is.na(...)), ]`, which returns zero rows
# when no value is missing.
valData <- allData[!is.na(allData$Her2MostPos), ]
# NOTE(review): row 46 is dropped here, whereas the cross-validation section
# above drops row 62 - confirm which case is meant to be excluded.
valData <- valData[-c(46), ]
valData <- data.frame(valData)
## Validation risk prediction curves
## Load the data files
# Both cohorts are re-read so that FTV is back on its raw scale before the log
# transform (the earlier section overwrote it with z-scores).
discData <- read_excel("new_discovery_set_data.xlsx",
                       sheet = "discovery_pca_princomp_2", col_names = TRUE)
discData <- data.frame(discData)
cvModelData <- discData[!is.na(discData$Her2MostPos), ]

# log2 transform; an FTV of 0 would become -Inf here.
valData$FTV_Volume_T2 <- log(valData$FTV_Volume_T2, 2)
cvModelData$FTV_Volume_T2 <- log(cvModelData$FTV_Volume_T2, 2)

# Discovery-cohort survival object (time in years). It is built AFTER the
# reload so that it is guaranteed to line up row-for-row with the cvModelData
# passed to coxph(); the original built it from the previous cvModelData and
# then rebuilt the identical object a second time further down.
SurvObject <- Surv(time = (cvModelData$rfs) / 365, event = cvModelData$rfsind)

## Full model prediction
baseline_full <- coxph(data = cvModelData,
                       formula = SurvObject ~ HR.Pos + Her2MostPos + age)
baseline_FTV_full <- coxph(data = cvModelData,
                           formula = SurvObject ~ HR.Pos + Her2MostPos + age +
                             (FTV_Volume_T2))
baseline_FTV_delta <- coxph(data = cvModelData,
                            formula = SurvObject ~ HR.Pos + Her2MostPos + age +
                              (FTV_Volume_T2) + Cluster_assignment)
delta_model <- coxph(data = cvModelData,
                     formula = SurvObject ~ Cluster_assignment)

# c-statistic of the phenotype-only model on the validation cohort
pred_test <- predict(delta_model, newdata = valData)
dd_full_model <- cbind(valData$DFS.days., valData$RFSind, pred_test)
cs_full_model_phenotype <- Est.Cval(mydata = dd_full_model, tau = 3 * 365.25,
                                    nofit = TRUE)
cs_full_model_phenotype$Dhat


### Full Kaplan Meier risk prediction curves

# Baseline model fitted on the whole discovery cohort, scored on validation.
model1 <- coxph(data = cvModelData,
                formula = SurvObject ~ HR.Pos + Her2MostPos + age)
pred_test_3 <- predict(model1, newdata = valData)
dd_CV_3 <- cbind(valData$DFS.days., valData$RFSind, pred_test_3)
cs_CV_3 <- Est.Cval(mydata = dd_CV_3, tau = 3 * 365.25, nofit = TRUE)

# Median split of the validation risk score
predictionScore_FTV <- pred_test_3
predictionScore_FTV[which(pred_test_3 > median(pred_test_3))] <- 2   # High Risk
predictionScore_FTV[which(pred_test_3 <= median(pred_test_3))] <- 1  # Low Risk

# From here on SurvObject refers to the VALIDATION cohort.
SurvObject <- Surv(time = (valData$DFS.days.) / 365, event = valData$RFSind)
FTV_surv <- survfit(SurvObject ~ predictionScore_FTV)
colVector <- c("green", "red")
plot(FTV_surv, mark.time = TRUE, col = colVector, xlab = "Time (years)",
     ylab = "Survival Probability", lwd = 4, cex.axis = 2, cex.lab = 1)
LogRankTest <- survdiff(SurvObject ~ predictionScore_FTV, rho = 0)
# NOTE(review): the title mentions FTV T2, but model1 contains only
# HR.Pos + Her2MostPos + age - confirm whether the title or the model is wrong.
title('Survival curves for patients stratified by median \n baseline and FTV T2 risk score')











#####################################
## Functional Tumor Volume split
#####################################
# Splits the validation cohort at the median of valData$FTV_Volume_T2, draws
# the two-group Kaplan-Meier curve, then overlays the phenotype-stratified
# curves of each half with the overall FTV split in one survminer figure.



## FTV curve split

# Validation-cohort survival object, time converted from days to years
SurvObject <- Surv(time = (valData$DFS.days.)/365, event = valData$RFSind)

# split_FTV: 2 = above the median FTV, 1 = at or below the median
split_FTV <- valData$FTV_Volume_T2
split_FTV[which((valData$FTV_Volume_T2 > median(valData$FTV_Volume_T2)))]=2 # above median ("High Risk")
split_FTV[which((valData$FTV_Volume_T2 <= median(valData$FTV_Volume_T2)))]=1 # at or below median ("Low Risk")
colVector=  c("red2", "dodgerblue")
FTV_surv <- survfit(SurvObject ~ split_FTV)
plot(FTV_surv, mark.time=FALSE, col=colVector,  xlab= "Time (years)", ylab= "Survival Probability")
title("Kaplan-Meier Curves for patient groups stratified by \n median FTV T2")
summary(FTV_surv)
# Stored but not printed
LogRankTest= survdiff(SurvObject~split_FTV,  rho=0)


# Survival curves by phenotype within the above-median and below-median halves


above_med <- survfit(Surv((valData$DFS.days.[which(split_FTV==2)]/365), valData$RFSind[which(split_FTV==2)]) ~ valData$Cluster_assignment[which(split_FTV==2)])
below_med <- survfit(Surv((valData$DFS.days.[which(split_FTV==1)]/365), valData$RFSind[which(split_FTV==1)]) ~ valData$Cluster_assignment[which(split_FTV==1)])
FTV_surv <- survfit(SurvObject ~ split_FTV)

# Named list of the three fits handed to ggsurvplot_combine()
fit<- list(above_median_ftv = above_med, below_median_ftv= below_med, FTV= FTV_surv)

# NOTE(review): pval is given as the string "0.05" rather than TRUE or a
# computed value - presumably it is displayed verbatim; confirm the intent.
p<-  ggsurvplot_combine(fit, data= valData, combine = TRUE, # Combine curves
                        risk.table = TRUE,                  # Add risk table
                        conf.int = FALSE,                    # No confidence interval
                        conf.int.style = "ribbon",            # CI style, use "step" or "ribbon"
                        censor = FALSE,                     # Remove censor points
                        pval="0.05",
                        linetype=c("solid", "solid",  "solid", "solid", "dashed", "dashed")
                        
) 

# Six colours, one per curve; presumably in the same order as the linetype
# vector above (four phenotype curves, then the two FTV-split curves) - verify
# against the legend.
cols <-  c("blue4", "blue", "red4", "red" , "red2", "dodgerblue")


p$plot <- p$plot + 
  scale_color_manual(values= cols)
p$plot



##################################################
## Above median FTV survival curves
##################################################
# Restricts to cases with split_FTV == 2 and plots the phenotype-stratified fit
# (fit_1) together with the unstratified fit of the same cases (fit_2).

fit_1 <- survfit(Surv((valData$DFS.days.[which(split_FTV==2)]/365), valData$RFSind[which(split_FTV==2)]) ~ valData$Cluster_assignment[which(split_FTV==2)])
fit_2<- survfit(Surv((valData$DFS.days.[which(split_FTV==2)]/365), valData$RFSind[which(split_FTV==2)]) ~ 1) 
above_fit <- list(above_fit =fit_1, ftv_split= fit_2)

p_above<-  ggsurvplot(above_fit,
                      data= valData, combine = TRUE, # Combine curves
                      risk.table = TRUE,                  # Add risk table
                      conf.int = FALSE,                    # No confidence interval
                      conf.int.style = "ribbon",            # CI style, use "step" or "ribbon"
                      censor = FALSE,                     # Remove censor points
                      pval=TRUE,
                      linetype=c("solid", "solid", "dashed")
) 


# Three colours for the three curves (two phenotype curves + the pooled curve)
cols <-  c("blue4", "lightblue", "dodgerblue")
p_above$plot <- p_above$plot + 
  scale_color_manual(values= cols)
p_above$plot
 

##################################################
## Below median FTV survival curves
##################################################
# Restricts to cases with split_FTV == 1 and plots the phenotype-stratified fit
# (fit_1) together with the unstratified fit of the same cases (fit_2).
# fit_1 / fit_2 are overwritten here.

fit_1<- survfit(Surv((valData$DFS.days.[which(split_FTV==1)]/365), valData$RFSind[which(split_FTV==1)]) ~ valData$Cluster_assignment[which(split_FTV==1)])
fit_2<-survfit(Surv((valData$DFS.days.[which(split_FTV==1)]/365), valData$RFSind[which(split_FTV==1)]) ~ 1)
below_fit<- list(below_fit1= fit_1, ftv_fit= fit_2)
p_below<-  ggsurvplot(below_fit,
                      data= valData, combine = TRUE, # Combine curves
                      risk.table = TRUE,                  # Add risk table
                      conf.int = FALSE,                    # No confidence interval
                      conf.int.style = "ribbon",            # CI style, use "step" or "ribbon"
                      censor = FALSE,                     # Remove censor points
                      pval=TRUE,
                      linetype=c("solid", "solid", "dashed")
) 



# Three colours for the three curves (two phenotype curves + the pooled curve)
cols <-  c( "red4", "red", "red2")

p_below$plot <- p_below$plot + 
  scale_color_manual(values= cols)
p_below$plot

###################################################
## Survival curves for covariate subtypes
# For each receptor subtype: select the validation cases, fit Kaplan-Meier
# curves by phenotype, plot them, and store the survdiff() result.


## HR+ / HER2-

subtype_1 <- which(valData$HR.Pos == 1 & valData$Her2MostPos == 0)
SurvObject_1 <- Surv(
  time = valData$DFS.days.[subtype_1] / 365,
  event = valData$RFSind[subtype_1]
)
fit_1 <- survfit(
  SurvObject_1 ~ valData$Cluster_assignment[subtype_1],
  data = valData[subtype_1, ]
)

plot(
  fit_1,
  mark.time = FALSE,
  col = c("red", "blue"),
  xlab = "Time (years)",
  ylab = "Survival Probability",
  lwd = 2
)
LogRankTest <- survdiff(SurvObject_1 ~ valData$Cluster_assignment[subtype_1], rho = 0)
title("Survival Curves for HR+/HER2- stratified by \n phenotype")


## HER2+ (selected on Her2MostPos only, regardless of HR status)

subtype_2 <- which(valData$Her2MostPos == 1)
SurvObject_2 <- Surv(
  time = valData$DFS.days.[subtype_2] / 365,
  event = valData$RFSind[subtype_2]
)
fit_2 <- survfit(
  SurvObject_2 ~ valData$Cluster_assignment[subtype_2],
  data = valData[subtype_2, ]
)

plot(
  fit_2,
  mark.time = FALSE,
  col = c("red", "blue"),
  xlab = "Time (years)",
  ylab = "Survival Probability",
  lwd = 2
)
LogRankTest <- survdiff(SurvObject_2 ~ valData$Cluster_assignment[subtype_2], rho = 0)
title("Survival Curves for HER2+ stratified by \n phenotype")




## HR- / HER2- (Triple Negative)

subtype_3 <- which(valData$Triple_Negative == 1)
SurvObject_3 <- Surv(
  time = valData$DFS.days.[subtype_3] / 365,
  event = valData$RFSind[subtype_3]
)
# fit_2 is reused (overwritten) for the triple-negative fit
fit_2 <- survfit(
  SurvObject_3 ~ valData$Cluster_assignment[subtype_3],
  data = valData[subtype_3, ]
)

plot(
  fit_2,
  mark.time = FALSE,
  col = c("red", "blue"),
  xlab = "Time (years)",
  ylab = "Survival Probability",
  lwd = 2
)
LogRankTest <- survdiff(SurvObject_3 ~ valData$Cluster_assignment[subtype_3], rho = 0)
title("Survival Curves for HR-/HER2- (Triple Negative) stratified by \n phenotype")


###########################################################################
#############################################################################

### Logistic regression to predict pCR

#############################################################################
# Three nested logistic models are fitted on the discovery cohort
# (cvModelData) and scored on the validation cohort (valData); each ROC curve
# is drawn with its AUC printed.

# Model 1: baseline clinical covariates
model_1_pcr <- glm(PCR ~ HR.Pos + Her2MostPos + age,
                   data = cvModelData, family = binomial(link = "logit"))
predicted <- predict(model_1_pcr, valData, type = "response")  # predicted scores
plot(roc(predictor = predicted, response = valData$PCR, auc = TRUE),
     print.auc = TRUE)

# Model 2: baseline + scaled FTV
model_2_pcr <- glm(PCR ~ HR.Pos + Her2MostPos + age + scale(FTV_Volume_T2),
                   data = cvModelData, family = binomial(link = "logit"))
# type = "response" added so all three models are scored on the probability
# scale; the original call returned the linear predictor for this model only.
# The ranking of cases, and therefore the ROC curve and AUC, is unchanged.
predicted <- predict(model_2_pcr, valData, type = "response")  # predicted scores
plot(roc(predictor = predicted, response = valData$PCR, auc = TRUE),
     print.auc = TRUE)


# Model 3: baseline + scaled FTV + phenotype
model_3_pcr <- glm(PCR ~ HR.Pos + Her2MostPos + age + scale(FTV_Volume_T2) +
                     Cluster_assignment,
                   data = cvModelData, family = binomial(link = "logit"))
predicted <- predict(model_3_pcr, valData, type = "response")  # predicted scores
plot(roc(predictor = predicted, response = valData$PCR, auc = TRUE),
     print.auc = TRUE)

## Cross validation
library(caret)

# define training control

# Discovery cases with a known pCR outcome.
# Logical indexing replaces `x[-which(is.na(...)), ]`, which returns zero rows
# when no value is missing.
clinData_PCR <- cvModelData[!is.na(cvModelData$PCR), ]
clinData_PCR$PCR <- as.factor(clinData_PCR$PCR)
# Assumes exactly two outcome levels, the first being "no response" - TODO confirm
levels(clinData_PCR$PCR) <- c("NR", "CR")

# One fold label per row of clinData_PCR. The original hard-coded
# length.out = 95, which only matches the data when exactly 95 rows remain.
flds <- sample(rep(1:3, length.out = nrow(clinData_PCR)))
# Explicit train / hold-out row indices for the three resamples
trainIndex <- list(which(flds == 1 | flds == 2),
                   which(flds == 1 | flds == 3),
                   which(flds == 2 | flds == 3))
testIndex <- list(which(flds == 3), which(flds == 2), which(flds == 1))

train_control <- trainControl(method = "CV", number = 3,
                              savePredictions = TRUE, classProbs = TRUE,
                              index = trainIndex, indexOut = testIndex)


# Validation cases with a known pCR outcome (factor levels are left as read)
valData_PCR <- valData[!is.na(valData$PCR), ]
valData_PCR$PCR <- as.factor(valData_PCR$PCR)


# train the model on training set: baseline covariates
model <- train(PCR ~ HR.Pos + Her2MostPos + age,
               data = clinData_PCR,
               trControl = train_control,
               method = "glm",
               family = binomial())

model1_val_predict <- predict(model, valData_PCR, type = "prob")
plot(roc(predictor = model1_val_predict$CR, response = valData_PCR$PCR,
         auc = TRUE), print.auc = TRUE)



# train the model on training set: baseline + scaled FTV
model_2 <- train(PCR ~ HR.Pos + Her2MostPos + age + scale(FTV_Volume_T2),
                 data = clinData_PCR,
                 trControl = train_control,
                 method = "glm",
                 family = binomial())

model2_val_predict <- predict(model_2, valData_PCR, type = "prob")
plot(roc(predictor = model2_val_predict$CR, response = valData_PCR$PCR,
         auc = TRUE), print.auc = TRUE)



# train the model on training set: baseline + scaled FTV + phenotype
model_3 <- train(PCR ~ HR.Pos + Her2MostPos + age + scale(FTV_Volume_T2) +
                   Cluster_assignment,
                 data = clinData_PCR,
                 trControl = train_control,
                 method = "glm",
                 family = binomial())

model3_val_predict <- predict(model_3, valData_PCR, type = "prob")
plot(roc(predictor = model3_val_predict$CR, response = valData_PCR$PCR,
         auc = TRUE), print.auc = TRUE)




# train the model on training set: phenotype only.
# NOTE(review): this overwrites model_3 from the previous step, and the ROC
# below uses the saved hold-out predictions from the discovery folds rather
# than validation-cohort predictions - confirm both are intended.
model_3 <- train(PCR ~ Cluster_assignment,
                 data = clinData_PCR,
                 trControl = train_control,
                 method = "glm",
                 family = binomial())

plot(roc(predictor = model_3$pred$CR, response = model_3$pred$obs,
         auc = TRUE), print.auc = TRUE)
