load("../Data/Prostate_all.RData")
CoreTableDict <- read.table(file = "../Data/DataDictionary.csv", 
                            header = TRUE, sep = ";",
                            na.strings = c(" ", "", "."),
                            stringsAsFactors = FALSE)

Data filtering and cleaning

clean <- function(data, discard) {
  data <- subset(data, select = -which(colnames(data) %in% discard))

  data <- transform(data, 
                    PSA = ifelse(PSA == 0, 0.01, PSA),
                    ECOG_C = factor((ECOG_C >= 1) + (ECOG_C >= 2), levels = c(0, 1, 2)))
  data$RACE_C[data$RACE_C == "Missing"] <- NA
  data$REGION_C[data$REGION_C == "MISSING"] <- NA
  
  for (i in seq_len(ncol(data))) {
    if (is.factor(data[, i])) {
      tmp <- as.character(data[, i])
      if(any(tmp == "" | tmp == "Y", na.rm = TRUE)) {
        tmp[tmp == ""] <- "NO"
        tmp[tmp == "Y"] <- "YES"
        data[, i] <- factor(tmp, levels = c("NO", "YES"))
      }
    }
    if (is.character(data[, i])) 
      data[, i] <- factor(data[, i])
  }

  data$DEATH <- ifelse(data$DEATH == "NO", 0, 1)
  data
}
## List of variables that are discarded before the modeling
discard <-  c(
  ## Variables that are mostly or entirely missing in the training or validation data
  "HGTBLCAT", 
  "WGTBLCAT", 
  "HEAD_AND_NECK", 
  "PANCREAS", 
  "THYROID", 
  "CREACLCA", 
  "CREACL", 
  "GLEAS_DX", 
  "STOMACH",
  "BUN",
  "CCRC",
  "GLU",
  "HEIGHTBL",
  "WEIGHTBL",
  "ARTTHROM",
  ## Treatment variables
  "TRT1_ID",  
  "TRT1_ID", 
  "TRT2_ID", 
  "TRT3_ID",
  ## Discontinuation variables
  "DISCONT", 
  "ENTRT_PC", 
  "ENDTRS_C",
  ## Age not in validation data, only age group
  "AGEGRP",
  ## Misc
  "PER_REF",
  "LKADT_REF",
  "LKADT_PER",
  "DOMAIN",
  "STUDYID",
  "SMOKE",
  "SMOKFREQ",
  "SMOKSTAT",
  "RBC",
  "LYM",
  "ABDOMINAL",
  "MHNEOPLA",
  "TSTAG_DX"
  )

training <- clean(CoreTable_training, discard)
validation <- clean(CoreTable_validation, discard)
validation$RACE_C <- factor(validation$RACE_C, levels = levels(training$RACE_C))
validation$REGION_C <- factor(validation$REGION_C, levels = levels(training$REGION_C))
validation$REGION_C[is.na(validation$REGION_C)] <- "OTHER"

conVarCT <- CoreTableDict$Name[CoreTableDict$Type == "NUM" & CoreTableDict$Name %in% names(training)]
disVarCT <- CoreTableDict$Name[CoreTableDict$Type == "CHAR" & CoreTableDict$Name %in% names(training)]

## Check, should be character(0)
setdiff(c(conVarCT, disVarCT), colnames(training))
## character(0)
setdiff(c(conVarCT, disVarCT), colnames(validation))
## character(0)

Exploratory plots

source("../Exploratory/explorative.R")

Survival in each of the three studies

prostateSurv0 <- survfit(Surv(LKADT_P, DEATH == "YES") ~ 1, data = CoreTable_training)
prostateSurv <- survfit(Surv(LKADT_P, DEATH == "YES") ~ STUDYID, data = CoreTable_training)
## Log-rank test
survdiff(Surv(LKADT_P, DEATH == "YES") ~ STUDYID, data = CoreTable_training)
## Call:
## survdiff(formula = Surv(LKADT_P, DEATH == "YES") ~ STUDYID, data = CoreTable_training)
## 
##                   N Observed Expected (O-E)^2/E (O-E)^2/V
## STUDYID=ASCENT2 476      138      135    0.0672    0.0913
## STUDYID=CELGENE 526       92      100    0.7001    0.9215
## STUDYID=EFC6546 598      433      428    0.0675    0.2421
## 
##  Chisq= 0.9  on 2 degrees of freedom, p= 0.631
survFit <- as.data.frame(summary(prostateSurv)[c(2, 6, 8)])
survFit$Trial <- {
  study <- survFit$strata; 
  levels(study) <- c("Sloan Kettering ", "Celgene", "Sanofi"); 
  study}

postscript(file = "Survival.eps", width = 5, height = 5,
           onefile = FALSE, horizontal = FALSE, paper = "special")
ggplot(survFit, aes(time, surv, color = Trial)) +
geom_ribbon(aes(x = time, y = NULL, ymin = lower, ymax = upper, color = NULL), 
            fill = "lightgray", color = "lightgray",
            data = as.data.frame(summary(prostateSurv0)[c(2, 9, 10)])) + 
  geom_step(size = I(0.8)) + 
  theme(legend.position = "top") + ylim(c(0, 1)) + 
  xlab("Days") + ylab("Survival function") + 
  theme(axis.title.y = element_text(margin = margin(0, 10, 0, 0)))
dev.off()
## quartz_off_screen 
##                 2

Summary tables

rowSums(table(CoreTable_training[, c("STUDYID", "DEATH")]))
## ASCENT2 CELGENE EFC6546 
##     476     526     598
summary(subset(CoreTable_training, STUDYID == "EFC6546")[, "LKADT_P"])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     8.0   392.0   642.5   656.7   901.5  1594.0
xtable(
  addmargins(table(rbind(CoreTable_training[, c("REGION_C", "STUDYID")], 
              CoreTable_validation[, c("REGION_C", "STUDYID")]))),
  digits = 0
)
## % latex table generated in R 3.2.2 by xtable 1.8-2 package
## % Tue May  3 09:06:17 2016
## \begin{table}[ht]
## \centering
## \begin{tabular}{rrrrrr}
##   \hline
##  & ASCENT2 & CELGENE & EFC6546 & AZ & Sum \\ 
##   \hline
## EASTERN EUROPE & 0 & 84 & 127 & 50 & 261 \\ 
##   MISSING & 476 & 6 & 0 & 0 & 482 \\ 
##   NORTH AMERICA & 0 & 139 & 80 & 61 & 280 \\ 
##   OTHER & 0 & 50 & 93 & 0 & 143 \\ 
##   SOUTH AMERICA & 0 & 0 & 86 & 38 & 124 \\ 
##   WESTERN EUROPE & 0 & 247 & 212 & 104 & 563 \\ 
##   AFRICA & 0 & 0 & 0 & 13 & 13 \\ 
##   ASIA/PACIFIC & 0 & 0 & 0 & 47 & 47 \\ 
##   Sum & 476 & 526 & 598 & 313 & 1913 \\ 
##    \hline
## \end{tabular}
## \end{table}
xtable(
  addmargins(table(rbind(CoreTable_training[, c("RACE_C", "STUDYID")], 
                         CoreTable_validation[, c("RACE_C", "STUDYID")]))),
  digits = 0
)
## % latex table generated in R 3.2.2 by xtable 1.8-2 package
## % Tue May  3 09:06:17 2016
## \begin{table}[ht]
## \centering
## \begin{tabular}{rrrrrr}
##   \hline
##  & ASCENT2 & CELGENE & EFC6546 & AZ & Sum \\ 
##   \hline
## Asian & 5 & 0 & 36 & 49 & 90 \\ 
##   Black & 32 & 25 & 17 & 12 & 86 \\ 
##   Hispanic & 14 & 0 & 0 & 0 & 14 \\ 
##   Missing & 0 & 55 & 0 & 0 & 55 \\ 
##   Other & 6 & 13 & 7 & 27 & 53 \\ 
##   White & 419 & 433 & 538 & 225 & 1615 \\ 
##   Sum & 476 & 526 & 598 & 313 & 1913 \\ 
##    \hline
## \end{tabular}
## \end{table}
table(CoreTable_training[, c("RACE_C", "REGION_C", "STUDYID")])
## , , STUDYID = ASCENT2
## 
##           REGION_C
## RACE_C     EASTERN EUROPE MISSING NORTH AMERICA OTHER SOUTH AMERICA
##   Asian                 0       5             0     0             0
##   Black                 0      32             0     0             0
##   Hispanic              0      14             0     0             0
##   Missing               0       0             0     0             0
##   Other                 0       6             0     0             0
##   White                 0     419             0     0             0
##           REGION_C
## RACE_C     WESTERN EUROPE
##   Asian                 0
##   Black                 0
##   Hispanic              0
##   Missing               0
##   Other                 0
##   White                 0
## 
## , , STUDYID = CELGENE
## 
##           REGION_C
## RACE_C     EASTERN EUROPE MISSING NORTH AMERICA OTHER SOUTH AMERICA
##   Asian                 0       0             0     0             0
##   Black                 0       1            19     2             0
##   Hispanic              0       0             0     0             0
##   Missing               0       1             4     7             0
##   Other                 0       0             6     4             0
##   White                84       4           110    37             0
##           REGION_C
## RACE_C     WESTERN EUROPE
##   Asian                 0
##   Black                 3
##   Hispanic              0
##   Missing              43
##   Other                 3
##   White               198
## 
## , , STUDYID = EFC6546
## 
##           REGION_C
## RACE_C     EASTERN EUROPE MISSING NORTH AMERICA OTHER SOUTH AMERICA
##   Asian                 0       0             4    30             0
##   Black                 0       0             7     1             6
##   Hispanic              0       0             0     0             0
##   Missing               0       0             0     0             0
##   Other                 0       0             1     1             5
##   White               127       0            68    61            75
##           REGION_C
## RACE_C     WESTERN EUROPE
##   Asian                 2
##   Black                 3
##   Hispanic              0
##   Missing               0
##   Other                 0
##   White               207
tmp <- table(rbind(CoreTable_training[, c("AGEGRP2", "STUDYID")], 
                   CoreTable_validation[, c("AGEGRP2", "STUDYID")]))

xtable(addmargins(tmp), digits = 0)
## % latex table generated in R 3.2.2 by xtable 1.8-2 package
## % Tue May  3 09:06:17 2016
## \begin{table}[ht]
## \centering
## \begin{tabular}{rrrrrr}
##   \hline
##  & ASCENT2 & CELGENE & EFC6546 & AZ & Sum \\ 
##   \hline
## $>$=75 & 154 & 109 & 125 & 61 & 449 \\ 
##   18-64 & 111 & 171 & 219 & 111 & 612 \\ 
##   65-74 & 211 & 246 & 254 & 141 & 852 \\ 
##   Sum & 476 & 526 & 598 & 313 & 1913 \\ 
##    \hline
## \end{tabular}
## \end{table}
tmp <- t(t(tmp) / colSums(tmp))

ggplot(aes(x = factor(AGEGRP2, levels = c("18-64", "65-74", ">=75")), 
      y = value, fill = STUDYID), data = melt(tmp)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  xlab("Age") + ylab("Relative frequency")

tmp <- table(rbind(CoreTable_training[, c("RACE_C", "STUDYID")], 
                         CoreTable_validation[, c("RACE_C", "STUDYID")]))


tmp <- t(t(tmp) / colSums(tmp))

ggplot(aes_string(x = "RACE_C", 
      y = "value", fill = "STUDYID"), data = melt(tmp)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  xlab("Race") + ylab("Relative frequency")

tmp <- table(rbind(CoreTable_training[, c("REGION_C", "STUDYID")], 
                         CoreTable_validation[, c("REGION_C", "STUDYID")]))

tmp <- t(t(tmp) / colSums(tmp))

ggplot(aes_string(x = "REGION_C", 
      y = "value", fill = "STUDYID"), data = melt(tmp)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  xlab("Region") + ylab("Relative frequency")

trainTable <- CoreTableDict[CoreTableDict$Name %in% colnames(training), ][-(1:3),]
trainTable <- trainTable[order(trainTable$Type), ]
trainTable$missing <- apply(training[, trainTable$Name], 2, function(x) sum(is.na(x)))
trainTable$missingVal <- apply(validation[, trainTable$Name], 2, function(x) sum(is.na(x)))
trainTable$Name <- paste(rep(c("\\rowtwo ", "\\row "), times = 47)[-1], trainTable$Name)
trainTable$Label[40:93] <- sapply(strsplit(trainTable$Label[40:93], ":"), 
       function(x) tolower(x[length(x)]))

print(xtable(trainTable[, c(1, 2, 4, 8, 9)]), include.rownames = FALSE,
      file = "variables2.txt")

ClassFrame

logPreds <- c("CREAT", "LDH", "NEU", "PSA", "WBC", "CREACL", 
              "MG", "BUN", "CCRC")
catPreds <- c("RACE_C", "REGION_C", "ECOG_C", "STUDYID",
              "AGEGRP2", "SMOKE", "SMOKFREQ", "AGEGRP")
binPreds <- setdiff(names(training), c(conVarCT, catPreds))
nonPreds <- c("DOMAIN", "RPT", "LKADT_P", "DEATH", "DISCONT",
              "ENDTRS_C", "ENTRT_PC", "PER_REF", "LKADT_REF",
              "LKADT_PER")

classFrame <- data.frame(
  names = names(training), 
  modeltype = factor(rep("linear", ncol(training)),
                     levels = c("linear", "factor", "binary")),
  transform = factor(rep("id", ncol(training)), levels=c("id", "log"))
)

classFrame[classFrame$names %in% logPreds, "transform"] <- "log"
classFrame[classFrame$names %in% catPreds, "modeltype"] <- "factor"
classFrame[classFrame$names %in% binPreds, "modeltype"] <- "binary"

Main training function

trainModels <- function(train, 
                        test, 
                        methods = c("Stability", "Lasso", "Cox", "GB", "GB*", "GBfull", "Forest", "Forest*", "Gam"),
                        cF, 
                        impute,
                        conVarCT,
                        disVarCT,
                        selectThres = 0.5,
                        save.object = FALSE,
                        trace = 0,
                        B = 100,
                        file.seed = NULL) {

  ####################
  ### Imputation  ####
  ####################
  
  idInd <- which("RPT" == names(train))
  if(trace > 0) message("MCAR train impute\n")
  trainMCAR <- imp(train, impute, impType = "MCAR")[, -idInd]
  if(trace > 0) message("MCAR test impute\n")
  testMCAR <- imp(train, impute, impType = "MCAR", predData = test)[, -idInd]
  
  if(trace > 0) message("MAR train impute\n")
  trainMAR <- imp(train, impute, cF,
                  alpha = 0.05, min.obs.num = nrow(train), num.preds = 5,
                  key = "RPT", impType = "MAR", avoid = c(nonPreds, catPreds))[, -idInd]
  if(trace > 0) message("MAR test impute\n")
  testMAR <- imp(train, impute, cF,
                 alpha = 0.05, min.obs.num = nrow(test), num.preds = 5,
                 key = "RPT", impType = "MAR", avoid = c(nonPreds, catPreds),
                 predData = test)[, -idInd]
  
  if(trace > 0) message("MARwR train impute\n")
  trainMARwR <- imp(train, impute, cF,
                    alpha = 0.05, min.obs.num = nrow(train), num.preds = 5,
                    survobject = Surv(train$LKADT_P, train$DEATH == 1),
                    key = "RPT", impType = "MARresp", avoid = c(nonPreds, catPreds))[, -idInd]
  
  ## testMARwR is set equal to testMAR because responses are not available 
  ## for prediction on test data in general
  testMARwR <- testMAR
  
  results <- list()
  
  ############################
  ### Stability selection ####
  ############################
  
  if("Stability" %in% methods) {
    if(trace > 0) message("Stability selection\n")
    results$stabSelectMCAR <- stabSelect(trainMCAR, trace = trace > 1, B = B)
    results$stabSelectMAR <- stabSelect(trainMAR, trace = trace > 1, B = B)
    results$stabSelectMARwR <- stabSelect(trainMARwR, trace = trace > 1, B = B)
    varSelectMCAR <- as.character(with(results$stabSelectMCAR, var[freq > selectThres]))
    varSelectMAR <- as.character(with(results$stabSelectMAR, var[freq > selectThres]))
    varSelectMARwR <- as.character(with(results$stabSelectMARwR, var[freq > selectThres]))
  }
  
  #######################
  ### Fitting models ####
  #######################
  
  if("Lasso" %in% methods) {
  if(trace > 0) message("Lasso\n")
    results$lassoMCAR <- lasso(trainMCAR, testMCAR)
    results$lassoMAR <- lasso(trainMAR, testMAR)
    results$lassoMARwR <- lasso(trainMARwR, testMARwR)
    results$lassoDebiasMCAR <- lasso(trainMCAR, testMCAR, debiased = TRUE)
    results$lassoDebiasMAR <- lasso(trainMAR, testMAR, debiased = TRUE)
    results$lassoDebiasMARwR <- lasso(trainMARwR, testMARwR, debiased = TRUE)
  }
  
  if("Cox" %in% methods & "Stability" %in% methods) {
    ## Cox with stability selection
    if(trace > 0) message("Cox\n")
    results$coxMCAR <- cox(trainMCAR, testMCAR, varSelectMCAR)
    results$coxMAR <- cox(trainMAR, testMAR, varSelectMAR)
    results$coxMARwR <- cox(trainMARwR, testMARwR, varSelectMARwR)
    ## Cox handles missing values by returning NA predictions
  }
  
  if("GB" %in% methods) {
    ## Gradient boosting with variable selection by the Lasso
    if(trace > 0) message("GB\n")
    results$gbMCAR <- gradBM(trainMCAR, testMCAR, ntrees = 1000, shrinkage = 1, verbose = trace > 1)
    results$gbMAR <- gradBM(trainMAR, testMAR, ntrees = 1000, shrinkage = 1, verbose = trace > 1)
    results$gbMARwR <- gradBM(trainMARwR, testMARwR, ntrees = 1000, shrinkage = 1, verbose = trace > 1)
  }
  
  if("GB*" %in% methods & "Stability" %in% methods) {
    ## Gradient boosting with stability selection
    if(trace > 0) message("GB*\n")
    results$gbStabMCAR <- gradBM(trainMCAR, testMCAR, varNames = varSelectMCAR, ntrees = 1000, shrinkage = 1, verbose = trace > 1)
    results$gbStabMAR <- gradBM(trainMAR, testMAR, varNames = varSelectMAR, ntrees = 1000, shrinkage = 1, verbose = trace > 1)
    results$gbStabMARwR <- gradBM(trainMARwR, testMARwR, varNames = varSelectMARwR, ntrees = 1000, shrinkage = 1, verbose = trace > 1)
  }
  
  if("GBfull" %in% methods) {
    ## Gradient boosting with no variable selection
    if(trace > 0) message("GB full\n")
    results$gbFullMCAR <- gradBM(trainMCAR, testMCAR, ntrees = 1000, shrinkage = 1, verbose = trace > 1, full = TRUE, save.object = save.object, file.seed = file.seed)
    results$gbFullMAR <- gradBM(trainMAR, testMAR, ntrees = 1000, shrinkage = 1, verbose = trace > 1, full = TRUE, save.object = save.object, file.seed = file.seed)
    results$gbFullMARwR <- gradBM(trainMARwR, testMARwR, ntrees = 1000, shrinkage = 1, verbose = trace > 1, full = TRUE, save.object = save.object, file.seed = file.seed)
  }
  
  if("Forest" %in% methods) {
    ## Survival forests
    if(trace > 0) message("Forest\n")
    results$forestMCAR <- survForest(trainMCAR, testMCAR, save.object = save.object, file.seed = file.seed)
    results$forestMAR <- survForest(trainMAR, testMAR, save.object = save.object, file.seed = file.seed)
    results$forestMARwR <- survForest(trainMARwR, testMARwR, save.object = save.object, file.seed = file.seed)
  }
  
  if("Forest*" %in% methods & "Stability" %in% methods) {
    ## Survival forests with stability selection
    if(trace > 0) message("Forest*\n")
    results$forestStabMCAR <- survForest(trainMCAR, testMCAR, varNames = varSelectMCAR)
    results$forestStabMAR <- survForest(trainMAR, testMAR, varNames = varSelectMAR)
    results$forestStabMARwR <- survForest(trainMARwR, testMARwR, varNames = varSelectMARwR)
  }
  
  if("Gam" %in% methods & "Stability" %in% methods) {
    ## Gam with stability selection
    if(trace > 0) message("Gam\n")
    results$gamMCAR <- gamPred(trainMCAR, testMCAR, varNames = varSelectMCAR, conVarCT, disVarCT) 
    results$gamMAR <- gamPred(trainMAR, testMAR, varNames = varSelectMAR, conVarCT, disVarCT)
    results$gamMARwR <- gamPred(trainMARwR, testMARwR, varNames = varSelectMARwR, conVarCT, disVarCT)
  }
  
  results
}

CV loop

## To obtain identical results as in the paper use the three seeds 
## 12, 1213 and 121314.
seed <- 121314
set.seed(seed)
k <- 5 ## number of CV folds
samp <- sample(rep(1:k, each = ceiling(nrow(training)/k)), 
               size = nrow(training), 
               replace = FALSE)

impute <- setdiff(
    names(training)[apply(is.na(training), 2, any)],
    nonPreds)
  
cF <- subset(classFrame, names %in% impute)

results <- list()

for (i in 1:k) {
  
  train <- training[samp != i, ]
  test <- training[samp == i, ]
 
  results[[i]] <- trainModels(
    train = train, 
    test = test, 
    cF = cF,
    impute = impute,
    conVarCT = conVarCT,
    disVarCT = disVarCT,
    trace = 1
  )
  message(paste("CV round", i, "\n"))
}
res_list <- list()

for (i in 1:k) {
  
  test <- training[samp == i, ]

  ### Scoring models
  tmp <- results[[i]][-grep("stabSelect", names(results[[i]]))]

  scores <- sapply(tmp, 
                   function(x)
                     unlist(score_q1a(test$LKADT_P, test$DEATH, x))
  )  
  
  scores <- t(as.matrix(scores[, order(scores["iAUC", ], decreasing = TRUE)]))
  res_list[[i]] <- scores
  message(paste("CV round", i, "\n"))
}

Results

## Storing results using seed in name
save(res_list, file = paste("mainResults", seed, sep = ""))

Full models for validation submission

impute <- setdiff(
    names(training)[apply(is.na(training), 2, any)],
    nonPreds)
cF <- subset(classFrame, names %in% impute)

## As above, to obtain identical results as in the paper 
## for the validation predictions the two seeds 
### 102030 and 3020 were used.

seed.val <- 3020 
set.seed(seed.val)
results <- trainModels(
  train = training, 
  test = validation, 
  cF = cF,
  impute = impute,
  conVarCT = conVarCT,
  disVarCT = disVarCT,
  save.object = FALSE,
  trace = 1,
  file.seed = seed.val
)
tmp <- results[-grep("stabSelect", names(results))]
for(i in seq_along(tmp)) {
  riskhat <- cbind(validation[, "RPT", drop = FALSE], data.frame(riskScoreGlobal = tmp[[i]]))
  write.csv(riskhat, file = paste("Validation2/", names(tmp)[[i]], ".csv", sep = ""), row.names=FALSE)
}
postscript(file = "Selection2.eps", width = 5, height = 12,
           onefile = FALSE, horizontal = FALSE, paper = "special")
p1 <- plot(results$stabSelectMCAR) + 
  theme(axis.title.y = element_text(margin = margin(0, 10, 0, 0))) +
  geom_label(aes(x = 15, y = 0.8, label = "MCAR")) +
  geom_abline(intercept = 0.5, slope = 0, color = "red")  
p2 <- plot(results$stabSelectMAR) + 
  theme(axis.title.y = element_text(margin = margin(0, 10, 0, 0))) +
  geom_label(aes(x = 15, y = 0.8, label = "MAR")) +
  geom_abline(intercept = 0.5, slope = 0, color = "red")  
p3 <- plot(results$stabSelectMARwR) + 
  theme(axis.title.y = element_text(margin = margin(0, 10, 0, 0))) +
  geom_label(aes(x = 15, y = 0.8, label = "MARwR")) +
  geom_abline(intercept = 0.5, slope = 0, color = "red")  
gridExtra::grid.arrange(p1, p2, p3, ncol=1)
dev.off()