load("../Data/Prostate_all.RData")
CoreTableDict <- read.table(file = "../Data/DataDictionary.csv",
header = TRUE, sep = ";",
na.strings = c(" ", "", "."),
stringsAsFactors = FALSE)
Data filtering and cleaning
clean <- function(data, discard) {
data <- subset(data, select = -which(colnames(data) %in% discard))
data <- transform(data,
PSA = ifelse(PSA == 0, 0.01, PSA),
ECOG_C = factor((ECOG_C >= 1) + (ECOG_C >= 2), levels = c(0, 1, 2)))
data$RACE_C[data$RACE_C == "Missing"] <- NA
data$REGION_C[data$REGION_C == "MISSING"] <- NA
for (i in seq_len(ncol(data))) {
if (is.factor(data[, i])) {
tmp <- as.character(data[, i])
if(any(tmp == "" | tmp == "Y", na.rm = TRUE)) {
tmp[tmp == ""] <- "NO"
tmp[tmp == "Y"] <- "YES"
data[, i] <- factor(tmp, levels = c("NO", "YES"))
}
}
if (is.character(data[, i]))
data[, i] <- factor(data[, i])
}
data$DEATH <- ifelse(data$DEATH == "NO", 0, 1)
data
}
## List of variables that are discarded before the modeling
discard <- c(
## Variables that are mostly or entirely missing in the training or validation data
"HGTBLCAT",
"WGTBLCAT",
"HEAD_AND_NECK",
"PANCREAS",
"THYROID",
"CREACLCA",
"CREACL",
"GLEAS_DX",
"STOMACH",
"BUN",
"CCRC",
"GLU",
"HEIGHTBL",
"WEIGHTBL",
"ARTTHROM",
## Treatment variables
"TRT1_ID",
"TRT1_ID",
"TRT2_ID",
"TRT3_ID",
## Discontinuation variables
"DISCONT",
"ENTRT_PC",
"ENDTRS_C",
## Age not in validation data, only age group
"AGEGRP",
## Misc
"PER_REF",
"LKADT_REF",
"LKADT_PER",
"DOMAIN",
"STUDYID",
"SMOKE",
"SMOKFREQ",
"SMOKSTAT",
"RBC",
"LYM",
"ABDOMINAL",
"MHNEOPLA",
"TSTAG_DX"
)
training <- clean(CoreTable_training, discard)
validation <- clean(CoreTable_validation, discard)
validation$RACE_C <- factor(validation$RACE_C, levels = levels(training$RACE_C))
validation$REGION_C <- factor(validation$REGION_C, levels = levels(training$REGION_C))
validation$REGION_C[is.na(validation$REGION_C)] <- "OTHER"
conVarCT <- CoreTableDict$Name[CoreTableDict$Type == "NUM" & CoreTableDict$Name %in% names(training)]
disVarCT <- CoreTableDict$Name[CoreTableDict$Type == "CHAR" & CoreTableDict$Name %in% names(training)]
## Check, should be character(0)
setdiff(c(conVarCT, disVarCT), colnames(training))
## character(0)
setdiff(c(conVarCT, disVarCT), colnames(validation))
## character(0)
Exploratory plots
source("../Exploratory/explorative.R")
Survival in each of the three studies
prostateSurv0 <- survfit(Surv(LKADT_P, DEATH == "YES") ~ 1, data = CoreTable_training)
prostateSurv <- survfit(Surv(LKADT_P, DEATH == "YES") ~ STUDYID, data = CoreTable_training)
## Log-rank test
survdiff(Surv(LKADT_P, DEATH == "YES") ~ STUDYID, data = CoreTable_training)
## Call:
## survdiff(formula = Surv(LKADT_P, DEATH == "YES") ~ STUDYID, data = CoreTable_training)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## STUDYID=ASCENT2 476 138 135 0.0672 0.0913
## STUDYID=CELGENE 526 92 100 0.7001 0.9215
## STUDYID=EFC6546 598 433 428 0.0675 0.2421
##
## Chisq= 0.9 on 2 degrees of freedom, p= 0.631
survFit <- as.data.frame(summary(prostateSurv)[c(2, 6, 8)])
survFit$Trial <- {
study <- survFit$strata;
levels(study) <- c("Sloan Kettering ", "Celgene", "Sanofi");
study}
postscript(file = "Survival.eps", width = 5, height = 5,
onefile = FALSE, horizontal = FALSE, paper = "special")
ggplot(survFit, aes(time, surv, color = Trial)) +
geom_ribbon(aes(x = time, y = NULL, ymin = lower, ymax = upper, color = NULL),
fill = "lightgray", color = "lightgray",
data = as.data.frame(summary(prostateSurv0)[c(2, 9, 10)])) +
geom_step(size = I(0.8)) +
theme(legend.position = "top") + ylim(c(0, 1)) +
xlab("Days") + ylab("Survival function") +
theme(axis.title.y = element_text(margin = margin(0, 10, 0, 0)))
dev.off()
## quartz_off_screen
## 2
Summary tables
rowSums(table(CoreTable_training[, c("STUDYID", "DEATH")]))
## ASCENT2 CELGENE EFC6546
## 476 526 598
summary(subset(CoreTable_training, STUDYID == "EFC6546")[, "LKADT_P"])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 8.0 392.0 642.5 656.7 901.5 1594.0
xtable(
addmargins(table(rbind(CoreTable_training[, c("REGION_C", "STUDYID")],
CoreTable_validation[, c("REGION_C", "STUDYID")]))),
digits = 0
)
## % latex table generated in R 3.2.2 by xtable 1.8-2 package
## % Tue May 3 09:06:17 2016
## \begin{table}[ht]
## \centering
## \begin{tabular}{rrrrrr}
## \hline
## & ASCENT2 & CELGENE & EFC6546 & AZ & Sum \\
## \hline
## EASTERN EUROPE & 0 & 84 & 127 & 50 & 261 \\
## MISSING & 476 & 6 & 0 & 0 & 482 \\
## NORTH AMERICA & 0 & 139 & 80 & 61 & 280 \\
## OTHER & 0 & 50 & 93 & 0 & 143 \\
## SOUTH AMERICA & 0 & 0 & 86 & 38 & 124 \\
## WESTERN EUROPE & 0 & 247 & 212 & 104 & 563 \\
## AFRICA & 0 & 0 & 0 & 13 & 13 \\
## ASIA/PACIFIC & 0 & 0 & 0 & 47 & 47 \\
## Sum & 476 & 526 & 598 & 313 & 1913 \\
## \hline
## \end{tabular}
## \end{table}
xtable(
addmargins(table(rbind(CoreTable_training[, c("RACE_C", "STUDYID")],
CoreTable_validation[, c("RACE_C", "STUDYID")]))),
digits = 0
)
## % latex table generated in R 3.2.2 by xtable 1.8-2 package
## % Tue May 3 09:06:17 2016
## \begin{table}[ht]
## \centering
## \begin{tabular}{rrrrrr}
## \hline
## & ASCENT2 & CELGENE & EFC6546 & AZ & Sum \\
## \hline
## Asian & 5 & 0 & 36 & 49 & 90 \\
## Black & 32 & 25 & 17 & 12 & 86 \\
## Hispanic & 14 & 0 & 0 & 0 & 14 \\
## Missing & 0 & 55 & 0 & 0 & 55 \\
## Other & 6 & 13 & 7 & 27 & 53 \\
## White & 419 & 433 & 538 & 225 & 1615 \\
## Sum & 476 & 526 & 598 & 313 & 1913 \\
## \hline
## \end{tabular}
## \end{table}
table(CoreTable_training[, c("RACE_C", "REGION_C", "STUDYID")])
## , , STUDYID = ASCENT2
##
## REGION_C
## RACE_C EASTERN EUROPE MISSING NORTH AMERICA OTHER SOUTH AMERICA
## Asian 0 5 0 0 0
## Black 0 32 0 0 0
## Hispanic 0 14 0 0 0
## Missing 0 0 0 0 0
## Other 0 6 0 0 0
## White 0 419 0 0 0
## REGION_C
## RACE_C WESTERN EUROPE
## Asian 0
## Black 0
## Hispanic 0
## Missing 0
## Other 0
## White 0
##
## , , STUDYID = CELGENE
##
## REGION_C
## RACE_C EASTERN EUROPE MISSING NORTH AMERICA OTHER SOUTH AMERICA
## Asian 0 0 0 0 0
## Black 0 1 19 2 0
## Hispanic 0 0 0 0 0
## Missing 0 1 4 7 0
## Other 0 0 6 4 0
## White 84 4 110 37 0
## REGION_C
## RACE_C WESTERN EUROPE
## Asian 0
## Black 3
## Hispanic 0
## Missing 43
## Other 3
## White 198
##
## , , STUDYID = EFC6546
##
## REGION_C
## RACE_C EASTERN EUROPE MISSING NORTH AMERICA OTHER SOUTH AMERICA
## Asian 0 0 4 30 0
## Black 0 0 7 1 6
## Hispanic 0 0 0 0 0
## Missing 0 0 0 0 0
## Other 0 0 1 1 5
## White 127 0 68 61 75
## REGION_C
## RACE_C WESTERN EUROPE
## Asian 2
## Black 3
## Hispanic 0
## Missing 0
## Other 0
## White 207
tmp <- table(rbind(CoreTable_training[, c("AGEGRP2", "STUDYID")],
CoreTable_validation[, c("AGEGRP2", "STUDYID")]))
xtable(addmargins(tmp), digits = 0)
## % latex table generated in R 3.2.2 by xtable 1.8-2 package
## % Tue May 3 09:06:17 2016
## \begin{table}[ht]
## \centering
## \begin{tabular}{rrrrrr}
## \hline
## & ASCENT2 & CELGENE & EFC6546 & AZ & Sum \\
## \hline
## $>$=75 & 154 & 109 & 125 & 61 & 449 \\
## 18-64 & 111 & 171 & 219 & 111 & 612 \\
## 65-74 & 211 & 246 & 254 & 141 & 852 \\
## Sum & 476 & 526 & 598 & 313 & 1913 \\
## \hline
## \end{tabular}
## \end{table}
tmp <- t(t(tmp) / colSums(tmp))
ggplot(aes(x = factor(AGEGRP2, levels = c("18-64", "65-74", ">=75")),
y = value, fill = STUDYID), data = melt(tmp)) +
geom_bar(stat = "identity", position = position_dodge()) +
xlab("Age") + ylab("Relative frequency")

tmp <- table(rbind(CoreTable_training[, c("RACE_C", "STUDYID")],
CoreTable_validation[, c("RACE_C", "STUDYID")]))
tmp <- t(t(tmp) / colSums(tmp))
ggplot(aes_string(x = "RACE_C",
y = "value", fill = "STUDYID"), data = melt(tmp)) +
geom_bar(stat = "identity", position = position_dodge()) +
xlab("Race") + ylab("Relative frequency")

tmp <- table(rbind(CoreTable_training[, c("REGION_C", "STUDYID")],
CoreTable_validation[, c("REGION_C", "STUDYID")]))
tmp <- t(t(tmp) / colSums(tmp))
ggplot(aes_string(x = "REGION_C",
y = "value", fill = "STUDYID"), data = melt(tmp)) +
geom_bar(stat = "identity", position = position_dodge()) +
xlab("Region") + ylab("Relative frequency")

trainTable <- CoreTableDict[CoreTableDict$Name %in% colnames(training), ][-(1:3),]
trainTable <- trainTable[order(trainTable$Type), ]
trainTable$missing <- apply(training[, trainTable$Name], 2, function(x) sum(is.na(x)))
trainTable$missingVal <- apply(validation[, trainTable$Name], 2, function(x) sum(is.na(x)))
trainTable$Name <- paste(rep(c("\\rowtwo ", "\\row "), times = 47)[-1], trainTable$Name)
trainTable$Label[40:93] <- sapply(strsplit(trainTable$Label[40:93], ":"),
function(x) tolower(x[length(x)]))
print(xtable(trainTable[, c(1, 2, 4, 8, 9)]), include.rownames = FALSE,
file = "variables2.txt")
ClassFrame
logPreds <- c("CREAT", "LDH", "NEU", "PSA", "WBC", "CREACL",
"MG", "BUN", "CCRC")
catPreds <- c("RACE_C", "REGION_C", "ECOG_C", "STUDYID",
"AGEGRP2", "SMOKE", "SMOKFREQ", "AGEGRP")
binPreds <- setdiff(names(training), c(conVarCT, catPreds))
nonPreds <- c("DOMAIN", "RPT", "LKADT_P", "DEATH", "DISCONT",
"ENDTRS_C", "ENTRT_PC", "PER_REF", "LKADT_REF",
"LKADT_PER")
classFrame <- data.frame(
names = names(training),
modeltype = factor(rep("linear", ncol(training)),
levels = c("linear", "factor", "binary")),
transform = factor(rep("id", ncol(training)), levels=c("id", "log"))
)
classFrame[classFrame$names %in% logPreds, "transform"] <- "log"
classFrame[classFrame$names %in% catPreds, "modeltype"] <- "factor"
classFrame[classFrame$names %in% binPreds, "modeltype"] <- "binary"
Main training function
trainModels <- function(train,
test,
methods = c("Stability", "Lasso", "Cox", "GB", "GB*", "GBfull", "Forest", "Forest*", "Gam"),
cF,
impute,
conVarCT,
disVarCT,
selectThres = 0.5,
save.object = FALSE,
trace = 0,
B = 100,
file.seed = NULL) {
####################
### Imputation ####
####################
idInd <- which("RPT" == names(train))
if(trace > 0) message("MCAR train impute\n")
trainMCAR <- imp(train, impute, impType = "MCAR")[, -idInd]
if(trace > 0) message("MCAR test impute\n")
testMCAR <- imp(train, impute, impType = "MCAR", predData = test)[, -idInd]
if(trace > 0) message("MAR train impute\n")
trainMAR <- imp(train, impute, cF,
alpha = 0.05, min.obs.num = nrow(train), num.preds = 5,
key = "RPT", impType = "MAR", avoid = c(nonPreds, catPreds))[, -idInd]
if(trace > 0) message("MAR test impute\n")
testMAR <- imp(train, impute, cF,
alpha = 0.05, min.obs.num = nrow(test), num.preds = 5,
key = "RPT", impType = "MAR", avoid = c(nonPreds, catPreds),
predData = test)[, -idInd]
if(trace > 0) message("MARwR train impute\n")
trainMARwR <- imp(train, impute, cF,
alpha = 0.05, min.obs.num = nrow(train), num.preds = 5,
survobject = Surv(train$LKADT_P, train$DEATH == 1),
key = "RPT", impType = "MARresp", avoid = c(nonPreds, catPreds))[, -idInd]
## testMARwR is set equal to testMAR because responses are not available
## for prediction on test data in general
testMARwR <- testMAR
results <- list()
############################
### Stability selection ####
############################
if("Stability" %in% methods) {
if(trace > 0) message("Stability selection\n")
results$stabSelectMCAR <- stabSelect(trainMCAR, trace = trace > 1, B = B)
results$stabSelectMAR <- stabSelect(trainMAR, trace = trace > 1, B = B)
results$stabSelectMARwR <- stabSelect(trainMARwR, trace = trace > 1, B = B)
varSelectMCAR <- as.character(with(results$stabSelectMCAR, var[freq > selectThres]))
varSelectMAR <- as.character(with(results$stabSelectMAR, var[freq > selectThres]))
varSelectMARwR <- as.character(with(results$stabSelectMARwR, var[freq > selectThres]))
}
#######################
### Fitting models ####
#######################
if("Lasso" %in% methods) {
if(trace > 0) message("Lasso\n")
results$lassoMCAR <- lasso(trainMCAR, testMCAR)
results$lassoMAR <- lasso(trainMAR, testMAR)
results$lassoMARwR <- lasso(trainMARwR, testMARwR)
results$lassoDebiasMCAR <- lasso(trainMCAR, testMCAR, debiased = TRUE)
results$lassoDebiasMAR <- lasso(trainMAR, testMAR, debiased = TRUE)
results$lassoDebiasMARwR <- lasso(trainMARwR, testMARwR, debiased = TRUE)
}
if("Cox" %in% methods & "Stability" %in% methods) {
## Cox with stability selection
if(trace > 0) message("Cox\n")
results$coxMCAR <- cox(trainMCAR, testMCAR, varSelectMCAR)
results$coxMAR <- cox(trainMAR, testMAR, varSelectMAR)
results$coxMARwR <- cox(trainMARwR, testMARwR, varSelectMARwR)
## Cox handles missing values by returning NA predictions
}
if("GB" %in% methods) {
## Gradient boosting with variable selection by the Lasso
if(trace > 0) message("GB\n")
results$gbMCAR <- gradBM(trainMCAR, testMCAR, ntrees = 1000, shrinkage = 1, verbose = trace > 1)
results$gbMAR <- gradBM(trainMAR, testMAR, ntrees = 1000, shrinkage = 1, verbose = trace > 1)
results$gbMARwR <- gradBM(trainMARwR, testMARwR, ntrees = 1000, shrinkage = 1, verbose = trace > 1)
}
if("GB*" %in% methods & "Stability" %in% methods) {
## Gradient boosting with stability selection
if(trace > 0) message("GB*\n")
results$gbStabMCAR <- gradBM(trainMCAR, testMCAR, varNames = varSelectMCAR, ntrees = 1000, shrinkage = 1, verbose = trace > 1)
results$gbStabMAR <- gradBM(trainMAR, testMAR, varNames = varSelectMAR, ntrees = 1000, shrinkage = 1, verbose = trace > 1)
results$gbStabMARwR <- gradBM(trainMARwR, testMARwR, varNames = varSelectMARwR, ntrees = 1000, shrinkage = 1, verbose = trace > 1)
}
if("GBfull" %in% methods) {
## Gradient boosting with no variable selection
if(trace > 0) message("GB full\n")
results$gbFullMCAR <- gradBM(trainMCAR, testMCAR, ntrees = 1000, shrinkage = 1, verbose = trace > 1, full = TRUE, save.object = save.object, file.seed = file.seed)
results$gbFullMAR <- gradBM(trainMAR, testMAR, ntrees = 1000, shrinkage = 1, verbose = trace > 1, full = TRUE, save.object = save.object, file.seed = file.seed)
results$gbFullMARwR <- gradBM(trainMARwR, testMARwR, ntrees = 1000, shrinkage = 1, verbose = trace > 1, full = TRUE, save.object = save.object, file.seed = file.seed)
}
if("Forest" %in% methods) {
## Survival forests
if(trace > 0) message("Forest\n")
results$forestMCAR <- survForest(trainMCAR, testMCAR, save.object = save.object, file.seed = file.seed)
results$forestMAR <- survForest(trainMAR, testMAR, save.object = save.object, file.seed = file.seed)
results$forestMARwR <- survForest(trainMARwR, testMARwR, save.object = save.object, file.seed = file.seed)
}
if("Forest*" %in% methods & "Stability" %in% methods) {
## Survival forests with stability selection
if(trace > 0) message("Forest*\n")
results$forestStabMCAR <- survForest(trainMCAR, testMCAR, varNames = varSelectMCAR)
results$forestStabMAR <- survForest(trainMAR, testMAR, varNames = varSelectMAR)
results$forestStabMARwR <- survForest(trainMARwR, testMARwR, varNames = varSelectMARwR)
}
if("Gam" %in% methods & "Stability" %in% methods) {
## Gam with stability selection
if(trace > 0) message("Gam\n")
results$gamMCAR <- gamPred(trainMCAR, testMCAR, varNames = varSelectMCAR, conVarCT, disVarCT)
results$gamMAR <- gamPred(trainMAR, testMAR, varNames = varSelectMAR, conVarCT, disVarCT)
results$gamMARwR <- gamPred(trainMARwR, testMARwR, varNames = varSelectMARwR, conVarCT, disVarCT)
}
results
}
CV loop
## To obtain identical results as in the paper use the three seeds
## 12, 1213 and 121314.
seed <- 121314
set.seed(seed)
k <- 5 ## number of CV folds
samp <- sample(rep(1:k, each = ceiling(nrow(training)/k)),
size = nrow(training),
replace = FALSE)
impute <- setdiff(
names(training)[apply(is.na(training), 2, any)],
nonPreds)
cF <- subset(classFrame, names %in% impute)
results <- list()
for (i in 1:k) {
train <- training[samp != i, ]
test <- training[samp == i, ]
results[[i]] <- trainModels(
train = train,
test = test,
cF = cF,
impute = impute,
conVarCT = conVarCT,
disVarCT = disVarCT,
trace = 1
)
message(paste("CV round", i, "\n"))
}
res_list <- list()
for (i in 1:k) {
test <- training[samp == i, ]
### Scoring models
tmp <- results[[i]][-grep("stabSelect", names(results[[i]]))]
scores <- sapply(tmp,
function(x)
unlist(score_q1a(test$LKADT_P, test$DEATH, x))
)
scores <- t(as.matrix(scores[, order(scores["iAUC", ], decreasing = TRUE)]))
res_list[[i]] <- scores
message(paste("CV round", i, "\n"))
}
Results
## Storing results using seed in name
save(res_list, file = paste("mainResults", seed, sep = ""))
Full models for validation submission
impute <- setdiff(
names(training)[apply(is.na(training), 2, any)],
nonPreds)
cF <- subset(classFrame, names %in% impute)
## As above, to obtain identical results as in the paper
## for the validation predictions the two seeds
### 102030 and 3020 were used.
seed.val <- 3020
set.seed(seed.val)
results <- trainModels(
train = training,
test = validation,
cF = cF,
impute = impute,
conVarCT = conVarCT,
disVarCT = disVarCT,
save.object = FALSE,
trace = 1,
file.seed = seed.val
)
tmp <- results[-grep("stabSelect", names(results))]
for(i in seq_along(tmp)) {
riskhat <- cbind(validation[, "RPT", drop = FALSE], data.frame(riskScoreGlobal = tmp[[i]]))
write.csv(riskhat, file = paste("Validation2/", names(tmp)[[i]], ".csv", sep = ""), row.names=FALSE)
}
postscript(file = "Selection2.eps", width = 5, height = 12,
onefile = FALSE, horizontal = FALSE, paper = "special")
p1 <- plot(results$stabSelectMCAR) +
theme(axis.title.y = element_text(margin = margin(0, 10, 0, 0))) +
geom_label(aes(x = 15, y = 0.8, label = "MCAR")) +
geom_abline(intercept = 0.5, slope = 0, color = "red")
p2 <- plot(results$stabSelectMAR) +
theme(axis.title.y = element_text(margin = margin(0, 10, 0, 0))) +
geom_label(aes(x = 15, y = 0.8, label = "MAR")) +
geom_abline(intercept = 0.5, slope = 0, color = "red")
p3 <- plot(results$stabSelectMARwR) +
theme(axis.title.y = element_text(margin = margin(0, 10, 0, 0))) +
geom_label(aes(x = 15, y = 0.8, label = "MARwR")) +
geom_abline(intercept = 0.5, slope = 0, color = "red")
gridExtra::grid.arrange(p1, p2, p3, ncol=1)
dev.off()