# This file is generated during the initial code to process the arrays for the classifier, as data/PanCancerMetadataTable.csv
# Copied here to be in the git repository for reproducing the figures
read_csv(here("figure_inputs/PanCancerMetadataTable.csv")) %>%
filter(!is.na(newGroup), !str_detect(newGroup, "NotUsed|Normal") ) %>%
ggplot(aes(y = fct_reorder(newGroup,newGroup, .fun = length), fill = newGroup)) +
geom_bar() +
scale_fill_manual(values = CUPiDcols) +
theme_bw() +
geom_text(
aes(label = after_stat(count)),
stat = 'count',
nudge_x = 30,
size = 3
) +
labs(x = "Count",
y = "Class") +
xlim(c(0,810))
dir.create(here("output"), showWarnings = FALSE)
ggsave(here("output/training_class_nums.pdf"), width = 6, height = 5)
Each individual classifier generates a nested tibble containing
various elements. One of these entries is the testPreds2,
which is the predictions on the mixture sets where NEITHER sample has
been used to train the classifier. We have provided a pre-processed csv
file with these predictions for figure generation, here is the code that
generated it from the classifier output.
# Sys.glob(here("data/classifier1/output/predictions_*.rds")) %>%
# map_df(function(x){read_rds(x) %>%
# dplyr::select(modelNum, testPreds2) %>%
# unnest(testPreds2) %>%
# mutate(modelNum = as.numeric(modelNum))}) %>%
# mutate(across(matches(".pred"), ~round(.,digits = 4))) %>% #round to try and get file space down for repository. Might slightly affect results compared to paper.
# dplyr::select(-sample1, -sample2, -prop1) %>% # remove not required columns (and could be regenerated from sample_name)
# arrange(sample_name, modelNum) %>% #sort to help compression
# write_rds(here("figure_inputs/merged_double_unseen_mixturesets_predictions.rds"),compress = "xz")
Note that yardstick has issues with sorting the factors compared to xgboost, e.g. one has CervSq coming before CHOL alphabetically, the other sorts without case sensitivity. Convert classes to upper case to fix this.
predMixArrays <- read_rds(here("figure_inputs/merged_double_unseen_mixturesets_predictions.rds")) %>%
mutate(newGroup = str_to_upper(newGroup)) %>%
mutate(newGroup = as.factor(newGroup)) %>%
dplyr::rename_with(str_to_upper, .cols = matches(".pred_")) %>%
dplyr::rename_with(~str_replace(.,"PRED_","pred_")) %>%
dplyr::select(modelNum, sample_name, newGroup, matches(".pred_")) %>%
dplyr::select(modelNum, sample_name, newGroup, sort(tidyselect::peek_vars())) # to resort
# use yardstick to generate the 100 individual roc aucs.
roc_aucs <- predMixArrays %>%
group_by(modelNum) %>%
roc_auc(newGroup, matches(".pred_"))
We can summarise the 100 sub-classifier AUCs, and plot them for the supplementary figure:
roc_aucs %>%
summarise(mean = mean(.estimate),
sd = sd(.estimate))
## # A tibble: 1 × 2
## mean sd
## <dbl> <dbl>
## 1 0.980 0.00546
roc_aucs %>%
mutate(.metric = "") %>%
ggplot(aes(x = .metric, y = .estimate)) +
geom_boxplot(width = 0.4, outlier.shape = NA) +
geom_jitter(width = 0.2) +
theme(axis.ticks.x = element_blank()) +
labs(x = "", y = "ROC_AUC",
title = "Multiclass AUCs for individual classifiers") +
theme_bw()
ggsave(here("output/training_individual_rocs.pdf"), width = 4, height = 4)
# use yardstick to generate the 100 individual roc curves,
roc_curves <- predMixArrays %>%
group_by(modelNum) %>%
roc_curve(newGroup, matches(".pred_"))
pp <- roc_curves %>%
ggplot(aes(x = 1 - specificity, y = sensitivity, group = modelNum, col = .level)) +
geom_path(alpha = 10 / 100) +
geom_abline(col = "black", lty = 3) +
scale_color_manual(values = CUPiDcols %>% set_names(.,nm = toupper(names(.)))) +
coord_equal() +
theme_bw() +
facet_wrap(~.level, nrow = 6, ncol = 5) +
theme(legend.position = "none") +
labs(x = "1 - Specificity",
y = "Sensitivity",
title = "ROC curve for individual classifiers by class",
subtitle = "Predicted on the left-out mixture sets for each classifier")
ggsave(plot = pp, filename = here("output/roc_curve_individual_all.png"), dpi = 1200, width = 10, height = 12)
For the combined classifier ROCs, we take each mixture set and average over the sub-classifiers where that sample wasn’t used for training. On average this will be on 4 sub-classifiers (20%*20% = 4%):
predMixArrays %>%
count(sample_name) %>%
ggplot(aes(x = n)) +
geom_histogram(col = "white", binwidth = 1) +
theme_bw() +
labs(x = "Number of sub-classifiers predicting on a mixture set",
y = "Count")
overall_roc_auc <- predMixArrays %>%
group_by(sample_name, newGroup) %>%
summarise(across(matches(".pred_"),mean)) %>%
ungroup() %>%
roc_auc(newGroup, matches(".pred_"))
overall_roc_auc %>%
knitr::kable()
| .metric | .estimator | .estimate |
|---|---|---|
| roc_auc | hand_till | 0.9843951 |
overall_roc_curve <- predMixArrays %>%
group_by(sample_name, newGroup) %>%
summarise(across(matches(".pred_"),mean)) %>%
ungroup() %>%
roc_curve(newGroup, matches(".pred_"))
p <- overall_roc_curve %>%
ggplot(aes(x = 1 - specificity, y = sensitivity, col = .level, group = .level)) +
geom_path(alpha = 1) +
geom_abline(lty = 3, col = "black") +
coord_equal() +
theme_bw() +
theme(legend.position = "none") +
scale_color_manual(values = CUPiDcols) +
labs(x = "1 - Specificity",
y = "Sensitivity",
col = "Class"
)
ggsave(here("output/cupid_roc_curve.png"), height = 3, width = 3, dpi = 600)
read_csv(here("figure_inputs/DMRs_ACC_vs_BLCA.csv")) %>%
arrange(ACC_vs_BLCA_betaDelta) %>%
mutate(used = ifelse(row_number() <= 250, "Used",NA)) %>%
arrange(desc(ACC_vs_BLCA_betaDelta)) %>%
mutate(used = ifelse(row_number() <= 250, "Used",used),
used = replace_na(used,"Not Used")) %>%
ggplot(aes(x = ACC_vs_BLCA_betaDelta, y = -log(ACC_vs_BLCA_adjPval), col = used)) +
geom_point() +
labs(x = "Difference in Average Beta Values",
y = "-log10 (Adjsuted P Value)",
col = "Used in CUPiD",
title = "Differentially Methylated Regions, ACC vs BLCA") +
theme(legend.position = "bottom") +
theme_bw()
ggsave(here("output/DMRs_ACC_vs_BLCA.png"), width = 6.5, height = 5, dpi = 300)
dir.create(here("source_data"), showWarnings = FALSE)
roc_aucs %>%
write_csv(here("source_data/roc_aucs.csv"))
read_rds(here("figure_inputs/merged_double_unseen_mixturesets_predictions.rds")) %>%
dplyr::select(-.row) %>%
dplyr::arrange(modelNum) %>%
write_csv(here("source_data/roc_data.csv"))
roc_curves %>%
dplyr::select(-.threshold) %>%
dplyr::arrange(modelNum) %>%
write_csv(here("source_data/roc_curve_pts.csv"))
overall_roc_curve %>%
dplyr::select(-.threshold) %>%
write_csv(here("source_data/ensemble_roc_curve_pts.csv"))
predMixArrays %>%
group_by(sample_name, newGroup) %>%
summarise(across(matches(".pred_"),mean)) %>%
ungroup() %>%
write_csv(here("source_data/ensemble_roc_data.csv"))
sessionInfo()
## R version 4.3.1 (2023-06-16)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Ventura 13.3
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: Europe/London
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] yardstick_1.3.0 forcats_1.0.0 readr_2.1.4 dplyr_1.1.4
## [5] stringr_1.5.1 tibble_3.2.1 ggplot2_3.4.4 purrr_1.0.2
## [9] tidyr_1.3.1 here_1.0.1
##
## loaded via a namespace (and not attached):
## [1] sass_0.4.7 utf8_1.2.4 generics_0.1.3 stringi_1.8.3
## [5] hms_1.1.3 digest_0.6.33 magrittr_2.0.3 evaluate_0.21
## [9] grid_4.3.1 fastmap_1.1.1 rprojroot_2.0.3 jsonlite_1.8.7
## [13] fansi_1.0.6 scales_1.2.1 textshaping_0.3.6 jquerylib_0.1.4
## [17] cli_3.6.2 rlang_1.1.3 crayon_1.5.2 bit64_4.0.5
## [21] munsell_0.5.0 withr_3.0.0 cachem_1.0.8 yaml_2.3.7
## [25] tools_4.3.1 parallel_4.3.1 tzdb_0.4.0 colorspace_2.1-0
## [29] vctrs_0.6.5 R6_2.5.1 lifecycle_1.0.4 bit_4.0.5
## [33] vroom_1.6.3 ragg_1.2.5 pkgconfig_2.0.3 pillar_1.9.0
## [37] bslib_0.5.1 gtable_0.3.4 glue_1.7.0 systemfonts_1.0.4
## [41] highr_0.10 xfun_0.40 tidyselect_1.2.0 rstudioapi_0.15.0
## [45] knitr_1.43 farver_2.1.1 htmltools_0.5.6 rmarkdown_2.24
## [49] labeling_0.4.3 compiler_4.3.1