# This file is generated during the initial code to process the arrays for the classifier, as data/PanCancerMetadataTable.csv
# Copied here to be in the git repository for reproducing the figures
read_csv(here("figure_inputs/PanCancerMetadataTable.csv")) %>%
  filter(!is.na(newGroup), !str_detect(newGroup, "NotUsed|Normal") ) %>%
  ggplot(aes(y = fct_reorder(newGroup,newGroup, .fun = length), fill = newGroup)) +
  geom_bar() +
  scale_fill_manual(values = CUPiDcols) +
  theme_bw() +
  geom_text(
    aes(label = after_stat(count)),
    stat = 'count',
    nudge_x = 30,
    size = 3
  ) +
  labs(x = "Count",
       y = "Class") + 
  xlim(c(0,810))

dir.create(here("output"), showWarnings = FALSE)
ggsave(here("output/training_class_nums.pdf"), width = 6, height = 5)

Individual Classifier Validation Performance

Each individual classifier generates a nested tibble containing various elements. One of these entries is the testPreds2, which is the predictions on the mixture sets where NEITHER sample has been used to train the classifier. We have provided a pre-processed csv file with these predictions for figure generation, here is the code that generated it from the classifier output.

# Sys.glob(here("data/classifier1/output/predictions_*.rds")) %>%
#   map_df(function(x){read_rds(x) %>%
#       dplyr::select(modelNum, testPreds2) %>%
#       unnest(testPreds2) %>%
#       mutate(modelNum = as.numeric(modelNum))}) %>%
#   mutate(across(matches(".pred"), ~round(.,digits = 4))) %>% #round to try and get file space down for repository. Might slightly affect results compared to paper.
#   dplyr::select(-sample1, -sample2, -prop1) %>% # remove not required columns (and could be regenerated from sample_name)
#   arrange(sample_name, modelNum) %>% #sort to help compression 
#   write_rds(here("figure_inputs/merged_double_unseen_mixturesets_predictions.rds"),compress = "xz")

Individual ROC curves

Note that yardstick has issues with sorting the factors compared to xgboost, e.g. one has CervSq coming before CHOL alphabetically, the other sorts without case sensitivity. Convert classes to upper case to fix this.

predMixArrays <- read_rds(here("figure_inputs/merged_double_unseen_mixturesets_predictions.rds")) %>%
  mutate(newGroup = str_to_upper(newGroup)) %>%
  mutate(newGroup = as.factor(newGroup)) %>%
  dplyr::rename_with(str_to_upper, .cols = matches(".pred_")) %>%
  dplyr::rename_with(~str_replace(.,"PRED_","pred_")) %>%
  dplyr::select(modelNum, sample_name, newGroup, matches(".pred_")) %>%
  dplyr::select(modelNum, sample_name, newGroup, sort(tidyselect::peek_vars())) # to resort

# use yardstick to generate the 100 individual roc aucs. 
roc_aucs <- predMixArrays %>%
  group_by(modelNum) %>%
  roc_auc(newGroup, matches(".pred_"))

We can summarise the 100 sub-classifier AUCs, and plot them for the supplementary figure:

roc_aucs %>%
  summarise(mean = mean(.estimate),
            sd = sd(.estimate))
## # A tibble: 1 × 2
##    mean      sd
##   <dbl>   <dbl>
## 1 0.980 0.00546
roc_aucs %>%
  mutate(.metric = "") %>%
  ggplot(aes(x = .metric, y = .estimate)) +
  geom_boxplot(width = 0.4, outlier.shape = NA) + 
  geom_jitter(width = 0.2) +
  theme(axis.ticks.x = element_blank()) + 
  labs(x = "", y = "ROC_AUC",
       title = "Multiclass AUCs for individual classifiers") +
  theme_bw()

ggsave(here("output/training_individual_rocs.pdf"), width = 4, height = 4)

Individual ROC Curves

# use yardstick to generate the 100 individual roc curves, 
roc_curves <- predMixArrays %>%
  group_by(modelNum) %>%
  roc_curve(newGroup, matches(".pred_"))

pp <- roc_curves %>%
  ggplot(aes(x = 1 - specificity, y = sensitivity, group = modelNum, col = .level)) +
   geom_path(alpha = 10 / 100) +
   geom_abline(col = "black", lty = 3) +
   scale_color_manual(values = CUPiDcols %>% set_names(.,nm = toupper(names(.)))) +
   coord_equal() +
   theme_bw() +
   facet_wrap(~.level, nrow = 6, ncol = 5) +
   theme(legend.position = "none") +
   labs(x = "1 - Specificity",
       y = "Sensitivity",
       title = "ROC curve for individual classifiers by class",
       subtitle = "Predicted on the left-out mixture sets for each classifier")

ggsave(plot = pp, filename = here("output/roc_curve_individual_all.png"), dpi = 1200, width = 10, height = 12)

CUPiD combined classifier ROCs

For the combined classifier ROCs, we take each mixture set and average over the sub-classifiers where that sample wasn’t used for training. On average this will be on 4 sub-classifiers (20%*20% = 4%):

predMixArrays %>% 
  count(sample_name) %>%
  ggplot(aes(x = n)) + 
  geom_histogram(col = "white", binwidth = 1) +
  theme_bw() +
  labs(x = "Number of sub-classifiers predicting on a mixture set",
       y = "Count")

overall_roc_auc <- predMixArrays %>%
  group_by(sample_name, newGroup) %>%
  summarise(across(matches(".pred_"),mean)) %>%
  ungroup() %>%
  roc_auc(newGroup, matches(".pred_"))

overall_roc_auc %>%
  knitr::kable()
.metric .estimator .estimate
roc_auc hand_till 0.9843951
overall_roc_curve <- predMixArrays %>%
  group_by(sample_name, newGroup) %>%
  summarise(across(matches(".pred_"),mean)) %>%
  ungroup() %>%
  roc_curve(newGroup, matches(".pred_"))

Overall ROC curve

p <- overall_roc_curve %>%
  ggplot(aes(x = 1 - specificity, y = sensitivity, col = .level, group = .level)) +
   geom_path(alpha = 1) +
   geom_abline(lty = 3, col = "black") +
   coord_equal() +
   theme_bw() +
   theme(legend.position = "none") +
   scale_color_manual(values = CUPiDcols) +
   labs(x = "1 - Specificity",
       y = "Sensitivity",
       col = "Class"
       )

ggsave(here("output/cupid_roc_curve.png"), height = 3, width = 3, dpi = 600)

Volcano plot of average DMRs between ACC and BLCA

read_csv(here("figure_inputs/DMRs_ACC_vs_BLCA.csv")) %>%
  arrange(ACC_vs_BLCA_betaDelta) %>%
  mutate(used = ifelse(row_number() <= 250, "Used",NA)) %>%
  arrange(desc(ACC_vs_BLCA_betaDelta)) %>%
  mutate(used = ifelse(row_number() <= 250, "Used",used),
         used = replace_na(used,"Not Used")) %>%
  ggplot(aes(x = ACC_vs_BLCA_betaDelta, y = -log(ACC_vs_BLCA_adjPval), col = used)) +
  geom_point() +
  labs(x = "Difference in Average Beta Values",
       y = "-log10 (Adjsuted P Value)",
       col = "Used in CUPiD",
       title = "Differentially Methylated Regions, ACC vs BLCA") +
  theme(legend.position = "bottom") +
  theme_bw() 

ggsave(here("output/DMRs_ACC_vs_BLCA.png"), width = 6.5, height = 5, dpi = 300)

Exporting Source Data file

dir.create(here("source_data"), showWarnings = FALSE)

roc_aucs %>%
  write_csv(here("source_data/roc_aucs.csv"))

read_rds(here("figure_inputs/merged_double_unseen_mixturesets_predictions.rds")) %>%
  dplyr::select(-.row) %>%
  dplyr::arrange(modelNum) %>%
  write_csv(here("source_data/roc_data.csv"))

roc_curves %>%
  dplyr::select(-.threshold) %>%
  dplyr::arrange(modelNum) %>%
  write_csv(here("source_data/roc_curve_pts.csv"))

overall_roc_curve %>%
  dplyr::select(-.threshold) %>%
  write_csv(here("source_data/ensemble_roc_curve_pts.csv"))

predMixArrays %>%
  group_by(sample_name, newGroup) %>%
  summarise(across(matches(".pred_"),mean)) %>%
  ungroup() %>%
  write_csv(here("source_data/ensemble_roc_data.csv"))
sessionInfo()
## R version 4.3.1 (2023-06-16)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Ventura 13.3
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: Europe/London
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] yardstick_1.3.0 forcats_1.0.0   readr_2.1.4     dplyr_1.1.4    
##  [5] stringr_1.5.1   tibble_3.2.1    ggplot2_3.4.4   purrr_1.0.2    
##  [9] tidyr_1.3.1     here_1.0.1     
## 
## loaded via a namespace (and not attached):
##  [1] sass_0.4.7        utf8_1.2.4        generics_0.1.3    stringi_1.8.3    
##  [5] hms_1.1.3         digest_0.6.33     magrittr_2.0.3    evaluate_0.21    
##  [9] grid_4.3.1        fastmap_1.1.1     rprojroot_2.0.3   jsonlite_1.8.7   
## [13] fansi_1.0.6       scales_1.2.1      textshaping_0.3.6 jquerylib_0.1.4  
## [17] cli_3.6.2         rlang_1.1.3       crayon_1.5.2      bit64_4.0.5      
## [21] munsell_0.5.0     withr_3.0.0       cachem_1.0.8      yaml_2.3.7       
## [25] tools_4.3.1       parallel_4.3.1    tzdb_0.4.0        colorspace_2.1-0 
## [29] vctrs_0.6.5       R6_2.5.1          lifecycle_1.0.4   bit_4.0.5        
## [33] vroom_1.6.3       ragg_1.2.5        pkgconfig_2.0.3   pillar_1.9.0     
## [37] bslib_0.5.1       gtable_0.3.4      glue_1.7.0        systemfonts_1.0.4
## [41] highr_0.10        xfun_0.40         tidyselect_1.2.0  rstudioapi_0.15.0
## [45] knitr_1.43        farver_2.1.1      htmltools_0.5.6   rmarkdown_2.24   
## [49] labeling_0.4.3    compiler_4.3.1