knitr::opts_knit$set(root.dir = '../', fig.width = 12, fig.height = 8, warning = FALSE, message = FALSE)
knitr::opts_chunk$set(message = FALSE, warning = FALSE)
here::i_am("figure_generation/CUPiD_cfDNA_application.Rmd")
library(here)
library(tidyr)
library(purrr)
library(ggplot2)
library(tibble)
library(stringr)
library(dplyr)
library(readr)
library(forcats)
library(yardstick)
library(ggbeeswarm)
library(swimplot)
library(DT)
library(glue)
dir.create(here("output"), showWarnings = FALSE)
CUPiDcols <- read_rds(here("figure_inputs/CUPiD_colour_palette.rds"))
This document takes the output of applying CUPiD to the independent test cohort, and the CUP samples. These samples were not used to train the classifier.
Define a function to summarise the calls across the 100 classifiers. This function also checks for consistency for the samples with their known class:
summariseCalls <- function(preds){
predsFirst <- preds %>%
group_by(sample_name) %>%
summarise(across(matches(".pred_"), mean)) %>%
pivot_longer(matches(".pred_"), names_to = "class1", values_to = "value1", names_prefix = ".pred_") %>%
group_by(sample_name) %>%
arrange(desc(value1)) %>%
dplyr::slice(1)
predsSecond <- preds %>%
group_by(sample_name) %>%
summarise(across(matches(".pred_"), mean)) %>%
pivot_longer(matches(".pred_"), names_to = "class2", values_to = "value2", names_prefix = ".pred_") %>%
group_by(sample_name) %>%
arrange(desc(value2)) %>%
dplyr::slice(2)
predsFirst %>%
left_join(predsSecond) %>%
ungroup() %>%
left_join(preds %>% dplyr::distinct(sample_name, correct_class, ichorTumourFraction, hyperStableEdgar)) %>%
dplyr::select(sample_name, correct_class, class1, value1, class2, value2, hyperStableEdgar, ichorTumourFraction) %>%
arrange(sample_name) %>%
mutate(call = case_when(#correct_class == "CUP" ~ "CUP",
value1 < 0.5 ~ "Unclassified",
class1 == "NCC" ~ "Unclassified",
class1 == correct_class ~ "Correct",
correct_class == "CUP" ~ class1,
class1 != correct_class ~ "Wrong"
)
)
}
First we read in the output files generated by applying the 100 subclassifiers to the cfDNA samples, and split into the TARGET known tumour types, the NCCs and the CUP predictions:
individualPredictions <- read_rds(here("figure_inputs/combined_classifier_application.rds"))
CancerPreds <- individualPredictions %>%
filter(correct_class != "NCC", correct_class != "CUP")
NCCpreds <- individualPredictions %>%
filter(correct_class == "NCC")
CUPpreds <- individualPredictions %>%
filter(correct_class == "CUP")
Summarise over the 100 classifiers using the previously defined
function and look at the results. Here class1 and
value1 are the predicted class and the associated value
(between 0 and 1) for the class with the highest prediction score;
class2 and value2 are the second highest
predicted class and score.
CancerPreds %>%
summariseCalls() %>%
DT::datatable() %>%
formatRound(columns=c("value1", "value2", "ichorTumourFraction"), digits=3)
TARsummary <- CancerPreds %>%
summariseCalls() %>%
dplyr::count(call) %>%
mutate(percent = round(100*n/sum(n),2))
TARsummary %>%
DT::datatable()
CancerPreds %>%
summariseCalls() %>%
filter(call == "Wrong") %>%
DT::datatable() %>%
formatRound(columns=c("value1", "value2", "ichorTumourFraction"), digits=3)
CancerPreds %>%
summariseCalls() %>%
filter(call == "Unclassified") %>%
arrange(desc(value1)) %>%
DT::datatable() %>%
formatRound(columns=c("value1", "value2", "ichorTumourFraction"), digits=3)
CancerPreds %>%
summariseCalls() %>%
dplyr::count(call) %>%
pivot_wider(names_from = call, values_from = n, values_fill = 0) %>%
mutate(total = Correct + Unclassified + Wrong,
sensitivity = Correct/total,
specificity = 1 - (Wrong/total)) %>%
knitr::kable()
| Correct | Unclassified | Wrong | total | sensitivity | specificity |
|---|---|---|---|---|---|
| 121 | 18 | 4 | 143 | 0.8461538 | 0.972028 |
Predictions made on the 27 NCC samples:
NCCpreds %>%
summariseCalls() %>%
DT::datatable() %>%
formatRound(columns=c("value1", "value2", "ichorTumourFraction"), digits=3)
NCCpreds %>%
summariseCalls() %>%
dplyr::count(call) %>%
DT::datatable()
CancerPreds %>%
bind_rows(NCCpreds) %>%
summariseCalls() %>%
mutate(correct_class = as.factor(correct_class),
correct_class = fct_relevel(correct_class,"NCC", after = 15)) %>%
arrange(correct_class) %>%
write_csv(here("output/test_cohort_summary.csv"))
CancerPreds %>%
summariseCalls() %>%
dplyr::count(correct_class, call) %>%
pivot_wider(names_from = call, values_from = n, values_fill = 0) %>%
DT::datatable()
CancerPreds %>%
summariseCalls() %>%
mutate(ichorOver3 = ifelse(ichorTumourFraction >= 0.03, "CNA", "NoCNA"),
call = ifelse(call == "Wrong", "Incorrect",call)) %>%
dplyr::count(call, ichorOver3) %>%
pivot_wider(names_from = ichorOver3, values_from = n) %>%
mutate(Total = CNA + NoCNA) %>%
DT::datatable()
colsToUse <- CancerPreds %>% pull(correct_class) %>% na.omit() %>% unique() %>% sort()
shapeVals1 = c("BLCA" = 1, "BRCA" = 2, "CervSq" = 18, "CHOL" = 4,
"Gynae" = 6, "KIRC" = 5, "LIHC" = 8, "LowerGI" = 9, "LUAD" = 10, "LUSC" = 12,
"ACC" = 13, "UpperGI" = 15, "UpperSq" = 17, "Unclassified" = 16)
nCorrect <- TARsummary %>% filter(call == "Correct") %>% pull(n)
nUnclassified <- TARsummary %>% filter(call == "Unclassified") %>% pull(n)
nIncorrect <- TARsummary %>% filter(call == "Wrong") %>% pull(n)
CancerPreds %>%
summariseCalls() %>%
mutate(ichorTumourFraction = ichorTumourFraction*100,
ichorOver3 = ifelse(ichorTumourFraction >= 3, "CNA", "NoCNA"),
call = ifelse(call == "Wrong", "Incorrect",call),
call = ifelse(call == "Unclassified", "Unclassified", call),
call = factor(call, levels = c("Correct","Unclassified", "Incorrect"),
labels = c(glue("Correct \n(n={nCorrect})"),glue("Unclassified \n(n={nUnclassified})"), glue("Incorrect \n(n={nIncorrect})")))) %>%
ggplot(aes(x = call, y = ichorTumourFraction)) +
geom_boxplot(outlier.shape = NA) +
scale_y_log10() +
geom_quasirandom(aes(col = correct_class, shape = correct_class)) +
theme_bw() +
geom_hline(yintercept = 3, linetype = "dotted") +
labs(x = "CUPiD Concordance",
y = "Estimated Tumour Fraction (%, log scale)",
col = "Correct Class",
shape = "Correct Class") +
scale_colour_manual(values = CUPiDcols[colsToUse]) +
scale_shape_manual(values = shapeVals1)
ggsave(here("output/ichor_vs_CUPID_shapes.pdf"), width = 4, height = 4.2)
cfDNAsummaryForMetrics <- CancerPreds %>%
bind_rows(NCCpreds %>%
mutate(correct_class = "NCC")) %>%
summariseCalls() %>%
mutate(predicted = ifelse(call == "Unclassified", "Unclassified",class1),
correct_class = ifelse(correct_class == "NCC","Unclassified",correct_class))
classSpecMix <- cfDNAsummaryForMetrics %>%
{c(pull(.,correct_class),pull(.,predicted))} %>%
unique() %>%
sort()
## Note that yardstick has issues with sorting the factors compared to xgboost.
## One has CervSq coming before CHOL alphabetically, the other is reversed.
## Convert all to upper case to fix this.
cfDNAsummaryForMetrics <- cfDNAsummaryForMetrics %>%
mutate(correct_class = factor(str_to_upper(correct_class),
levels = setdiff(str_to_upper(classSpecMix),"NCC")),
predicted = factor(str_to_upper(predicted),
levels = setdiff(str_to_upper(classSpecMix),"NCC")))
#use macro weighted because of the high class size imbalance. micro and macro give similar values
cfDNAsummaryForMetrics %>%
{bind_rows(
yardstick::sensitivity(.,truth = correct_class, estimate = predicted, estimator = "macro_weighted"),
yardstick::specificity(.,truth = correct_class, estimate = predicted, estimator = "macro_weighted"),
yardstick::accuracy(.,truth = correct_class, estimate = predicted, estimator = "macro_weighted"),
yardstick::precision(.,truth = correct_class, estimate = predicted, estimator = "macro_weighted")
)
} %>%
knitr::kable()
| .metric | .estimator | .estimate |
|---|---|---|
| sensitivity | macro_weighted | 0.8705882 |
| specificity | macro_weighted | 0.9787773 |
| accuracy | multiclass | 0.8705882 |
| precision | macro_weighted | 0.9198555 |
cfDNAsummaryForMetrics %>%
mutate(cancer = as.factor(ifelse(correct_class == "UNCLASSIFIED", "NonCancer", "Cancer" )),
predictedCancer = as.factor(ifelse(predicted != "UNCLASSIFIED","Cancer","NonCancer" ))
) %>%
{bind_rows(sensitivity(.,truth = cancer, estimate = predictedCancer),
specificity(.,truth = cancer, estimate = predictedCancer))} %>%
knitr::kable()
| .metric | .estimator | .estimate |
|---|---|---|
| sensitivity | binary | 0.8741259 |
| specificity | binary | 1.0000000 |
cfDNAsummaryForMetrics %>%
group_by(correct_class) %>%
mutate(correct = ifelse(predicted == correct_class,"Correct","Incorrect"),
correct = replace_na(correct,"Incorrect")) %>%
dplyr::count(correct) %>%
pivot_wider(names_from = correct, values_from = n, values_fill = 0) %>%
mutate(Total = Correct + Incorrect,
Sensitivity = Correct/Total) %>%
DT::datatable() %>%
formatRound(columns=c("Sensitivity"), digits=3)
Now we look at the predictions on the 41 CUP samples:
CUPpreds %>%
summariseCalls() %>%
DT::datatable() %>%
formatRound(columns=c("value1", "value2", "ichorTumourFraction"), digits=3)
CUPpreds %>%
summariseCalls() %>%
dplyr::count(call) %>%
DT::datatable()
CUPsummary <- CUPpreds %>%
summariseCalls() %>%
dplyr::select(-correct_class)
CUPsummary %>%
write_csv("output/CUPsummary.csv")
CUPsummary %>%
dplyr::select(sample_name,
"CUPiD prediction" = call,
"Highest Predicted Class" = class1, "Value for Highest Predicted Class" = value1,
"Second Highest Predicted Class" = class2, "Value for Second Highest Predicted Class" = value2,
"Fraction of Hyperstable methylated windows with beta value over 0.4" = hyperStableEdgar,
"Estimated tumour fraction from ichorCNA" = ichorTumourFraction,
) %>%
write_csv(here("output/CUP_predictions.csv"))
colsToUse22 <- CUPsummary %>% pull(call) %>% na.omit() %>% unique() %>% sort()
shapeVals = c("BLCA" = 1, "BRCA" = 2, "CervSq" = 3, "CHOL" = 4, "DLBC" = 5,
"Gynae" = 6, "KIRP" = 7, "LIHC" = 8, "LowerGI" = 9, "LUAD" = 10, "LUSC" = 12,
"MESO" = 13, "SARC" = 14, "UpperGI" = 15, "UpperSq" = 17, "Unclassified" = 16)
CUPsummary %>%
mutate(ichorTumourFraction = 100*ichorTumourFraction,
ichorOver3 = ifelse(ichorTumourFraction >= 3, "CNA", "NoCNA")) %>%
mutate(predictMade = ifelse(call != "Unclassified","Prediction Made", "Unclassified"),
predictMade = factor(predictMade, levels = c("Prediction Made","Unclassified"),
labels = c("Prediction Made \n (n=32)","Unclassified \n (n=9)"))) %>%
mutate(call = fct_relevel(call, "Unclassified", after = 1000)) %>%
ggplot(aes(x = predictMade, y = ichorTumourFraction)) +
geom_boxplot(outlier.shape = NA) + scale_y_log10() +
geom_quasirandom(method = "maxout", aes(col = call, shape = call),
width = 0.2, size = 2, bandwidth = 0.8) +
theme_bw() +
geom_hline(yintercept = 3, linetype = "dotted") +
labs(x = "CUPiD Prediction Status",
y = "Estimated Tumour Fraction (%, log scale)",
col = "Call", shape = "Call") +
scale_colour_manual(values = c(CUPiDcols[colsToUse22], "Unclassified" = "#BBBBBB")) +
scale_shape_manual(values = shapeVals) +
guides(col = guide_legend(ncol = 2)) +
theme(legend.position = "right")
ggsave(here("output/ichor_vs_CUPID_CUP_shapes.pdf"), width = 6, height = 4.5)
Generate a Swimmers plot for those CUP patients with a later diagnosis, including the CUPiD predictions
CUP.event <- read_csv(here("figure_inputs/CUP.event.csv")) %>%
as.data.frame()
CUP.diagnosis <- read_csv(here("figure_inputs/CUP.diagnosis.csv")) %>%
mutate(diagnosis = ifelse(diagnosis == "iCCA","CHOL", diagnosis),
diagnosis = ifelse(diagnosis == "Sarcoma","SARC", diagnosis),
#diagnosis = ifelse(diagnosis == "Gastric","UpperGI", diagnosis),
) %>%
group_by(id) %>%
mutate(survival_length = max(Time_to)) %>%
arrange(desc(survival_length)) %>%
ungroup() %>%
mutate(survival_order = 30 - dplyr::min_rank(survival_length)) %>%
as.data.frame()
p_swim <- swimmer_plot(df = CUP.diagnosis,
id = "id",
end = 'Time_to',
name_fill = 'diagnosis',
id_order = 'survival_order',
increasing = FALSE,
col = "white",
alpha = 1,
width = .6
) +
swimmer_points(
df_points = CUP.event,
id = 'id',
time = 'Time',
name_shape = 'Event',
size = 2,
stroke = 1.5,
fill = "black",
name_col = 'Event'
) +
scale_shape_manual(
name = "Event",
values = c(
"AFP" = 2,
"Initial biopsy" = 18,
"Liquid biopsy" = 20,
"Plasma TARC" = 1,
"Repeat biopsy" = 5,
"MDT review" = 3
),
breaks = c(
"Initial biopsy",
"Liquid biopsy",
"Plasma TARC",
"Repeat biopsy",
"MDT review",
"AFP"
)
) +
scale_colour_manual(
name = "Event",
values = c(
"AFP" = 'gold',
"Initial biopsy" = 'deepskyblue',
"Liquid biopsy" = 'darkblue',
"Plasma TARC" = 'brown',
"Repeat biopsy" = 'black',
"MDT review" = 'firebrick3'
),
breaks = c(
"Initial biopsy",
"Liquid biopsy",
"Plasma TARC",
"Repeat biopsy",
"MDT review",
"AFP"
)
) +
scale_y_continuous(name = "Time since CUP diagnosis (months)",breaks = seq(0,50,by = 5)) +
labs(shape = "Event", colour = "Event", fill = "Diagnosis") +
labs(x = "Sample ID") +
theme_classic() +
swimmer_arrows(df_arrows = CUP.diagnosis, id = 'id',
arrow_start = 'Time_to',
cont = 'Alive',
name_col = 'Alive',
show.legend = FALSE,
type = "open",
cex = 0.5,
colour = "black") +
scale_fill_manual(name = "Diagnosis", values = c("CUP" = "#BBBBBB", "Gastric" = "#e6ab02","Lymphoma"='#d62728',
"CHOL" = "#666666", "LUAD" = "#1f77b4", "SARC" = "#C5B6CE",
"Gynae" = "#bc80bd", "BLCA" = "#2ca02c","BRCA" ="#e377c2",
"CRC" = "#a6761d", "BLCA" = "#2ca02c",
"Correct" = "#1b9277", "Incorrect" = "#d62728")) +
theme(legend.direction = "vertical", legend.position = "right",
legend.box = "horizontal", legend.key.size = unit(0.5, "cm"),
legend.justification = "right", legend.title = element_text(colour = 'black'))
ggsave(here("output/swimplot.pdf"), height = 6, width = 9)
dir.create(here("source_data"), showWarnings = FALSE)
CUP.diagnosis %>%
dplyr::rename(sample_name = id) %>%
left_join(CUPsummary %>% dplyr::select(sample_name, CUPiD = call)) %>%
mutate(CUPiD = ifelse(CUPiD %in% c("Unclassified","NCC"), NA,CUPiD),
concordance = ifelse(diagnosis == CUPiD | (diagnosis == "Lymphoma" & CUPiD == "DLBC"), "Consistent", "Inconsistent")) %>%
arrange(survival_length) %>% mutate(concordance = ifelse(diagnosis == "CUP", NA, concordance)) %>%
write_csv(here("source_data/swimmers_plot_diagnosis.csv"))
CNVmatrixCUP <- read_rds(here("figure_inputs/CNV_CUP.rds")) %>%
as_tibble() %>%
dplyr::select(-tidyselect::matches("width|strand")) %>%
dplyr::mutate(window = paste0(seqnames, ":",start, "-",end)) %>%
tibble::remove_rownames() %>%
tibble::column_to_rownames("window") %>%
dplyr::select(-seqnames, -start, -end) %>%
as.matrix() %>%
t()
colAnno <- CNVmatrixCUP %>%
colnames() %>%
enframe() %>%
dplyr::select(value) %>%
mutate(chr = as.numeric(str_remove(value,":.*")) %% 2 ) %>%
column_to_rownames("value")
rowAnno <- CUPsummary %>%
dplyr::select(sample_name, call, tumourFraction = ichorTumourFraction) %>%
dplyr::arrange(desc(tumourFraction)) %>%
mutate(call = factor(call, levels = colsToUse22),
call = fct_relevel(call, "Unclassified",after = 30)) %>%
column_to_rownames("sample_name")
CUPiDcolsCNV <- CUPiDcols
CUPiDcolsCNV["NCC"] <- "#BBBBBB"
names(CUPiDcolsCNV) <- str_replace(names(CUPiDcolsCNV),"NCC","Unclassified")
CUPiDcolsCNV <- CUPiDcolsCNV[levels(rowAnno$call) ]
CNVmatrixCUP[rownames(rowAnno), ] %>%
pheatmap::pheatmap(cluster_rows = FALSE,
cluster_cols = FALSE,
show_colnames = FALSE,
show_rownames = FALSE,
annotation_row = rowAnno,
breaks = seq(from = -1, to = 1, length.out = 8),
annotation_colors = list(chr = c("1" = "darkgrey", "2" = "lightgrey"),
call = CUPiDcolsCNV),
annotation_col = colAnno,
gaps_col = colAnno %>% dplyr::pull(chr) %>% diff() %>% {which(. != 0)},
color = rev(RColorBrewer::brewer.pal(name = "RdBu", n = 7)))
CNVmatrixCUP[rownames(rowAnno), ] %>%
pheatmap::pheatmap(cluster_rows = FALSE,
cluster_cols = FALSE,
show_colnames = FALSE,
show_rownames = FALSE,
annotation_row = rowAnno,
breaks = seq(from = -1, to = 1, length.out = 8),
annotation_colors = list(chr = c("1" = "darkgrey", "2" = "lightgrey"),
call = CUPiDcolsCNV),
annotation_col = colAnno,
gaps_col = colAnno %>% dplyr::pull(chr) %>% diff() %>% {which(. != 0)},
color = rev(RColorBrewer::brewer.pal(name = "RdBu", n = 7)),
filename = here("output/CNVplotCUP.pdf"),
height = 6,
width = 9,
)
CNVmatrixCUP %>%
as.data.frame() %>%
rownames_to_column("sample_name") %>%
write_csv(here("source_data/CNVmatrixCUP.csv"))
sessionInfo()
## R version 4.3.1 (2023-06-16)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Ventura 13.3
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: Europe/London
## tzcode source: internal
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] GenomicRanges_1.52.0 GenomeInfoDb_1.36.2 IRanges_2.34.1
## [4] S4Vectors_0.38.1 BiocGenerics_0.46.0 glue_1.7.0
## [7] DT_0.30 swimplot_1.2.0 ggbeeswarm_0.7.2
## [10] yardstick_1.3.0 forcats_1.0.0 readr_2.1.4
## [13] dplyr_1.1.4 stringr_1.5.1 tibble_3.2.1
## [16] ggplot2_3.4.4 purrr_1.0.2 tidyr_1.3.1
## [19] here_1.0.1
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.4 beeswarm_0.4.0 xfun_0.40
## [4] bslib_0.5.1 htmlwidgets_1.6.2 tzdb_0.4.0
## [7] bitops_1.0-7 vctrs_0.6.5 tools_4.3.1
## [10] crosstalk_1.2.0 generics_0.1.3 parallel_4.3.1
## [13] fansi_1.0.6 highr_0.10 pkgconfig_2.0.3
## [16] pheatmap_1.0.12 RColorBrewer_1.1-3 GenomeInfoDbData_1.2.10
## [19] lifecycle_1.0.4 compiler_4.3.1 farver_2.1.1
## [22] textshaping_0.3.6 munsell_0.5.0 vipor_0.4.5
## [25] htmltools_0.5.6 sass_0.4.7 RCurl_1.98-1.12
## [28] yaml_2.3.7 pillar_1.9.0 crayon_1.5.2
## [31] jquerylib_0.1.4 ellipsis_0.3.2 cachem_1.0.8
## [34] tidyselect_1.2.0 digest_0.6.33 stringi_1.8.3
## [37] rprojroot_2.0.3 fastmap_1.1.1 grid_4.3.1
## [40] colorspace_2.1-0 cli_3.6.2 magrittr_2.0.3
## [43] utf8_1.2.4 withr_3.0.0 scales_1.2.1
## [46] bit64_4.0.5 XVector_0.40.0 rmarkdown_2.24
## [49] bit_4.0.5 ragg_1.2.5 hms_1.1.3
## [52] evaluate_0.21 knitr_1.43 rlang_1.1.3
## [55] rstudioapi_0.15.0 vroom_1.6.3 jsonlite_1.8.7
## [58] R6_2.5.1 zlibbioc_1.46.0 systemfonts_1.0.4