here::i_am("figure_generation/cfDNA_summary.Rmd")
library(here)
library(tidyr)
library(purrr)
library(ggplot2)
library(tibble)
library(stringr)
library(dplyr)
library(readr)
library(forcats)
library(ggbeeswarm)
library(RColorBrewer)
dir.create(here("output"), showWarnings = FALSE)

Introduction

This file plots some metrics for the cfDNA samples, without involving the CUPiD predictions. First we read in the output of the Nextflow pipeline, included in the git repo and called cfDNA_qseaSummary.csv. This contains 211 samples in total, with 143 patients with known tumour types, 27 non-cancer controls and 41 patients with CUP.

CUPiDcols <- read_rds(here("figure_inputs", "CUPiD_colour_palette.rds"))

qseaSummary <- read_csv(here("figure_inputs", "cfDNA_qseaSummary.csv"))

Independent test cohort

Plot number of samples per class

colsToUse <- qseaSummary %>% 
  pull(correct_class) %>% 
  na.omit() %>% 
  unique() %>% 
  sort() %>% 
  setdiff("CUP")

qseaSummary %>%
  distinct(sample_name, correct_class) %>%
  filter(correct_class != "CUP") %>%
  ggplot(aes(y = fct_reorder(correct_class,correct_class, .fun = length), fill = correct_class)) +
  geom_bar() +
  scale_fill_manual(values = CUPiDcols[colsToUse]) +
  theme_bw() +
  geom_text(
    aes(label = after_stat(count)),
    stat = 'count',
    nudge_x = 2,
    size = 5
  ) +
  labs(x = "Count",
       y = "Class") + 
  xlim(c(0,38))

ggsave(here("output/cfDNA_n.pdf"), width = 4, height = 4)

Plot ichor tumour fraction averages

qseaSummary %>%
  filter(correct_class != "CUP") %>%
  mutate(ichorTumourFraction = 100*ichorTumourFraction) %>%
  distinct(sample_name, correct_class, ichorTumourFraction) %>%
  ggplot(aes(y = fct_reorder(correct_class, ichorTumourFraction),
             x = ichorTumourFraction,
             col = correct_class)) +
  geom_boxplot(outlier.shape = NA) +
  geom_quasirandom() +
  geom_vline(xintercept = 3, linetype = "dotted") +
  scale_colour_manual(values = CUPiDcols[colsToUse]) +
  scale_x_log10() +
  labs(y = "Tumour Class",
       x = "Estimated Tumour Fraction (%, log scale)",
       title = "IchorCNA estimated tumour fraction",
       subtitle = "Cutoff of 3% shown") +
  theme_bw()

ggsave(here("output/ichorTumourFraction.pdf"), width = 5, height = 5)

Plot the relative enrichment

newOrder <- qseaSummary %>%
  distinct(sample_name, correct_class, ichorTumourFraction, valid_fragments, hyperStableEdgar, relH_MeCap = relH, relH_Input = input_relH) %>%
  group_by(correct_class) %>%
  summarise(median = median(relH_MeCap)) %>%
  arrange(median) %>%
  pull(correct_class)

qseaSummary %>%
  filter(correct_class != "CUP") %>%
  distinct(sample_name, correct_class, ichorTumourFraction, valid_fragments, hyperStableEdgar, relH_MeCap = relH, relH_Input = input_relH) %>%
  pivot_longer(matches("relH"), names_to = "enrich", values_to = "relH", names_prefix = "relH_") %>%
  mutate(enrich = ifelse(enrich == "MeCap","Enriched","Non-enriched")) %>%
  mutate(enrich = factor(enrich, levels = c("Enriched","Non-enriched") ),
         correct_class = factor(correct_class, levels = newOrder)) %>%
  ggplot(aes(y = correct_class, x = relH, col = enrich)) +
  geom_boxplot(outlier.shape = NA, position = "identity") +
  geom_quasirandom() +
  theme_bw() +
  #cbctools::scale_color_cruk() +
  labs(y = "Class",
       x = "Relative Enrichment (relH)",
       colour = "Fraction")

ggsave(here("output/cfDNA_relH.pdf"), width = 4, height = 4)

Export table of metrics

qseaSummary %>%
  dplyr::select(sample_name, correct_class, age, sex, source, "cfDNA yield (ng/mL)" = cfDNA_yield, "DNA input (ng)" = DNA_input,
                "ichorCNA Tumour Fraction" = ichorTumourFraction, "Hyper-stable Fraction" = hyperStableEdgar,
                "Initial Reads" = raw_total_sequences, "Mapped Reads" = reads_mapped,
                "Deduplicated Reads" = reads_post_dedup, "Total Fragments in QSEA" = total_fragments,
                "Final Valid Fragments" = valid_fragments,
                "enrichment score (relH)" = relH, "enrichment score (GoGe)" = GoGe,
                "Initial Reads (non-enriched)" = input_raw_total_sequences, "Mapped Reads (non-enriched)" = input_reads_mapped,
                "Deduplicated Reads (non-enriched)" = input_reads_post_dedup, "Total Fragments in QSEA (non-enriched)" = input_total_fragments,
                "Final Valid Fragments (non-enriched)" = input_valid_fragments,
                "enrichment score (relH), non-enriched" = input_relH,
                "enrichment score (GoGe), non-enriched" = input_GoGe) %>%
  write_csv(here("output/cfDNA_methylation_metrics.csv"))

CNV plot in the Validation Cohort

CNVmatrix <- read_rds(here("figure_inputs/CNV_knownTypes.rds")) %>%
  as_tibble() %>%
  dplyr::select(-tidyselect::matches("width|strand")) %>%
  dplyr::mutate(window = paste0(seqnames, ":",start, "-",end)) %>%
  tibble::remove_rownames() %>%
  tibble::column_to_rownames("window") %>%
  dplyr::select(-seqnames, -start, -end) %>%
  as.matrix() %>%
  t()

colAnno <- CNVmatrix %>%
  colnames() %>%
  enframe() %>%
  dplyr::select(value) %>%
  mutate(chr = as.numeric(str_remove(value,":.*")) %% 2 ) %>%
  column_to_rownames("value")

rowAnno <- qseaSummary %>%
  filter(correct_class != "CUP", correct_class != "NCC") %>%
  dplyr::select(sample_name, correct_class, tumourFraction = ichorTumourFraction)

avgTFs <- rowAnno %>%
  group_by(correct_class) %>%
  summarise(median = median(tumourFraction)) %>%
  arrange(desc(median))

rowAnno <- rowAnno %>%
  mutate(correct_class = factor(correct_class, levels = avgTFs$correct_class)) %>%
  dplyr::arrange(correct_class, desc(tumourFraction)) %>%
  column_to_rownames("sample_name")

CNVmatrix[rownames(rowAnno), ] %>%
  pheatmap::pheatmap(cluster_rows = FALSE,
                     cluster_cols = FALSE,
                     show_colnames = FALSE,
                     show_rownames = FALSE,
                     annotation_row = rowAnno,
                     breaks = seq(from = -1, to = 1, length.out = 8),
                     annotation_colors = list(chr = c("1" = "darkgrey", "2" = "lightgrey"),
                                              correct_class = CUPiDcols[colsToUse]),
                     annotation_col = colAnno,
                     gaps_row = rowAnno %>% dplyr::pull(correct_class) %>% as.character() %>% rle() %>% {.$lengths} %>% cumsum(),
                     gaps_col = colAnno %>% dplyr::pull(chr) %>% diff() %>% {which(. != 0)},
                     color = rev(RColorBrewer::brewer.pal(name = "RdBu", n = 7)),
                     filename = here("output/CNVplot.pdf"),
                     height = 8.5,
                     width = 9.5,
  )

dir.create(here("source_data"), showWarnings = FALSE)

CNVmatrix %>% 
  as.data.frame() %>% 
  write_csv(here("source_data/CNVmatrix.csv"))

CUP samples

Plot the relative enrichment

qseaSummary %>%
  filter(correct_class == "CUP") %>%
  distinct(sample_name, correct_class, ichorTumourFraction, valid_fragments, hyperStableEdgar, relH_MeCap = relH, relH_Input = input_relH) %>%
  pivot_longer(matches("relH"), names_to = "enrich", values_to = "relH", names_prefix = "relH_") %>%
  mutate(enrich = ifelse(enrich == "MeCap","Enriched","Non-enriched")) %>%
  mutate(enrich = factor(enrich, levels = c("Enriched","Non-enriched") )) %>%
  ggplot(aes(y = relH, x = enrich, col = enrich)) +
  geom_boxplot(outlier.shape = NA, position = "identity") +
  #geom_jitter(height = 0, width = 0.2) +
  geom_quasirandom() +
  theme_bw() +
  #cbctools::scale_color_cruk() +
  labs(x = "Fraction",
       y = "Relative Enrichment (relH)",
       colour = "Fraction")

ggsave(here("output/CUP_relH.pdf"), width = 4, height = 4)

Export metrics

qseaSummary %>%
  filter(correct_class == "CUP") %>%
  dplyr::select(sample_name, correct_class, age, sex, source, "cfDNA yield (ng/mL)" = cfDNA_yield, "DNA input (ng)" = DNA_input,
                "ichorCNA Tumour Fraction" = ichorTumourFraction, "Hyper-stable Fraction" = hyperStableEdgar,
                "Initial Reads" = raw_total_sequences, "Mapped Reads" = reads_mapped,
                "Deduplicated Reads" = reads_post_dedup, "Total Fragments in QSEA" = total_fragments,
                "Final Valid Fragments" = valid_fragments,
                "enrichment score (relH)" = relH, "enrichment score (GoGe)" = GoGe,
                "Initial Reads (non-enriched)" = input_raw_total_sequences, "Mapped Reads (non-enriched)" = input_reads_mapped,
                "Deduplicated Reads (non-enriched)" = input_reads_post_dedup, "Total Fragments in QSEA (non-enriched)" = input_total_fragments,
                "Final Valid Fragments (non-enriched)" = input_valid_fragments,
                "enrichment score (relH), non-enriched" = input_relH,
                "enrichment score (GoGe), non-enriched" = input_GoGe) %>%
  write_csv(here("output/CUP_methylation_metrics.csv"))

Summary information for the 79 NCC samples used to generate the classifier

NCC01summary <- read_csv(here("figure_inputs/NCC01_qseaSummary.csv"))

NCC01summary %>%
  dplyr::select(sample_name, age, sex, source,  "cfDNA yield (ng/mL)" = cfDNA_yield, "DNA input (ng)" = DNA_input,
                "ichorCNA Tumour Fraction" = ichorTumourFraction, "Hyper-stable Fraction" = hyperStableEdgar,
                "Initial Reads" = raw_total_sequences, "Mapped Reads" = reads_mapped,
                "Deduplicated Reads" = reads_post_dedup, "Total Fragments in QSEA" = total_fragments,
                "Final Valid Fragments" = valid_fragments,
                "enrichment score (relH)" = relH, "enrichment score (GoGe)" = GoGe,
                "Initial Reads (non-enriched)" = input_raw_total_sequences, "Mapped Reads (non-enriched)" = input_reads_mapped,
                "Deduplicated Reads (non-enriched)" = input_reads_post_dedup, "Total Fragments in QSEA (non-enriched)" = input_total_fragments,
                "Final Valid Fragments (non-enriched)" = input_valid_fragments,
                "enrichment score (relH), non-enriched" = input_relH,
                "enrichment score (GoGe), non-enriched" = input_GoGe) %>%
  write_csv(here("output/NCC01_methylation_metrics.csv"))
sessionInfo()
## R version 4.3.1 (2023-06-16)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Ventura 13.3
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: Europe/London
## tzcode source: internal
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] GenomicRanges_1.52.0 GenomeInfoDb_1.36.2  IRanges_2.34.1      
##  [4] S4Vectors_0.38.1     BiocGenerics_0.46.0  RColorBrewer_1.1-3  
##  [7] ggbeeswarm_0.7.2     forcats_1.0.0        readr_2.1.4         
## [10] dplyr_1.1.4          stringr_1.5.1        tibble_3.2.1        
## [13] ggplot2_3.4.4        purrr_1.0.2          tidyr_1.3.1         
## [16] here_1.0.1          
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.4            beeswarm_0.4.0          xfun_0.40              
##  [4] bslib_0.5.1             tzdb_0.4.0              vctrs_0.6.5            
##  [7] tools_4.3.1             bitops_1.0-7            generics_0.1.3         
## [10] parallel_4.3.1          fansi_1.0.6             highr_0.10             
## [13] pkgconfig_2.0.3         pheatmap_1.0.12         lifecycle_1.0.4        
## [16] GenomeInfoDbData_1.2.10 compiler_4.3.1          farver_2.1.1           
## [19] textshaping_0.3.6       munsell_0.5.0           vipor_0.4.5            
## [22] htmltools_0.5.6         sass_0.4.7              RCurl_1.98-1.12        
## [25] yaml_2.3.7              pillar_1.9.0            crayon_1.5.2           
## [28] jquerylib_0.1.4         cachem_1.0.8            tidyselect_1.2.0       
## [31] digest_0.6.33           stringi_1.8.3           labeling_0.4.3         
## [34] rprojroot_2.0.3         fastmap_1.1.1           grid_4.3.1             
## [37] colorspace_2.1-0        cli_3.6.2               magrittr_2.0.3         
## [40] utf8_1.2.4              withr_3.0.0             scales_1.2.1           
## [43] bit64_4.0.5             rmarkdown_2.24          XVector_0.40.0         
## [46] bit_4.0.5               ragg_1.2.5              hms_1.1.3              
## [49] evaluate_0.21           knitr_1.43              rlang_1.1.3            
## [52] glue_1.7.0              rstudioapi_0.15.0       vroom_1.6.3            
## [55] jsonlite_1.8.7          R6_2.5.1                zlibbioc_1.46.0        
## [58] systemfonts_1.0.4