start_time <- Sys.time()
quiet_library <- function(...) {
suppressPackageStartupMessages(library(...))
}
quiet_library(rhdf5)
quiet_library(BarMixer)
quiet_library(Matrix)
quiet_library(ggplot2)
quiet_library(cowplot)
Declaring start
stm("Starting .h5 split")
stm(paste0("Using BarMixer v", installed.packages()["BarMixer","Version"]))
in_h5 <- params$in_h5
in_hto <- params$in_hto
out_dir <- params$out_dir
stm(paste0("IN H5 file : ", in_h5))
stm(paste0("IN HTO results path : ", in_hto))
stm(paste0("OUT Directory : ", out_dir))
print(c(
paste0("IN H5 file : ", in_h5),
paste0("IN HTO results path : ", in_hto),
paste0("OUT Directory : ", out_dir)
))
## [1] "IN H5 file : /mnt/barware-manuscript/BarWare-demo/demo_output/W4-hashed-24k/rna_metadata/W4-hashed-24k.h5"
## [2] "IN HTO results path : /mnt/barware-manuscript/BarWare-demo/demo_output/W4-hashed-24k/hto_processed"
## [3] "OUT Directory : /mnt/barware-manuscript/BarWare-demo/demo_output/split_h5"
if(!file.exists(in_h5)) {
stm(paste0("ERROR: Cannot find IN H5 file:", in_h5))
stop()
} else {
stm(paste0("Found H5 file:", in_h5))
in_well <- sub(".h5$","",basename(in_h5))
}
in_mat <- list.files(in_hto, pattern = "hto_count_matrix", full.names = TRUE)
if(!file.exists(in_mat)) {
stm(paste0("ERROR: Cannot find HTO count matrix:", in_mat))
stop()
} else {
stm(paste0("Found HTO count matrix:", in_mat))
}
in_tbl <- list.files(in_hto, pattern = "hto_category_table", full.names = TRUE)
if(!file.exists(in_tbl)) {
stm(paste0("ERROR: Cannot find HTO category table:", in_tbl))
stop()
} else {
stm(paste0("Found HTO category table:", in_tbl))
}
stm(paste0("Loading HTO count matrix from ", in_mat))
hash_count_df <- fread(in_mat)
hash_count_mat <- as.matrix(hash_count_df[,-1])
rownames(hash_count_mat) <- hash_count_df[[1]]
rm(hash_count_df)
stm(paste0("Loading HTO category table from ", in_tbl))
hash_category_table <- fread(in_tbl)
hto_barcodes <- unique(hash_category_table$hto_barcode[hash_category_table$hto_category == "singlet"])
output_h5_files <- file.path(out_dir, paste0(in_well, "_",
c(hto_barcodes,"multiplet"),
".h5"))
output_messages <- paste0("OUT HDF5 : ", output_h5_files)
for(i in seq_along(output_messages)) {
stm(output_messages[i])
}
out_json <- file.path(out_dir, paste0(in_well, "_split_h5_metrics.json"))
json_message <- paste0(paste0("OUT JSON : ", out_json))
stm(json_message)
print(output_messages)
## [1] "OUT HDF5 : /mnt/barware-manuscript/BarWare-demo/demo_output/split_h5/W4-hashed-24k_AGTAAGTTCAGCGTA.h5"
## [2] "OUT HDF5 : /mnt/barware-manuscript/BarWare-demo/demo_output/split_h5/W4-hashed-24k_GGTTGCCAGATGTCA.h5"
## [3] "OUT HDF5 : /mnt/barware-manuscript/BarWare-demo/demo_output/split_h5/W4-hashed-24k_TGATGGCCTATTGGG.h5"
## [4] "OUT HDF5 : /mnt/barware-manuscript/BarWare-demo/demo_output/split_h5/W4-hashed-24k_AAGTATCGTTTCGCA.h5"
## [5] "OUT HDF5 : /mnt/barware-manuscript/BarWare-demo/demo_output/split_h5/W4-hashed-24k_TTCCGCCTCTCTTTG.h5"
## [6] "OUT HDF5 : /mnt/barware-manuscript/BarWare-demo/demo_output/split_h5/W4-hashed-24k_GTCAACTCTTTAGCG.h5"
## [7] "OUT HDF5 : /mnt/barware-manuscript/BarWare-demo/demo_output/split_h5/W4-hashed-24k_multiplet.h5"
print(json_message)
## [1] "OUT JSON : /mnt/barware-manuscript/BarWare-demo/demo_output/split_h5/W4-hashed-24k_split_h5_metrics.json"
stm(paste0("Loading HDF5 from ", in_h5))
h5_list <- h5dump(in_h5)
stm("Adding hash data to HDF5 results")
h5_list <- add_h5_list_hash_results(h5_list,
hash_category_table,
hash_count_mat,
match_target = "original_barcodes")
stm("Splitting HDF5 based on Hash Category Table")
split_h5_list <- split_h5_list_by_hash(h5_list)
for(i in seq_along(split_h5_list)) {
out_file <- file.path(out_dir, paste0(in_well, "_", names(split_h5_list)[i],".h5"))
stm(paste0("Writing HDF5 to ", out_file))
write_h5_list(split_h5_list[[i]],
h5_file = out_file,
overwrite = TRUE)
h5closeAll()
}
Extract metadata for plotting
stm("Generating tables and plots for report")
meta <- h5_list_cell_metadata(h5_list)
qc_list <- list(report_type = "split_h5_by_hash",
report_datetime = as.character(start_time),
report_uuid = ids::uuid(use_time = TRUE),
package = "BarMixer",
package_version = sessionInfo()$otherPkgs$BarMixer$Version,
well_id = in_well)
Metadata prep
meta <- as.data.table(meta)
if("sample_id" %in% names(meta)) {
meta$plot_barcode <- paste0(meta$hto_barcode,"\n",meta$sample_id)
} else {
meta$sample_id <- "no_sample_id"
meta$plot_barcode <- meta$hto_barcode
}
hto_order <- c("no_hash","singlet","doublet","multiplet")
hto_count_stats <- meta[, .(n_cells = nrow(.SD),
frac_cells = round(nrow(.SD)/nrow(meta), 4),
total_aligned_reads = sum(.SD$n_reads),
frac_all_aligned_reads = round(sum(.SD$n_reads)/sum(meta$n_reads), 4)),
by = hto_category]
hto_count_stats <- hto_count_stats[match(hto_order,hto_count_stats$hto_category),]
qc_list$hto_category_stats <- lapply(1:nrow(hto_count_stats),
function(x) {
list(n_cells = hto_count_stats$n_cells[x],
frac_cells = hto_count_stats$frac_cells[x],
total_aligned_reads = hto_count_stats$total_aligned_reads[x],
frac_all_aligned_reads = hto_count_stats$frac_all_aligned_reads[x])
})
names(qc_list$hto_category_stats) <- hto_count_stats$hto_category
qc_table(hto_count_stats)
hto_count_stats$fill <- c("black",
"darkblue",
"dodgerblue",
"mediumorchid4")
category_counts_barplot <- ggplot() +
geom_bar(data = hto_count_stats,
aes(x = as.factor(1:nrow(hto_count_stats)),
y = n_cells,
fill = fill),
stat = "identity") +
geom_text(data = hto_count_stats,
aes(x = as.factor(1:nrow(hto_count_stats)),
y = n_cells + 0.02 * max(n_cells),
label = n_cells),
hjust = 0.5,
vjust = 0) +
scale_fill_identity() +
scale_x_discrete("",
breaks = 1:nrow(hto_count_stats),
labels = hto_count_stats$hto_category) +
scale_y_continuous("N Cells", limits = c(0, 4e4)) +
theme_bw() +
theme(axis.text.x = element_text(angle = 90,
hjust = 0,
vjust = 0.3)) +
ggtitle("Cells per HTO Category")
suppressWarnings(
category_counts_barplot
)
plot_df <- hto_count_stats
plot_df$cells_max <- cumsum(plot_df$frac_cells)
plot_df$cells_min <- data.table::shift(plot_df$cells_max, type = "lag", fill = 0)
plot_df$reads_max <- cumsum(plot_df$frac_all_aligned_reads)
plot_df$reads_min <- data.table::shift(plot_df$reads_max, type = "lag", fill = 0)
hto_category_fraction_barplot <- ggplot() +
geom_rect(data = plot_df,
aes(xmin = reads_min,
xmax = reads_max,
ymin = 0.6,
ymax = 1.4,
fill = fill)) +
geom_rect(data = plot_df,
aes(xmin = cells_min,
xmax = cells_max,
ymin = 1.6,
ymax = 2.4,
fill = fill)) +
scale_fill_identity() +
scale_x_continuous("") +
scale_y_continuous("",
breaks = c(1, 2),
labels = c("Fraction of Reads",
"Fraction of Cell Barcodes")) +
theme_bw() +
ggtitle("Fraction of Cells and Reads per HTO Category")
suppressWarnings(
hto_category_fraction_barplot
)
singlet_meta <- as.data.table(meta[meta$hto_category == "singlet",])
bc_count_stats <- singlet_meta[, .(n_singlets = nrow(.SD),
frac_singlets = round(nrow(.SD)/nrow(singlet_meta), 4)),
by = list(sample_id,hto_barcode)]
qc_list$sample_singlet_stats <- lapply(1:nrow(bc_count_stats),
function(x) {
list(hto_barcode = bc_count_stats$hto_barcode[x],
n_singlets = bc_count_stats$n_singlets[x],
frac_singlets = bc_count_stats$frac_singlets[x])
})
names(qc_list$sample_singlet_stats) <- bc_count_stats$sample_id
qc_table(bc_count_stats)
bc_to_plot_bc <- data.frame(hto_barcode = unique(singlet_meta$hto_barcode),
plot_barcode = unique(singlet_meta$plot_barcode))
bc_count_stats <- bc_count_stats[bc_to_plot_bc,
on = "hto_barcode"]
ggplot() +
geom_bar(data = bc_count_stats,
aes(x = as.factor(plot_barcode),
y = n_singlets),
stat = "identity") +
geom_hline(aes(yintercept = median(bc_count_stats$n_singlets)),
linetype = "dashed") +
scale_x_discrete("") +
scale_y_continuous("N Cells",
limits = c(0, 4000)) +
theme_bw() +
theme(axis.text.x = element_text(angle = 90,
hjust = 1,
vjust = 0.3)) +
ggtitle("Cells per HTO Barcode")
hto_read_stats <- meta[, .(q25 = quantile(n_reads, 0.25),
median = quantile(n_reads, 0.50),
q75 = quantile(n_reads, 0.75)),
by = hto_category]
hto_read_stats <- hto_read_stats[match(hto_order,hto_read_stats$hto_category),]
for(category in hto_read_stats$hto_category) {
qc_list$hto_category_stats[[category]]$mapped_reads_per_cell_q25 <- hto_read_stats$q25[hto_read_stats$hto_category == category]
qc_list$hto_category_stats[[category]]$mapped_reads_per_cell_median <- hto_read_stats$median[hto_read_stats$hto_category == category]
qc_list$hto_category_stats[[category]]$mapped_reads_per_cell_q75 <- hto_read_stats$q75[hto_read_stats$hto_category == category]
}
qc_table(hto_read_stats)
bc_read_stats <- singlet_meta[, .(q25 = quantile(n_reads, 0.25),
median = quantile(n_reads, 0.50),
q75 = quantile(n_reads, 0.75)),
by = list(sample_id,hto_barcode)]
for(sample_id in bc_read_stats$sample_id) {
qc_list$sample_singlet_stats[[sample_id]]$mapped_reads_per_cell_q25 <- bc_read_stats$q25[bc_read_stats$sample_id == sample_id]
qc_list$sample_singlet_stats[[sample_id]]$mapped_reads_per_cell_median <- bc_read_stats$median[bc_read_stats$sample_id == sample_id]
qc_list$sample_singlet_stats[[sample_id]]$mapped_reads_per_cell_q75 <- bc_read_stats$q75[bc_read_stats$sample_id == sample_id]
}
qc_table(bc_read_stats)
category_reads_violins <- qc_violin_plot(meta,
category_x = "hto_category",
name_x = "HTO Category",
column_y = "n_reads",
name_y = "N Reads per Cell",
fill = "dodgerblue")
barcode_reads_violins <- qc_violin_plot(meta[meta$hto_category == "singlet",],
category_x = "plot_barcode",
name_x = "HTO Barcode (singlets)",
column_y = "n_reads",
name_y = "N Reads per Cell",
fill = "dodgerblue") +
ggtitle("Reads per Cell")
reads_violin_list <- list(category_reads_violins,
barcode_reads_violins)
suppressWarnings(
plot_grid(plotlist = reads_violin_list,
ncol = 2, rel_widths = c(1, 3),
nrow = 1, align = "h")
)
hto_umi_stats <- meta[, .(q25 = quantile(n_umis, 0.25),
median = quantile(n_umis, 0.50),
q75 = quantile(n_umis, 0.75)),
by = hto_category]
hto_umi_stats <- hto_umi_stats[match(hto_order,hto_umi_stats$hto_category),]
for(category in hto_umi_stats$hto_category) {
qc_list$hto_category_stats[[category]]$umis_per_cell_q25 <- hto_umi_stats$q25[hto_umi_stats$hto_category == category]
qc_list$hto_category_stats[[category]]$umis_per_cell_median <- hto_umi_stats$median[hto_umi_stats$hto_category == category]
qc_list$hto_category_stats[[category]]$umis_per_cell_q75 <- hto_umi_stats$q75[hto_umi_stats$hto_category == category]
}
qc_table(hto_umi_stats)
bc_umi_stats <- singlet_meta[, .(q25 = quantile(n_umis, 0.25),
median = quantile(n_umis, 0.50),
q75 = quantile(n_umis, 0.75)),
by = list(sample_id,hto_barcode)]
for(sample_id in bc_umi_stats$sample_id) {
qc_list$sample_singlet_stats[[sample_id]]$umis_per_cell_q25 <- bc_umi_stats$q25[bc_umi_stats$sample_id == sample_id]
qc_list$sample_singlet_stats[[sample_id]]$umis_per_cell_median <- bc_umi_stats$median[bc_umi_stats$sample_id == sample_id]
qc_list$sample_singlet_stats[[sample_id]]$umis_per_cell_q75 <- bc_umi_stats$q75[bc_umi_stats$sample_id == sample_id]
}
qc_table(bc_umi_stats)
category_umis_violins <- qc_violin_plot(meta,
category_x = "hto_category",
name_x = "HTO Category",
column_y = "n_umis",
name_y = "N UMIs per Cell",
fill = "purple")
barcode_umis_violins <- qc_violin_plot(meta[meta$hto_category == "singlet",],
category_x = "plot_barcode",
name_x = "HTO Barcode (singlets)",
column_y = "n_umis",
name_y = "N UMIs per Cell",
fill = "purple") +
ggtitle("UMIs per Cell")
umis_violin_list <- list(category_umis_violins,
barcode_umis_violins)
suppressWarnings(
plot_grid(plotlist = umis_violin_list,
ncol = 2, rel_widths = c(1, 3),
nrow = 1, align = "h")
)
hto_gene_stats <- meta[, .(q25 = quantile(n_genes, 0.25),
median = quantile(n_genes, 0.50),
q75 = quantile(n_genes, 0.75)),
by = hto_category]
hto_gene_stats <- hto_gene_stats[match(hto_order,hto_gene_stats$hto_category),]
for(category in hto_gene_stats$hto_category) {
qc_list$hto_category_stats[[category]]$genes_per_cell_q25 <- hto_gene_stats$q25[hto_gene_stats$hto_category == category]
qc_list$hto_category_stats[[category]]$genes_per_cell_median <- hto_gene_stats$median[hto_gene_stats$hto_category == category]
qc_list$hto_category_stats[[category]]$genes_per_cell_q75 <- hto_gene_stats$q75[hto_gene_stats$hto_category == category]
}
qc_table(hto_gene_stats)
bc_gene_stats <- singlet_meta[, .(q25 = quantile(n_genes, 0.25),
median = quantile(n_genes, 0.50),
q75 = quantile(n_genes, 0.75)),
by = list(sample_id,hto_barcode)]
for(sample_id in bc_gene_stats$sample_id) {
qc_list$sample_singlet_stats[[sample_id]]$genes_per_cell_q25 <- bc_gene_stats$q25[bc_gene_stats$sample_id == sample_id]
qc_list$sample_singlet_stats[[sample_id]]$genes_per_cell_median <- bc_gene_stats$median[bc_gene_stats$sample_id == sample_id]
qc_list$sample_singlet_stats[[sample_id]]$genes_per_cell_q75 <- bc_gene_stats$q75[bc_gene_stats$sample_id == sample_id]
}
qc_table(bc_gene_stats)
category_genes_violins <- qc_violin_plot(meta,
category_x = "hto_category",
name_x = "HTO Category",
column_y = "n_genes",
name_y = "N Genes per Cell",
fill = "orangered")
barcode_genes_violins <- qc_violin_plot(meta[meta$hto_category == "singlet",],
category_x = "plot_barcode",
name_x = "HTO Barcode (singlets)",
column_y = "n_genes",
name_y = "N Genes per Cell",
fill = "orangered") +
ggtitle("Genes per Cell")
genes_violin_list <- list(category_genes_violins,
barcode_genes_violins)
suppressWarnings(
plot_grid(plotlist = genes_violin_list,
ncol = 2, rel_widths = c(1, 3),
nrow = 1, align = "h")
)
stm(paste0("Writing JSON to ", out_json))
qc_list_json <- jsonlite::toJSON(qc_list,
auto_unbox = TRUE,
pretty = TRUE)
writeLines(qc_list_json,
out_json)
sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Debian GNU/Linux 9 (stretch)
##
## Matrix products: default
## BLAS: /usr/lib/libblas/libblas.so.3.7.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.7.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] cowplot_1.0.0 ggplot2_3.3.0 BarMixer_1.0.1 Matrix_1.2-18
## [5] data.table_1.13.0 rhdf5_2.28.1 optparse_1.6.6
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.1.0 xfun_0.12 purrr_0.3.4 lattice_0.20-40
## [5] colorspace_1.4-1 vctrs_0.3.4 generics_0.0.2 htmltools_0.5.1.1
## [9] getopt_1.20.3 yaml_2.2.1 rlang_0.4.8 R.oo_1.24.0
## [13] pillar_1.4.6 later_1.0.0 glue_1.4.2 withr_2.1.2
## [17] R.utils_2.10.1 uuid_0.1-4 lifecycle_0.2.0 stringr_1.4.0
## [21] munsell_0.5.0 gtable_0.3.0 R.methodsS3_1.8.1 htmlwidgets_1.5.1
## [25] evaluate_0.14 labeling_0.3 knitr_1.28 fastmap_1.0.1
## [29] httpuv_1.5.2 crosstalk_1.0.0 Rcpp_1.0.3 xtable_1.8-4
## [33] scales_1.1.0 promises_1.1.0 DT_0.12 jsonlite_1.6.1
## [37] mime_0.9 farver_2.0.3 ids_1.0.1 digest_0.6.25
## [41] stringi_1.4.6 dplyr_1.0.2 shiny_1.4.0 grid_3.6.3
## [45] tools_3.6.3 magrittr_1.5 tibble_3.0.4 crayon_1.3.4
## [49] pkgconfig_2.0.3 ellipsis_0.3.1 assertthat_0.2.1 rmarkdown_2.1
## [53] Rhdf5lib_1.6.3 R6_2.4.1 compiler_3.6.3
Total time elapsed
end_time <- Sys.time()
diff_time <- end_time - start_time
time_message <- paste0("Elapsed Time: ",
round(diff_time, 3),
" ", units(diff_time))
print(time_message)
## [1] "Elapsed Time: 1.07 mins"
stm(time_message)
stm("H5 split process complete.")