start_time <- Sys.time()
quiet_library <- function(...) {
suppressPackageStartupMessages(library(...))
}
quiet_library(rhdf5)
quiet_library(BarMixer)
quiet_library(Matrix)
quiet_library(ggplot2)
quiet_library(cowplot)
Declaring start
stm("Merging .h5 files")
stm(paste0("Using BarMixer v", installed.packages()["BarMixer","Version"]))
in_dir <- params$in_dir
out_dir <- params$out_dir
stm(paste0("IN H5 directory : ", in_dir))
stm(paste0("OUT Directory : ", out_dir))
print(c(
paste0("IN H5 directory : ", in_dir),
paste0("OUT Directory : ", out_dir)
))
## [1] "IN H5 directory : /mnt/barware-manuscript/BarWare-demo/demo_output/split_h5"
## [2] "OUT Directory : /mnt/barware-manuscript/BarWare-demo/demo_output/merged_h5"
in_file_names <- list.files(in_dir,
pattern = "_.+h5$")
# Check for per-well directories
if(length(in_file_names) == 0) {
in_file_dirs <- list.dirs(in_dir,
recursive = FALSE)
in_file_names <- unlist(lapply(in_file_dirs,
list.files,
pattern = "_.+h5$"))
in_file_paths <- unlist(lapply(in_file_dirs,
list.files,
pattern = "_.+h5$",
full.names = TRUE))
} else {
in_file_paths <- list.files(in_dir,
pattern = "_.+h5$",
full.names = TRUE)
}
stm(paste0("Found ", length(in_file_paths), " .h5 files for processing"))
print(paste0("Found ", length(in_file_paths), " .h5 files for processing"))
## [1] "Found 42 .h5 files for processing"
in_file_hashes <- sub(".h5$","",sub(".+_","",in_file_names))
unique_hashes <- unique(in_file_hashes)
stm(paste0("Identified ", length(unique_hashes), " hash patterns: ", paste(unique_hashes, collapse = ", ")))
meta_list <- list()
for(i in seq_along(unique_hashes)) {
current_hash <- unique_hashes[i]
stm(paste0("Merging for hash ", current_hash))
current_files <- in_file_paths[in_file_hashes == current_hash]
for(f in current_files) {
if(file.exists(f)) {
stm(paste0("IN H5 file : ", f))
} else {
stm(paste0("MISSING H5 file : ", f))
current_files <- current_files[current_files != f]
}
}
if(length(current_files) > 1) {
h5_ll <- lapply(current_files,
h5dump)
h5_list <- reduce_h5_list(h5_ll)
} else {
h5_list <- h5dump(current_files)
}
meta_list[[current_hash]] <- h5_list_cell_metadata(h5_list)
if("pool_id" %in% names(meta_list[[current_hash]])) {
current_pool <- meta_list[[current_hash]]$pool_id[1]
} else {
current_pool <- "merged"
}
if(current_hash == "multiplet") {
out_file <- file.path(out_dir,
paste0(current_pool, "_",
current_hash, ".h5"))
} else {
current_sample_id <- meta_list[[current_hash]]$sample_id[1]
out_file <- file.path(out_dir,
paste0(current_pool, "_",
current_sample_id, ".h5"))
}
stm(paste0("Writing HDF5 to : ", out_file))
write_h5_list(h5_list,
out_file,
overwrite = TRUE)
}
## Warning in BarMixer::write_h5_list(h5_list[[h5_name]], h5_file = h5_file, : NAs
## introduced by coercion to integer range
## Warning in BarMixer::write_h5_list(h5_list[[h5_name]], h5_file = h5_file, : NAs
## introduced by coercion to integer range
## Warning in BarMixer::write_h5_list(h5_list[[h5_name]], h5_file = h5_file, : NAs
## introduced by coercion to integer range
## Warning in BarMixer::write_h5_list(h5_list[[h5_name]], h5_file = h5_file, : NAs
## introduced by coercion to integer range
## Warning in BarMixer::write_h5_list(h5_list[[h5_name]], h5_file = h5_file, : NAs
## introduced by coercion to integer range
## Warning in BarMixer::write_h5_list(h5_list[[h5_name]], h5_file = h5_file, : NAs
## introduced by coercion to integer range
## Warning in BarMixer::write_h5_list(h5_list[[h5_name]], h5_file = h5_file, : NAs
## introduced by coercion to integer range
stm("Generating QC Metrics and Plots for Report")
all_meta <- do.call("rbind", meta_list)
all_meta <- as.data.table(all_meta)
if("sample_id" %in% names(all_meta)) {
all_meta$plot_barcode <- paste0(all_meta$hto_barcode,
"\n",
all_meta$sample_id)
hto_barcode_to_sample_id <- data.frame(hto_barcode = unique(all_meta$hto_barcode),
sample_id = unique(all_meta$sample_id))
} else {
all_meta$sample_id <- "No PBMC ID"
all_meta$plot_barcode <- all_meta$hto_barcode
hto_barcode_to_sample_id <- data.frame(sample_id = "No PBMC ID",
hto_barcode = unique(all_meta$hto_barcode))
}
singlet_meta <- all_meta[hto_category == "singlet"]
hto_barcode_to_sample_id <- as.data.table(hto_barcode_to_sample_id)
qc_list <- list(report_type = "merge_h5_by_hash",
report_datetime = as.character(start_time),
report_uuid = ids::uuid(use_time = TRUE),
package = "BarMixer",
package_version = sessionInfo()$otherPkgs$BarMixer$Version)
out_json <- file.path(out_dir, paste0(current_pool, "_final_h5_metrics.json"))
stm(paste0("OUT JSON : ", out_json))
print(paste0("OUT JSON : ", out_json))
## [1] "OUT JSON : /mnt/barware-manuscript/BarWare-demo/demo_output/merged_h5/merged_final_h5_metrics.json"
overview_metrics <- all_meta[,.(n_cells = nrow(.SD),
median_reads = median(n_reads),
median_umis = median(n_umis),
median_genes = median(n_genes))]
qc_list$pool_stats <- as.list(overview_metrics)
qc_table(overview_metrics)
hto_category_metrics <- all_meta[,.(n_cells = nrow(.SD),
median_reads = as.numeric(median(n_reads)),
median_umis = as.numeric(median(n_umis)),
median_genes = as.numeric(median(n_genes))),
by = hto_category]
qc_list$hto_category_stats <- lapply(1:nrow(hto_category_metrics),
function(x) {
list(n_cells = hto_category_metrics$n_cells[x],
median_reads = hto_category_metrics$median_reads[x],
median_umis = hto_category_metrics$median_umis[x],
median_genes = hto_category_metrics$median_genes[x])
})
names(qc_list$hto_category_stats) <- hto_category_metrics$hto_category
qc_table(hto_category_metrics)
well_metrics <- all_meta[,.(n_cells = nrow(.SD),
median_reads = as.numeric(median(n_reads)),
median_umis = as.numeric(median(n_umis)),
median_genes = as.numeric(median(n_genes))),
by = well_id]
for(well in well_metrics$well_id) {
qc_list$well_stats[[well]] <- list(n_cells = well_metrics$n_cells[well_metrics$well_id == well],
median_reads = well_metrics$median_reads[well_metrics$well_id == well],
median_umis = well_metrics$median_umis[well_metrics$well_id == well],
median_genes = well_metrics$median_genes[well_metrics$well_id == well])
}
knitr::kable(well_metrics,
caption = "Well Metrics Overview")
well_id | n_cells | median_reads | median_umis | median_genes |
---|---|---|---|---|
W3-hashed-16k | 9711 | 21609.0 | 7009 | 1940 |
W4-hashed-24k | 14478 | 21342.5 | 7057 | 1955 |
W5-hashed-32k | 18823 | 22682.0 | 7253 | 1984 |
W6-hashed-48k | 27155 | 24720.0 | 7425 | 2023 |
W7-hashed-64k | 33449 | 25384.0 | 7685 | 2052 |
W8-hashed-80k | 37541 | 29908.0 | 8058 | 2151 |
qc_violin_plot(all_meta,
category_x = "well_id",
name_x = "Well ID",
column_y = "n_reads",
name_y = "N Reads per Cell",
fill = "dodgerblue") +
ggtitle("Reads per Cell")
## Warning: Removed 31 rows containing non-finite values (stat_ydensity).
qc_violin_plot(all_meta,
category_x = "well_id",
name_x = "Well ID",
column_y = "n_umis",
name_y = "N UMIs per Cell",
fill = "purple") +
ggtitle("UMIs per Cell")
qc_violin_plot(all_meta,
category_x = "well_id",
name_x = "Well ID",
column_y = "n_genes",
name_y = "N Genes per Cell",
fill = "orangered") +
ggtitle("Genes per Cell")
## Warning: Removed 859 rows containing non-finite values (stat_ydensity).
hto_category_by_well <- all_meta[,
.(n_cells = nrow(.SD)),
by = list(well_id, hto_category)]
hto_category_count_by_well <- dcast(hto_category_by_well,
formula = hto_category ~ well_id,
value.var = "n_cells")
for(well in names(hto_category_count_by_well)[-1]) {
hto_cat_list <- as.list(hto_category_count_by_well[[well]])
names(hto_cat_list) <- paste0("n_",hto_category_count_by_well$hto_category)
qc_list$well_stats[[well]] <- c(qc_list$well_stats[[well]], hto_cat_list)
}
qc_table(hto_category_count_by_well)
qc_aligned_barplot(all_meta,
category_x = "well_id",
name_x = "Well ID",
category_y = "hto_category",
category_name = "HTO Category",
colorset_y = "varibow",
name_y = "N Cells",
padding = 0.2) +
ggtitle("HTO Category Counts per Well")
## Warning: `expand_scale()` is deprecated; use `expansion()` instead.
hto_category_by_well <- hto_category_by_well[,
frac_cells := round(n_cells/sum(n_cells),4),
by = well_id]
hto_category_frac_by_well <- dcast(hto_category_by_well,
formula = hto_category ~ well_id,
value.var = "frac_cells")
qc_table(hto_category_frac_by_well)
qc_stacked_barplot(all_meta,
category_x = "well_id",
name_x = "Well ID",
category_y = "hto_category",
category_name = "HTO Category",
colorset_y = "varibow",
name_y = "Fraction of Cells",
as_fraction = TRUE) +
ggtitle("HTO Category Fraction per Well")
hto_barcode_by_well <- singlet_meta[,
.(n_cells = nrow(.SD)),
by = list(well_id, hto_barcode)]
hto_barcode_count_by_well <- dcast(hto_barcode_by_well,
formula = hto_barcode ~ well_id,
value.var = "n_cells")
hto_barcode_count_by_well <- hto_barcode_to_sample_id[hto_barcode_count_by_well,
on = "hto_barcode"]
for(well in names(hto_category_count_by_well)[-1]) {
sample_list <- as.list(hto_barcode_count_by_well[[well]])
names(sample_list) <- hto_barcode_count_by_well$sample_id
qc_list$well_stats[[well]]$sample_singlet_counts <- sample_list
}
for(sample_id in hto_barcode_count_by_well$sample_id) {
sample_row <- hto_barcode_count_by_well[hto_barcode_count_by_well$sample_id == sample_id, ,drop = FALSE]
sample_row <- sample_row[,-c(1,2)]
qc_list$sample_stats[[sample_id]]$well_counts <- as.list(sample_row)
}
qc_table(hto_barcode_count_by_well)
qc_aligned_barplot(singlet_meta,
category_x = "well_id",
name_x = "Well ID",
category_y = "plot_barcode",
category_name = "HTO Barcode",
colorset_y = "varibow",
name_y = "N Cells",
padding = 0.2) +
ggtitle("HTO Barcode Distribution in Wells")
## Warning: `expand_scale()` is deprecated; use `expansion()` instead.
qc_aligned_barplot(singlet_meta,
category_x = "plot_barcode",
name_x = "HTO Barcode",
category_y = "well_id",
category_name = "Well ID",
colorset_y = "varibow",
name_y = "N Cells",
padding = 0.2) +
ggtitle("Well Distribution in HTO Barcodes")
## Warning: `expand_scale()` is deprecated; use `expansion()` instead.
hto_barcode_by_well <- hto_barcode_by_well[,
frac_cells := round(n_cells/sum(n_cells),4),
by = well_id]
hto_barcode_frac_by_well <- dcast(hto_barcode_by_well,
formula = hto_barcode ~ well_id,
value.var = "frac_cells")
hto_barcode_frac_by_well <- hto_barcode_to_sample_id[hto_barcode_frac_by_well,
on = "hto_barcode"]
qc_table(hto_barcode_frac_by_well)
qc_stacked_barplot(singlet_meta,
category_x = "well_id",
name_x = "Well ID",
category_y = "plot_barcode",
category_name = "HTO Barcode",
colorset_y = "varibow",
name_y = "Fraction of Cells",
as_fraction = TRUE) +
ggtitle("HTO Barcode Fraction per Well")
qc_stacked_barplot(singlet_meta,
category_x = "plot_barcode",
category_y = "well_id",
category_name = "Well ID",
name_x = "HTO Barcode",
colorset_y = "varibow",
name_y = "Fraction of Cells",
as_fraction = TRUE) +
ggtitle("Well Fraction per HTO Barcode")
hto_barcode_metrics <- singlet_meta[,.(n_cells = nrow(.SD),
median_reads = as.numeric(median(n_reads)),
median_umis = as.numeric(median(n_umis)),
median_genes = as.numeric(median(n_genes))),
by = hto_barcode]
hto_barcode_metrics <- hto_barcode_to_sample_id[hto_barcode_metrics,
on = "hto_barcode"]
for(sample_id in hto_barcode_metrics$sample_id) {
sample_row <- hto_barcode_metrics[hto_barcode_metrics$sample_id == sample_id, , drop = FALSE]
qc_list$sample_stats[[sample_id]] <- as.list(sample_row[, -2])
}
qc_table(hto_barcode_metrics)
category_reads_violins <- qc_violin_plot(all_meta,
category_x = "hto_category",
name_x = "HTO Category",
column_y = "n_reads",
name_y = "N Reads per Cell",
fill = "dodgerblue")
barcode_reads_violins <- qc_violin_plot(all_meta[all_meta$hto_category == "singlet",],
category_x = "plot_barcode",
name_x = "HTO Barcode (singlets)",
column_y = "n_reads",
name_y = "N Reads per Cell",
fill = "dodgerblue") +
ggtitle("Reads per Cell")
reads_violin_list <- list(category_reads_violins,
barcode_reads_violins)
plot_grid(plotlist = reads_violin_list,
ncol = 2, rel_widths = c(1, 3),
nrow = 1, align = "h")
## Warning: Removed 31 rows containing non-finite values (stat_ydensity).
## Warning: Removed 15 rows containing non-finite values (stat_ydensity).
reads_hto_barcode_by_well <- singlet_meta[,
.(median_reads = median(n_reads)),
by = list(well_id, hto_barcode)]
reads_hto_barcode_by_well <- dcast(reads_hto_barcode_by_well,
formula = hto_barcode ~ well_id,
value.var = "median_reads")
reads_hto_barcode_by_well <- hto_barcode_to_sample_id[reads_hto_barcode_by_well,
on = "hto_barcode"]
for(well in names(reads_hto_barcode_by_well)[-c(1,2)]) {
sample_list <- as.list(reads_hto_barcode_by_well[[well]])
names(sample_list) <- reads_hto_barcode_by_well$sample_id
qc_list$well_stats[[well]]$sample_mapped_reads_median <- sample_list
}
for(sample_id in reads_hto_barcode_by_well$sample_id) {
sample_row <- reads_hto_barcode_by_well[reads_hto_barcode_by_well$sample_id == sample_id, ,drop = FALSE]
sample_row <- sample_row[,-c(1,2)]
qc_list$sample_stats[[sample_id]]$well_mapped_reads_median <- as.list(sample_row)
}
qc_table(reads_hto_barcode_by_well)
category_umis_violins <- qc_violin_plot(all_meta,
category_x = "hto_category",
name_x = "HTO Category",
column_y = "n_umis",
name_y = "N UMIs per Cell",
fill = "purple")
barcode_umis_violins <- qc_violin_plot(all_meta[all_meta$hto_category == "singlet",],
category_x = "plot_barcode",
name_x = "HTO Barcode (singlets)",
column_y = "n_umis",
name_y = "N UMIs per Cell",
fill = "purple") +
ggtitle("UMIs per Cell")
umis_violin_list <- list(category_umis_violins,
barcode_umis_violins)
plot_grid(plotlist = umis_violin_list,
ncol = 2, rel_widths = c(1, 3),
nrow = 1, align = "h")
umis_hto_barcode_by_well <- singlet_meta[,
.(median_umis = median(n_umis)),
by = list(well_id, hto_barcode)]
umis_hto_barcode_by_well <- dcast(umis_hto_barcode_by_well,
formula = hto_barcode ~ well_id,
value.var = "median_umis")
umis_hto_barcode_by_well <- hto_barcode_to_sample_id[umis_hto_barcode_by_well,
on = "hto_barcode"]
for(well in names(umis_hto_barcode_by_well)[-c(1,2)]) {
sample_list <- as.list(umis_hto_barcode_by_well[[well]])
names(sample_list) <- umis_hto_barcode_by_well$sample_id
qc_list$well_stats[[well]]$sample_umis_median <- sample_list
}
for(sample_id in umis_hto_barcode_by_well$sample_id) {
sample_row <- umis_hto_barcode_by_well[umis_hto_barcode_by_well$sample_id == sample_id, ,drop = FALSE]
sample_row <- sample_row[,-c(1,2)]
qc_list$sample_stats[[sample_id]]$well_umis_median <- as.list(sample_row)
}
qc_table(umis_hto_barcode_by_well)
category_genes_violins <- qc_violin_plot(all_meta,
category_x = "hto_category",
name_x = "HTO Category",
column_y = "n_genes",
name_y = "N Genes per Cell",
fill = "orangered")
barcode_genes_violins <- qc_violin_plot(all_meta[all_meta$hto_category == "singlet",],
category_x = "plot_barcode",
name_x = "HTO Barcode (singlets)",
column_y = "n_genes",
name_y = "N Genes per Cell",
fill = "orangered") +
ggtitle("Genes per Cell")
genes_violin_list <- list(category_genes_violins,
barcode_genes_violins)
plot_grid(plotlist = genes_violin_list,
ncol = 2, rel_widths = c(1, 3),
nrow = 1, align = "h")
## Warning: Removed 859 rows containing non-finite values (stat_ydensity).
## Warning: Removed 803 rows containing non-finite values (stat_ydensity).
genes_hto_barcode_by_well <- singlet_meta[,
.(median_genes = median(n_genes)),
by = list(well_id, hto_barcode)]
genes_hto_barcode_by_well <- dcast(genes_hto_barcode_by_well,
formula = hto_barcode ~ well_id,
value.var = "median_genes")
genes_hto_barcode_by_well <- hto_barcode_to_sample_id[genes_hto_barcode_by_well,
on = "hto_barcode"]
for(well in names(genes_hto_barcode_by_well)[-c(1,2)]) {
sample_list <- as.list(genes_hto_barcode_by_well[[well]])
names(sample_list) <- genes_hto_barcode_by_well$sample_id
qc_list$well_stats[[well]]$sample_genes_median <- sample_list
}
for(sample_id in genes_hto_barcode_by_well$sample_id) {
sample_row <- genes_hto_barcode_by_well[genes_hto_barcode_by_well$sample_id == sample_id, ,drop = FALSE]
sample_row <- sample_row[,-c(1,2)]
qc_list$sample_stats[[sample_id]]$well_genes_median <- as.list(sample_row)
}
qc_table(genes_hto_barcode_by_well)
stm(paste0("Writing JSON to ", out_json))
qc_list_json <- jsonlite::toJSON(qc_list,
auto_unbox = TRUE,
pretty = TRUE)
writeLines(qc_list_json,
out_json)
sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Debian GNU/Linux 9 (stretch)
##
## Matrix products: default
## BLAS: /usr/lib/libblas/libblas.so.3.7.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.7.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] cowplot_1.0.0 ggplot2_3.3.0 BarMixer_1.0.1 Matrix_1.2-18
## [5] data.table_1.13.0 rhdf5_2.28.1 optparse_1.6.6
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.3 highr_0.8 later_1.0.0 compiler_3.6.3
## [5] pillar_1.4.6 tools_3.6.3 uuid_0.1-4 digest_0.6.25
## [9] jsonlite_1.6.1 evaluate_0.14 lifecycle_0.2.0 tibble_3.0.4
## [13] gtable_0.3.0 lattice_0.20-40 pkgconfig_2.0.3 rlang_0.4.8
## [17] shiny_1.4.0 crosstalk_1.0.0 yaml_2.2.1 xfun_0.12
## [21] fastmap_1.0.1 withr_2.1.2 stringr_1.4.0 dplyr_1.0.2
## [25] knitr_1.28 htmlwidgets_1.5.1 generics_0.0.2 vctrs_0.3.4
## [29] DT_0.12 grid_3.6.3 getopt_1.20.3 tidyselect_1.1.0
## [33] glue_1.4.2 R6_2.4.1 rmarkdown_2.1 farver_2.0.3
## [37] Rhdf5lib_1.6.3 purrr_0.3.4 magrittr_1.5 promises_1.1.0
## [41] ids_1.0.1 scales_1.1.0 htmltools_0.5.1.1 ellipsis_0.3.1
## [45] assertthat_0.2.1 xtable_1.8-4 mime_0.9 colorspace_1.4-1
## [49] httpuv_1.5.2 labeling_0.3 stringi_1.4.6 munsell_0.5.0
## [53] crayon_1.3.4
Total time elapsed
end_time <- Sys.time()
diff_time <- end_time - start_time
time_message <- paste0("Elapsed Time: ",
round(diff_time, 3),
" ", units(diff_time))
print(time_message)
## [1] "Elapsed Time: 3.665 mins"
stm(time_message)
stm("H5 merge process complete.")