start_time <- Sys.time()
quiet_library <- function(...) {
suppressPackageStartupMessages(library(...))
}
quiet_library(rhdf5)
quiet_library(BarMixer)
quiet_library(Matrix)
quiet_library(ggplot2)
quiet_library(cowplot)
quiet_library(jsonlite)
Declaring start
stm("Starting .h5 metadata analysis")
stm(paste0("Using BarMixer v", installed.packages()["BarMixer","Version"]))
in_tenx <- params$in_tenx
in_well <- params$in_well
in_sample <- params$in_sample
out_dir <- params$out_dir
# Check Well ID format
if(grepl("^[A-Z|-]+[0-9]+-.?P[0-9]C[0-9]W[0-9]$", in_well)) {
in_well <- sub("-.P","-P",in_well)
well_type <- "aifi"
} else {
well_type <- "other"
}
stm(paste0("IN 10x outs/ : ", in_tenx))
stm(paste0("IN Well ID : ", in_well))
stm(paste0("OUT H5 directory : ", out_dir))
print(c(
paste0("IN 10x outs/ : ", in_tenx),
paste0("IN Well ID : ", in_well),
paste0("OUT H5 directory : ", out_dir)
))
## [1] "IN 10x outs/ : /mnt/barware-manuscript/BarWare-demo/cellranger/W3-hashed-16k/outs/"
## [2] "IN Well ID : W3-hashed-16k"
## [3] "OUT H5 directory : /mnt/barware-manuscript/BarWare-demo/demo_output/W3-hashed-16k/rna_metadata"
in_h5 <- file.path(in_tenx, "filtered_feature_bc_matrix.h5")
if(!file.exists(in_h5)) {
stm(paste0("ERROR: Cannot find IN H5 file:", in_h5))
stop()
} else {
stm(paste0("Found matrix file:", in_h5))
}
in_mol <- file.path(in_tenx, "molecule_info.h5")
if(!file.exists(in_mol)) {
stm(paste0("ERROR: Cannot find IN Mol Info H5 file:", in_mol))
stop()
} else {
stm(paste0("Found mol_info file:", in_mol))
}
in_sum <- file.path(in_tenx, "metrics_summary.csv")
if(!file.exists(in_sum)) {
stm(paste0("ERROR: Cannot find IN Metrics Summary file:", in_sum))
stop()
} else {
stm(paste0("Found metrics file:", in_sum))
}
if(!dir.exists(out_dir)) {
stm(paste0("Creating Output Directory: ",out_dir))
dir.create(out_dir,
recursive = TRUE)
}
stm(paste0("Loading HDF5 from ", in_h5))
h5_list <- h5dump(in_h5)
out_h5 <- file.path(out_dir, paste0(in_well, ".h5"))
out_json <- sub(".h5$","_well_metrics.json",out_h5)
stm(paste0("OUT H5 file : ", out_h5))
stm(paste0("OUT JSON file : ", out_json))
print(c(
paste0("OUT H5 file : ", out_h5),
paste0("OUT JSON file : ", out_json)
))
## [1] "OUT H5 file : /mnt/barware-manuscript/BarWare-demo/demo_output/W3-hashed-16k/rna_metadata/W3-hashed-16k.h5"
## [2] "OUT JSON file : /mnt/barware-manuscript/BarWare-demo/demo_output/W3-hashed-16k/rna_metadata/W3-hashed-16k_well_metrics.json"
stm(paste0("Assembling Read Counts per Cell from ", in_mol))
bc <- sub("-1","",h5_list$matrix$barcodes)
bc_counts <- data.table(mol_idx = h5read(in_mol, "/barcode_idx"),
umi_count = h5read(in_mol, "/count"))
bc_sums <- bc_counts[, .(n_reads = sum(umi_count)), by = mol_idx]
rm(bc_counts)
mol_bc <- h5read(in_mol, "/barcodes")
bc_sums$cell_barcode <- mol_bc[bc_sums$mol_idx + 1]
rm(mol_bc)
bc_sums <- bc_sums[,.(cell_barcode, n_reads)]
n_reads <- bc_sums$n_reads[match(bc, bc_sums$cell_barcode)]
n_reads[is.na(n_reads)] <- 0
h5_list <- set_list_path(h5_list,
"/matrix/observations/n_reads",
n_reads)
stm("Computing UMI and Gene Counts per Cell")
h5_list <- h5_list_convert_to_dgCMatrix(h5_list, target = "matrix")
h5_list <- set_list_path(h5_list,
"/matrix/observations/n_umis",
unname(Matrix::colSums(h5_list$matrix_dgCMatrix)))
h5_list <- set_list_path(h5_list,
"/matrix/observations/n_genes",
unname(Matrix::colSums(h5_list$matrix_dgCMatrix > 0)))
h5_list <- h5_list_convert_from_dgCMatrix(h5_list, target = "matrix")
stm("Adding Cell UUIDs and Names")
h5_list <- add_cell_ids(h5_list,
add_uuid = TRUE,
replace_barcode = TRUE,
retain_original_barcode = TRUE,
add_name = TRUE)
if(well_type == "aifi") {
stm("Adding Batch, Pool, Chip, and Well metadata")
h5_list <- add_well_metadata(h5_list,
well_id = in_well)
} else {
h5_list <- set_list_path(h5_list,
"/matrix/observations/well_id",
rep(in_well, length(h5_list$matrix$barcodes)))
}
stm("Adding chrM count metadata")
h5_list <- h5_list_add_mito_umis(h5_list)
well_metrics <- read_tenx_metrics(in_sum)
well_metrics <- as.list(well_metrics)
h5_list <- set_list_path(h5_list,
"/well",
well_metrics)
h5_list <- set_list_path(h5_list,
"/well/well_id",
in_well)
stm(paste0("Writing HDF5 to ", out_h5))
write_h5_list(h5_list,
h5_file = out_h5,
overwrite = TRUE)
h5closeAll()
Extract metadata for plotting
stm("Generating tables and plots for report")
meta <- h5_list_cell_metadata(h5_list)
qc_list <- list(report_type = "add_tenx_rna_metadata",
report_datetime = as.character(start_time),
report_uuid = ids::uuid(use_time = TRUE),
package = "BarMixer",
package_version = sessionInfo()$otherPkgs$H5weaver$Version,
well_id = in_well)
qc_well_metrics <- data.frame(tenx_metric = names(well_metrics),
value = as.vector(unlist(well_metrics)),
stringsAsFactors = FALSE)
qc_well_metrics$value[qc_well_metrics$tenx_metric == "number_of_reads"] <- as.numeric(qc_well_metrics$value[qc_well_metrics$tenx_metric == "number_of_reads"] / 1e6)
qc_well_metrics$tenx_metric[qc_well_metrics$tenx_metric == "number_of_reads"] <- "millions_of_reads"
qc_list$well_metrics <- split(qc_well_metrics$value, 1:nrow(qc_well_metrics))
names(qc_list$well_metrics) <- qc_well_metrics$tenx_metric
qc_table(qc_well_metrics)
overview_stats <- data.frame(statistic = c("N Mapped Reads",
"N UMIs",
"N Mito. UMIs",
"N Genes",
"Reads per UMI",
"UMIs per Gene",
"Reads per Gene"),
q_25 = c(quantile(meta$n_reads, 0.25),
quantile(meta$n_umis, 0.25),
quantile(meta$n_mito_umis, 0.25),
quantile(meta$n_genes, 0.25),
quantile(meta$n_reads/meta$n_umis, 0.25, na.rm = TRUE),
quantile(meta$n_umis/meta$n_genes, 0.25, na.rm = TRUE),
quantile(meta$n_reads/meta$n_genes, 0.25, na.rm = TRUE)),
median = c(quantile(meta$n_reads, 0.5),
quantile(meta$n_umis, 0.50),
quantile(meta$n_mito_umis, 0.50),
quantile(meta$n_genes, 0.50),
quantile(meta$n_reads/meta$n_umis, 0.50, na.rm = TRUE),
quantile(meta$n_umis/meta$n_genes, 0.50, na.rm = TRUE),
quantile(meta$n_reads/meta$n_genes, 0.50, na.rm = TRUE)),
q_75 = c(quantile(meta$n_reads, 0.75),
quantile(meta$n_umis, 0.75),
quantile(meta$n_mito_umis, 0.25),
quantile(meta$n_genes, 0.75),
quantile(meta$n_reads/meta$n_umis, 0.75, na.rm = TRUE),
quantile(meta$n_umis/meta$n_genes, 0.75, na.rm = TRUE),
quantile(meta$n_reads/meta$n_genes, 0.75, na.rm = TRUE)))
overview_stats$q_25 <- round(overview_stats$q_25, 2)
overview_stats$median <- round(overview_stats$median, 2)
overview_stats$q_75 <- round(overview_stats$q_75, 2)
qc_list$well_count_stats <- lapply(1:nrow(overview_stats),
function(x) {
list(q_25 = overview_stats$q_25[x],
median = overview_stats$median[x],
q_75 = overview_stats$q_75[x])
})
names(qc_list$well_count_stats) <- tolower(gsub("[\\. ]+", "_", overview_stats$statistic))
qc_table(overview_stats)
reads_plot <- qc_hist_plot(meta,
column = "n_reads",
name_x = "N Reads per Cell",
fill = "dodgerblue",
target = 2e4) +
ggtitle("Alignment/Mapping Distributions")
umis_plot <- qc_hist_plot(meta,
column = "n_umis",
name_x = "N UMIs per Cell",
fill = "purple",
target = 8e3)
genes_plot <- qc_hist_plot(meta,
column = "n_genes",
name_x = "N Genes per Cell",
fill = "orangered",
target = 2e3)
histogram_list <- list(reads_plot,
umis_plot,
genes_plot)
suppressWarnings(
plot_grid(plotlist = histogram_list,
ncol = 1,
nrow = 3)
)
umi_reads_scatter <- qc_scatter_plot(meta,
column_x = "n_reads",
name_x = "N Reads per Cell",
column_y = "n_umis",
name_y = "N UMIs per Cell",
color = "darkblue") +
ggtitle("UMIs vs Reads")
suppressWarnings(
umi_reads_scatter
)
## Warning: Removed 1 rows containing missing values (geom_point).
genes_umis_scatter <- qc_scatter_plot(meta,
column_x = "n_umis",
name_x = "N UMIs per Cell",
column_y = "n_genes",
name_y = "N Genes per Cell",
color = "red") +
ggtitle("Genes vs UMIs")
suppressWarnings(
genes_umis_scatter
)
## Warning: Removed 310 rows containing missing values (geom_point).
genes_reads_scatter <- qc_scatter_plot(meta,
column_x = "n_reads",
name_x = "N Reads per Cell",
column_y = "n_genes",
name_y = "N Genes per Cell",
color = "darkgreen") +
ggtitle("Genes vs Reads")
suppressWarnings(
genes_reads_scatter
)
## Warning: Removed 311 rows containing missing values (geom_point).
stm(paste0("Writing JSON to ", out_json))
qc_list_json <- jsonlite::toJSON(qc_list,
auto_unbox = TRUE,
pretty = TRUE)
writeLines(qc_list_json,
out_json)
sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Debian GNU/Linux 9 (stretch)
##
## Matrix products: default
## BLAS: /usr/lib/libblas/libblas.so.3.7.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.7.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] jsonlite_1.6.1 cowplot_1.0.0 ggplot2_3.3.0 BarMixer_1.0.1
## [5] Matrix_1.2-18 data.table_1.13.0 rhdf5_2.28.1 optparse_1.6.6
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.1.0 xfun_0.12 purrr_0.3.4 lattice_0.20-40
## [5] colorspace_1.4-1 vctrs_0.3.4 generics_0.0.2 htmltools_0.5.1.1
## [9] getopt_1.20.3 yaml_2.2.1 rlang_0.4.8 R.oo_1.24.0
## [13] pillar_1.4.6 later_1.0.0 glue_1.4.2 withr_2.1.2
## [17] R.utils_2.10.1 uuid_0.1-4 lifecycle_0.2.0 stringr_1.4.0
## [21] munsell_0.5.0 gtable_0.3.0 R.methodsS3_1.8.1 htmlwidgets_1.5.1
## [25] evaluate_0.14 labeling_0.3 knitr_1.28 fastmap_1.0.1
## [29] httpuv_1.5.2 crosstalk_1.0.0 Rcpp_1.0.3 xtable_1.8-4
## [33] scales_1.1.0 promises_1.1.0 DT_0.12 mime_0.9
## [37] farver_2.0.3 ids_1.0.1 digest_0.6.25 stringi_1.4.6
## [41] dplyr_1.0.2 shiny_1.4.0 grid_3.6.3 tools_3.6.3
## [45] magrittr_1.5 tibble_3.0.4 crayon_1.3.4 pkgconfig_2.0.3
## [49] ellipsis_0.3.1 assertthat_0.2.1 rmarkdown_2.1 Rhdf5lib_1.6.3
## [53] R6_2.4.1 compiler_3.6.3
Total time elapsed
end_time <- Sys.time()
diff_time <- end_time - start_time
time_message <- paste0("Elapsed Time: ",
round(diff_time, 3),
" ", units(diff_time))
print(time_message)
## [1] "Elapsed Time: 41.578 secs"
stm(time_message)
stm("H5 metadata process complete.")