Contents

Data Processing

Session Preparation

Load libraries:

start_time <- Sys.time()

quiet_library <- function(...) {
  suppressPackageStartupMessages(library(...))
}
quiet_library(rhdf5)
quiet_library(BarMixer)
quiet_library(Matrix)
quiet_library(ggplot2)
quiet_library(cowplot)
quiet_library(jsonlite)

Declaring start

stm("Starting .h5 metadata analysis")
stm(paste0("Using BarMixer v", installed.packages()["BarMixer","Version"]))

Argument parsing

in_tenx <- params$in_tenx
in_well <- params$in_well
in_sample <- params$in_sample
out_dir <- params$out_dir

# Check Well ID format
if(grepl("^[A-Z|-]+[0-9]+-.?P[0-9]C[0-9]W[0-9]$", in_well)) {
  in_well <- sub("-.P","-P",in_well)
  well_type <- "aifi"
} else {
  well_type <- "other"
}

stm(paste0("IN  10x outs/       : ", in_tenx))
stm(paste0("IN  Well ID         : ", in_well))
stm(paste0("OUT H5 directory    : ", out_dir))

Input Parameters

print(c(
  paste0("IN  10x outs/       : ", in_tenx),
  paste0("IN  Well ID         : ", in_well),
  paste0("OUT H5 directory    : ", out_dir)
))
## [1] "IN  10x outs/       : /mnt/barware-manuscript/BarWare-demo/cellranger/W4-hashed-24k/outs/"        
## [2] "IN  Well ID         : W4-hashed-24k"                                                              
## [3] "OUT H5 directory    : /mnt/barware-manuscript/BarWare-demo/demo_output/W4-hashed-24k/rna_metadata"

Check Input Files

in_h5 <- file.path(in_tenx, "filtered_feature_bc_matrix.h5")

if(!file.exists(in_h5)) {
  stm(paste0("ERROR: Cannot find IN H5 file:", in_h5))
  stop()
} else {
  stm(paste0("Found matrix file:", in_h5))
}

in_mol <- file.path(in_tenx, "molecule_info.h5")

if(!file.exists(in_mol)) {
  stm(paste0("ERROR: Cannot find IN Mol Info H5 file:", in_mol))
  stop()
} else {
  stm(paste0("Found mol_info file:", in_mol))
}

in_sum <- file.path(in_tenx, "metrics_summary.csv")

if(!file.exists(in_sum)) {
  stm(paste0("ERROR: Cannot find IN Metrics Summary file:", in_sum))
  stop()
} else {
  stm(paste0("Found metrics file:", in_sum))
}

Create out directory if missing

if(!dir.exists(out_dir)) {
  stm(paste0("Creating Output Directory: ",out_dir))
  dir.create(out_dir, 
             recursive = TRUE)
}

Return to Contents

Load inputs

Load scRNA-seq Dataset

stm(paste0("Loading HDF5 from ", in_h5))
h5_list <- h5dump(in_h5)

Set output files

out_h5 <- file.path(out_dir, paste0(in_well, ".h5"))
out_json <- sub(".h5$","_well_metrics.json",out_h5)

stm(paste0("OUT H5 file         : ", out_h5))
stm(paste0("OUT JSON file       : ", out_json))

print(c(
  paste0("OUT H5 file         : ", out_h5),
  paste0("OUT JSON file       : ", out_json)
))
## [1] "OUT H5 file         : /mnt/barware-manuscript/BarWare-demo/demo_output/W4-hashed-24k/rna_metadata/W4-hashed-24k.h5"               
## [2] "OUT JSON file       : /mnt/barware-manuscript/BarWare-demo/demo_output/W4-hashed-24k/rna_metadata/W4-hashed-24k_well_metrics.json"

Read molecule info to get read counts per cell

stm(paste0("Assembling Read Counts per Cell from ", in_mol))

bc <- sub("-1","",h5_list$matrix$barcodes)

bc_counts <- data.table(mol_idx = h5read(in_mol, "/barcode_idx"),
                        umi_count = h5read(in_mol, "/count"))

bc_sums <- bc_counts[, .(n_reads = sum(umi_count)), by = mol_idx]
rm(bc_counts)

mol_bc <- h5read(in_mol, "/barcodes")
bc_sums$cell_barcode <- mol_bc[bc_sums$mol_idx + 1]
rm(mol_bc)

bc_sums <- bc_sums[,.(cell_barcode, n_reads)]

n_reads <- bc_sums$n_reads[match(bc, bc_sums$cell_barcode)]
n_reads[is.na(n_reads)] <- 0

h5_list <- set_list_path(h5_list,
                           "/matrix/observations/n_reads",
                           n_reads)

Return to Contents

Assemble data

Compute N UMIs and N Genes per cell

stm("Computing UMI and Gene Counts per Cell")

h5_list <- h5_list_convert_to_dgCMatrix(h5_list, target = "matrix")

h5_list <- set_list_path(h5_list,
                         "/matrix/observations/n_umis",
                         unname(Matrix::colSums(h5_list$matrix_dgCMatrix)))

h5_list <- set_list_path(h5_list,
                         "/matrix/observations/n_genes",
                         unname(Matrix::colSums(h5_list$matrix_dgCMatrix > 0)))

h5_list <- h5_list_convert_from_dgCMatrix(h5_list, target = "matrix")

Add cell ids

stm("Adding Cell UUIDs and Names")

h5_list <- add_cell_ids(h5_list,
                        add_uuid = TRUE,
                        replace_barcode = TRUE,
                        retain_original_barcode = TRUE,
                        add_name = TRUE)

Generate Chip, Pool, and Batch IDs based on well_id

if(well_type == "aifi") {
  stm("Adding Batch, Pool, Chip, and Well metadata")

  h5_list <- add_well_metadata(h5_list,
                             well_id = in_well)
} else {
  h5_list <- set_list_path(h5_list,
                           "/matrix/observations/well_id",
                           rep(in_well, length(h5_list$matrix$barcodes)))
}

Add chrM gene counts

stm("Adding chrM count metadata")

h5_list <- h5_list_add_mito_umis(h5_list)

Add Well Metrics

well_metrics <- read_tenx_metrics(in_sum)

well_metrics <- as.list(well_metrics)

h5_list <- set_list_path(h5_list,
                         "/well",
                         well_metrics)

h5_list <- set_list_path(h5_list,
                         "/well/well_id",
                         in_well)

Return to Contents

Write Output

Write HDF5 files

stm(paste0("Writing HDF5 to ", out_h5))
write_h5_list(h5_list,
              h5_file = out_h5,
              overwrite = TRUE)
h5closeAll()

Return to Contents

QC Tables and Plots

Extract metadata for plotting

stm("Generating tables and plots for report")
meta <- h5_list_cell_metadata(h5_list)

qc_list <- list(report_type = "add_tenx_rna_metadata",
                report_datetime = as.character(start_time),
                report_uuid = ids::uuid(use_time = TRUE),
                package = "BarMixer",
                package_version = sessionInfo()$otherPkgs$H5weaver$Version,
                well_id = in_well)

Return to Contents

Well QC

10x Genomics Mapping Metrics

qc_well_metrics <- data.frame(tenx_metric = names(well_metrics),
                              value = as.vector(unlist(well_metrics)),
                              stringsAsFactors = FALSE)

qc_well_metrics$value[qc_well_metrics$tenx_metric == "number_of_reads"] <- as.numeric(qc_well_metrics$value[qc_well_metrics$tenx_metric == "number_of_reads"] / 1e6)
qc_well_metrics$tenx_metric[qc_well_metrics$tenx_metric == "number_of_reads"] <- "millions_of_reads"

qc_list$well_metrics <- split(qc_well_metrics$value, 1:nrow(qc_well_metrics))
names(qc_list$well_metrics) <- qc_well_metrics$tenx_metric

qc_table(qc_well_metrics)

Return to Contents

Well Read/UMI/Gene Summary Stats

overview_stats <- data.frame(statistic = c("N Mapped Reads",
                                           "N UMIs",
                                           "N Mito. UMIs",
                                           "N Genes",
                                           "Reads per UMI",
                                           "UMIs per Gene",
                                           "Reads per Gene"),
                             q_25 = c(quantile(meta$n_reads, 0.25),
                                      quantile(meta$n_umis,  0.25),
                                      quantile(meta$n_mito_umis, 0.25),
                                      quantile(meta$n_genes, 0.25),
                                      quantile(meta$n_reads/meta$n_umis, 0.25, na.rm = TRUE),
                                      quantile(meta$n_umis/meta$n_genes, 0.25, na.rm = TRUE),
                                      quantile(meta$n_reads/meta$n_genes, 0.25, na.rm = TRUE)),
                             median = c(quantile(meta$n_reads, 0.5),
                                        quantile(meta$n_umis,  0.50),
                                        quantile(meta$n_mito_umis, 0.50),
                                        quantile(meta$n_genes, 0.50),
                                        quantile(meta$n_reads/meta$n_umis, 0.50, na.rm = TRUE),
                                        quantile(meta$n_umis/meta$n_genes, 0.50, na.rm = TRUE),
                                        quantile(meta$n_reads/meta$n_genes, 0.50, na.rm = TRUE)),
                             q_75 = c(quantile(meta$n_reads, 0.75),
                                      quantile(meta$n_umis,  0.75),
                                      quantile(meta$n_mito_umis, 0.25),
                                      quantile(meta$n_genes, 0.75),
                                      quantile(meta$n_reads/meta$n_umis, 0.75, na.rm = TRUE),
                                      quantile(meta$n_umis/meta$n_genes, 0.75, na.rm = TRUE),
                                      quantile(meta$n_reads/meta$n_genes, 0.75, na.rm = TRUE)))

overview_stats$q_25 <- round(overview_stats$q_25, 2)
overview_stats$median <- round(overview_stats$median, 2)
overview_stats$q_75 <- round(overview_stats$q_75, 2)

qc_list$well_count_stats <- lapply(1:nrow(overview_stats),
                                      function(x) {
                                        list(q_25 = overview_stats$q_25[x],
                                             median = overview_stats$median[x],
                                             q_75 = overview_stats$q_75[x])
                                      })
names(qc_list$well_count_stats) <- tolower(gsub("[\\. ]+", "_", overview_stats$statistic))

qc_table(overview_stats)

Return to Contents

Well Read/UMI/Gene Histograms

reads_plot <- qc_hist_plot(meta,
                           column = "n_reads",
                           name_x = "N Reads per Cell",
                           fill = "dodgerblue",
                           target = 2e4) +
  ggtitle("Alignment/Mapping Distributions")

umis_plot <- qc_hist_plot(meta,
                          column = "n_umis",
                          name_x = "N UMIs per Cell",
                          fill = "purple",
                          target = 8e3)

genes_plot <- qc_hist_plot(meta,
                           column = "n_genes",
                           name_x = "N Genes per Cell",
                           fill = "orangered",
                           target = 2e3)

histogram_list <- list(reads_plot,
                       umis_plot,
                       genes_plot)

suppressWarnings(
  plot_grid(plotlist = histogram_list,
            ncol = 1,
            nrow = 3)
)

Return to Contents

UMIs vs Reads Scatter Plot

umi_reads_scatter <- qc_scatter_plot(meta,
                                     column_x = "n_reads",
                                     name_x = "N Reads per Cell",
                                     column_y = "n_umis",
                                     name_y = "N UMIs per Cell",
                                     color = "darkblue") +
  ggtitle("UMIs vs Reads")

suppressWarnings(
  umi_reads_scatter
)

Genes vs UMIs Scatter Plot

genes_umis_scatter <- qc_scatter_plot(meta,
                                      column_x = "n_umis",
                                      name_x = "N UMIs per Cell",
                                      column_y = "n_genes",
                                      name_y = "N Genes per Cell",
                                      color = "red") +
  ggtitle("Genes vs UMIs")

suppressWarnings(
  genes_umis_scatter
)
## Warning: Removed 344 rows containing missing values (geom_point).

Genes vs Reads Scatter Plot

genes_reads_scatter <- qc_scatter_plot(meta,
                                       column_x = "n_reads",
                                       name_x = "N Reads per Cell",
                                       column_y = "n_genes",
                                       name_y = "N Genes per Cell",
                                       color = "darkgreen") +
  ggtitle("Genes vs Reads")

suppressWarnings(
  genes_reads_scatter
)
## Warning: Removed 344 rows containing missing values (geom_point).

Return to Contents

Write QC JSON

stm(paste0("Writing JSON to ", out_json))

qc_list_json <- jsonlite::toJSON(qc_list,
                                 auto_unbox = TRUE,
                                 pretty = TRUE)

writeLines(qc_list_json,
           out_json)

Return to Contents

Session Information

sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Debian GNU/Linux 9 (stretch)
## 
## Matrix products: default
## BLAS:   /usr/lib/libblas/libblas.so.3.7.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.7.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] jsonlite_1.6.1    cowplot_1.0.0     ggplot2_3.3.0     BarMixer_1.0.1   
## [5] Matrix_1.2-18     data.table_1.13.0 rhdf5_2.28.1      optparse_1.6.6   
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.1.0  xfun_0.12         purrr_0.3.4       lattice_0.20-40  
##  [5] colorspace_1.4-1  vctrs_0.3.4       generics_0.0.2    htmltools_0.5.1.1
##  [9] getopt_1.20.3     yaml_2.2.1        rlang_0.4.8       R.oo_1.24.0      
## [13] pillar_1.4.6      later_1.0.0       glue_1.4.2        withr_2.1.2      
## [17] R.utils_2.10.1    uuid_0.1-4        lifecycle_0.2.0   stringr_1.4.0    
## [21] munsell_0.5.0     gtable_0.3.0      R.methodsS3_1.8.1 htmlwidgets_1.5.1
## [25] evaluate_0.14     labeling_0.3      knitr_1.28        fastmap_1.0.1    
## [29] httpuv_1.5.2      crosstalk_1.0.0   Rcpp_1.0.3        xtable_1.8-4     
## [33] scales_1.1.0      promises_1.1.0    DT_0.12           mime_0.9         
## [37] farver_2.0.3      ids_1.0.1         digest_0.6.25     stringi_1.4.6    
## [41] dplyr_1.0.2       shiny_1.4.0       grid_3.6.3        tools_3.6.3      
## [45] magrittr_1.5      tibble_3.0.4      crayon_1.3.4      pkgconfig_2.0.3  
## [49] ellipsis_0.3.1    assertthat_0.2.1  rmarkdown_2.1     Rhdf5lib_1.6.3   
## [53] R6_2.4.1          compiler_3.6.3

Total time elapsed

end_time <- Sys.time()
diff_time <- end_time - start_time
time_message <- paste0("Elapsed Time: ", 
                       round(diff_time, 3),
                       " ", units(diff_time))
print(time_message)
## [1] "Elapsed Time: 52.558 secs"
stm(time_message)
stm("H5 metadata process complete.")

Return to Contents