# Set seeds and initial parameters for ArchR
set.seed(1)

# Number of cores to use (adapt to your machine, but has to be > 1 otherwise plotTSSenrichment does not work properly)
addArchRThreads(threads = 8)

## Setting default number of Parallel threads to 8.

# Reference genome (adapt to your data, use the same reference that was used for the alignment)
addArchRGenome("mm10")

## Setting default genome to Mm10.

1 Samples

We are analyzing CD4+ T cells from healthy murine tissue (spleen, colon, fat and skin). For spleen we have an additional sample of CD25+ regulatory T cells. Overall we have 5 samples in this analysis, which were previously aligned using Cell Ranger ATAC (v2.0.0) count.

scATAC_1 spleen CD4+ T cells
scATAC_8 spleen CD25+ T cells
scATAC_4 colon CD4+ T cells
scATAC_5 fat CD4+ T cells
scATAC_9 skin CD4+ T cells

2 Create ArrowFiles

From the fragments.tsv files output by Cell Ranger ATAC count, generate ArrowFiles for subsequent analysis with ArchR.

# Read in fragments files
inputFiles = vector()
n = 1
for(X in c(1,4,5,8,9)) {
  inputFiles[n] = paste0("data/MD_scATAC_",X,"/fragments.tsv.gz")
  n = n+1
}

# Name input files
names(inputFiles) = vector() 
n = 1
for(X in c(1,4,5,8,9)) {
   names(inputFiles)[n] = paste0("scATAC_",X,"")
   n = n+1
 }

# Create arrow files
# Evaluate different thresholds depending on your data: 
# - minTSS:   Start with 0 to see all cells, afterwards evaluate which threshold 
#             works for all of the samples
# - minFrags: Recommended to set >= 1000, otherwise the analysis might not be
#             robust enough
# Check different parameters to set with ?createArrowFiles
ArrowFiles <- createArrowFiles(
  inputFiles = inputFiles,
  sampleNames = names(inputFiles),
  minTSS = 0,
  minFrags = 1000, 
  addTileMat = TRUE,
  addGeneScoreMat = TRUE,
  force = TRUE 
  )

After you created the ArrowFiles, check the folder “QualityControl” for each of the samples to evaluate which minTSS threshold to use for the analysis. In this analysis, we decided on minTSS = 10. We will filter the data accordingly later on.

3 Create ArchRProject

Now combine the individual ArrowFiles into an ArchRProject.

# Define arrow files
ArrowFiles = vector()
n = 1
for(X in c(1,4,5,8,9)) {
  ArrowFiles[n] = paste0("scATAC_",X,".arrow")
  n = n+1
}

# Create ArchRProject
# It is recommended to set copyArrows = TRUE to maintain an unaltered copy for 
# later usage. 
proj = ArchRProject(
  ArrowFiles = ArrowFiles,
  copyArrows = TRUE
)

saveArchRProject(proj, 
  outputDirectory="ArchRProject", 
  load=F
)

# Load ArchRProject
proj = loadArchRProject(
  path = "ArchRProject", 
  force = FALSE, 
  showLogo = FALSE
  )

4 QC and Filtering

4.1 TSS Enrichment vs nFrags (unfiltered)

# After checking the plots in "QualityControl", we decided to use minTSS = 10. 
# Now, we filter out all cells with minTSS <= 10. 
proj = proj[proj@cellColData$TSSEnrichment >= 10, ]

# Save
saveArchRProject(proj, 
  outputDirectory="ArchRProject", 
  load = TRUE,
  overwrite = TRUE
)

## class: ArchRProject 
## outputDirectory: /omics/groups/OE0436/internal/Regensburg/Mainz/03_archr/scripts/methods/paper/ArchRProject 
## samples(5): scATAC_4 scATAC_1 scATAC_9 scATAC_8 scATAC_5
## sampleColData names(1): ArrowFiles
## cellColData names(13): Sample TSSEnrichment ... nDiFrags BlacklistRatio
## numberOfCells(1): 17931
## medianTSS(1): 20.751
## medianFrags(1): 26426

# As part of the quality control, we plot the TSS enrichment against the number 
# of fragments. As we only have one project now, we get one plot for all of the 
# data.
qc_metrics = getCellColData(proj, select = c("log10(nFrags)", "TSSEnrichment"))

p_qc_metrics = ggPoint(
  x = qc_metrics[,1],
  y = qc_metrics[,2],
  colorDensity = TRUE,
  continuousSet = "sambaNight",
  xlabel = "Log10 Unique Fragments",
  ylabel = "TSS Enrichment",
  xlim = c(log10(500), quantile(qc_metrics[,1], probs = 0.99)),
  ylim = c(0, quantile(qc_metrics[,2], probs = 0.99))
) + geom_hline(yintercept = 10, lty = "dashed") + geom_vline(xintercept = 3, lty = "dashed")

p_qc_metrics

4.2 Filter out barcodes marked as non-cell by CellRanger

# We use additional Cell Ranger information to filter out low-quality cells.
# For this we use the "is_cell_barcode" column of the "singlecell.csv" file, 
# which maps high-quality cells to 1.
singlecell = list()
for (x in c("1","4","5","8","9")){
  # Define filenames and read files
  filename = paste("data/MD_scATAC_",x,"/singlecell.csv",sep = "") 
  data = read.csv(filename)
  # To match quality information to cells, we need the barcodes to match the 
  # ones in our ArchRProject. For this we:
  # 1) create a vector of "scATAC_x#"
  # 2) add vector as a column to the data
  # 3) create column containing ArchRProj-style barcodes
  bc = c(rep(paste("scATAC_",x,"#",sep = ""),nrow(data))) 
  data_barcode = cbind(bc,data) 
  data_fullbc = data_barcode %>% unite("full_barcode", bc:barcode, remove = FALSE, sep = "") 
  singlecell[[x]] = data_fullbc
}

# Combine the dataframes
singlecell_fullbc = rbindlist(singlecell, use.names = FALSE, fill = FALSE) 

# Extract rownames that are also in the ArchRProject
rownames_archr = rownames(proj@cellColData)
subset_singlecell_fullbc = singlecell_fullbc[singlecell_fullbc$full_barcode %in% rownames_archr, ]

# Extract is__cell_barcode column from singlecell.csv and give it barcodes as rownames
df_is_cell_barcode = as.data.frame(subset_singlecell_fullbc$is__cell_barcode)
rownames(df_is_cell_barcode) = subset_singlecell_fullbc$full_barcode

# Order is_cell_barcode the way the ArchRProject is ordered and create filter
is_cell_barcode = df_is_cell_barcode[order(match(rownames(df_is_cell_barcode), rownames_archr)), ]
filter_archr = is_cell_barcode==1

# Filter ArchRProject for is_cell_barcode==1
proj = proj[filter_archr, ]

proj <- loadArchRProject(
  path = "ArchRProject", 
  force = FALSE, 
  showLogo = FALSE
  )

# Get an overview of the project:
proj

## class: ArchRProject 
## outputDirectory: /omics/groups/OE0436/internal/Regensburg/Mainz/03_archr/scripts/methods/paper/ArchRProject 
## samples(5): scATAC_4 scATAC_1 scATAC_9 scATAC_8 scATAC_5
## sampleColData names(1): ArrowFiles
## cellColData names(13): Sample TSSEnrichment ... nDiFrags BlacklistRatio
## numberOfCells(1): 14700
## medianTSS(1): 20.8665
## medianFrags(1): 29405.5

4.3 TSS Enrichment vs nFrags (filtered)

qc_metrics2 = getCellColData(proj, select = c("log10(nFrags)", "TSSEnrichment"))

p_qc_metrics2 = ggPoint(
  x = qc_metrics2[,1], 
  y = qc_metrics2[,2], 
  colorDensity = TRUE,
  continuousSet = "sambaNight",
  xlabel = "Log10 Unique Fragments",
  ylabel = "TSS Enrichment",
  xlim = c(log10(500), quantile(qc_metrics2[,1], probs = 0.99)),
  ylim = c(0, quantile(qc_metrics2[,2], probs = 0.99))
) + geom_hline(yintercept = 10, lty = "dashed") + geom_vline(xintercept = 3, lty = "dashed")

p_qc_metrics2

4.4 Plot sample statistics for the ArchRProject

# 1. Show TSS enrichment score for each sample in a ridge plot.
# The TSS enrichment score is a proxy for the quality of the sample and should
# be similar in all samples in the dataset.
p1 <- plotGroups(
  ArchRProj = proj, 
  groupBy = "Sample", 
  colorBy = "cellColData", 
  name = "TSSEnrichment",
  plotAs = "ridges"
)
p1

# 2. You can also display TSS enrichment scores in a violin plot.
# Lower and upper hinges correspond to the 25th and 75th percentiles, the middle
# corresponds to the median. Lower and upper whiskers extend from the hinge to 
# the lowest or highest value or 1.5 times the IQR.
p2 <- plotGroups(
  ArchRProj = proj, 
  groupBy = "Sample", 
  colorBy = "cellColData", 
  name = "TSSEnrichment",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p2

# 3. Show log10(unique nuclear fragments) for each sample in a ridge plot.
# It would be ideal if all of the samples have a similar number of fragments 
# per cell. If this is not the case, you can downsample the files by randomly
# removing fragments from the larger sample(s) in the fragments.tsv.gz file.
p3 <- plotGroups(
  ArchRProj = proj, 
  groupBy = "Sample", 
  colorBy = "cellColData", 
  name = "log10(nFrags)",
  plotAs = "ridges"
)
p3

# 4. You can also display log10(unique nuclear fragments) in a violin plot.
p4 <- plotGroups(
  ArchRProj = proj, 
  groupBy = "Sample", 
  colorBy = "cellColData", 
  name = "log10(nFrags)",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p4

# 5. plot sample fragment size distributios:
# Ideally, you should see a nucleosome profile with enrichment in 
# periodicity of 150 bp (length of DNA wrapped around one nucleosome) 
p5 <- plotFragmentSizes(ArchRProj = proj)
p5

# 6. plot TSS enrichment profiles:
p6 <- plotTSSEnrichment(ArchRProj = proj)
p6

5 Dimensionality reduction

5.1 Compute LSI dimensionality reduction

# LSI dimensionality reduction
# Create a dimensionality reduction of the data as basis for visualisation as UMAP
proj <- addIterativeLSI( 
  ArchRProj = proj,
  useMatrix = "TileMatrix", 
  name = "IterativeLSI", 
  iterations = 2, 
  clusterParams = list(
    resolution = c(0.2), 
    sampleCells = 10000, 
    n.start = 10
  ), 
  varFeatures = 25000, 
  dimsToUse = 1:30,
  force = TRUE
)

# Clustering
proj <- addClusters(
  input = proj,
  reducedDims = "IterativeLSI",
  method = "Seurat",
  name = "Clusters",
  resolution = 0.8, 
  force = TRUE
)

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 14700
## Number of edges: 515701
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8966
## Number of communities: 21
## Elapsed time: 1 seconds

5.2 Visualization in UMAP embedding

# Visualization of clustering in UMAP embedding.
# This step is not deterministic, plots will look different to the figures in
# the manuscript
proj <- addUMAP(
  ArchRProj = proj, 
  reducedDims = "IterativeLSI", 
  name = "UMAP", 
  nNeighbors = 30, 
  minDist = 0.5, 
  metric = "cosine", 
  force = TRUE
)

# Color by clusters
p_clusters <- plotEmbedding(
  ArchRProj = proj, 
  colorBy = "cellColData", 
  name = "Clusters", 
  embedding = "UMAP", 
  size = 0.5
  )

# Color by samples
p_samples <- plotEmbedding(
  ArchRProj = proj, 
  colorBy = "cellColData", 
  name = "Sample", 
  embedding = "UMAP", 
  size = 0.5
  )

# Plot
ggAlignPlots(p_clusters, p_samples, type = "h")

5.3 Identifying marker genes

# Get marker features
# Determine marker genes of each cluster based on the gene score matrix. 
# For this, each cluster is compared to all other clusters using wilcoxon 
# rank-sum test.
markersGS <- getMarkerFeatures(
  ArchRProj = proj, 
  useMatrix = "GeneScoreMatrix", 
  groupBy = "Clusters",
  bias = c("TSSEnrichment", "log10(nFrags)"),
  testMethod = "wilcoxon"
)

# Define which genes to highlight
markerGenes <- c("Cd4", "Il2ra", "Foxp3", "Batf", "Ccr8", "Ikzf2", "Rorc", "Tbx21", "Ifng", "Gata3", "Klrg1", "Il2")

# Get markers
markerList <- getMarkers(markersGS, cutOff = "FDR <= 0.01 & Log2FC >= 1.25")

# Overlay per-cell gene scores on our UMAP embedding
genes <- plotEmbedding(
  ArchRProj = proj, 
  colorBy = "GeneScoreMatrix", 
  name = markerGenes, 
  embedding = "UMAP",
  plotAs = "points", 
  quantCut = c(0.01, 0.95),
  imputeWeights = NULL,
  size = 0.5
)

# To plot all genes we can use cowplot to arrange the various marker genes into
# a single plot
genes_cow <- lapply(genes, function(x){
  x + guides(color = FALSE, fill = FALSE) + 
    theme_ArchR(baseSize = 6.5) +
    theme(plot.margin = unit(c(0, 0, 0, 0), "cm")) +
    theme(
      axis.text.x=element_blank(), 
      axis.ticks.x=element_blank(), 
      axis.text.y=element_blank(), 
      axis.ticks.y=element_blank()
    )
})
do.call(cowplot::plot_grid, c(list(ncol = 4), genes_cow))

5.4 Impute marker genes with MAGIC

# Some of the gene score overlays appear quite variable. This is due to the 
# sparsity of scATAC-seq data. MAGIC can be used to impute gene scores by 
# smoothing signal across nearby cells. This greatly improves the visual 
# interpretation of gene scores.

#Wwe first add impute weights to our ArchRProject
proj <- addImputeWeights(proj)

# These impute weights can then be passed to plotEmbedding() when plotting gene 
# scores overlayed on the UMAP embedding
magic_genes <- plotEmbedding(
  ArchRProj = proj, 
  colorBy = "GeneScoreMatrix", 
  name = markerGenes, 
  embedding = "UMAP",
  plotAs = "points", 
  imputeWeights = getImputeWeights(proj),
  size = 0.5
)

#Plot all genes in cowplot
magic_genes_cow <- lapply(magic_genes, function(x){
  x + guides(color = FALSE, fill = FALSE) + 
    theme_ArchR(baseSize = 6.5) +
    theme(plot.margin = unit(c(0, 0, 0, 0), "cm")) +
    theme(
      axis.text.x=element_blank(), 
      axis.ticks.x=element_blank(), 
      axis.text.y=element_blank(), 
      axis.ticks.y=element_blank()
    )
})
do.call(cowplot::plot_grid, c(list(ncol = 4), magic_genes_cow))

5.5 Tweak different parameters of LSI dimensionality reduction

Now we tweak different parameters to show their effect on the results. For this dataset we previously selected the defaults parameters. All of the following is not saved and only for demonstration purposes.

This should give you an idea of which parameters to tweak in your own data analysis.

5.5.1 More iterations

# LSI dimensionality reduction
proj_4iterations <- addIterativeLSI( 
  ArchRProj = proj,
  useMatrix = "TileMatrix", 
  name = "IterativeLSI", 
  iterations = 4, 
  clusterParams = list( #See Seurat::FindClusters
    resolution = c(0.2), 
    sampleCells = 10000, 
    n.start = 10
  ), 
  varFeatures = 25000, 
  dimsToUse = 1:30,
  force = TRUE
)

# Clustering
proj_4iterations <- addClusters(
  input = proj_4iterations,
  reducedDims = "IterativeLSI",
  method = "Seurat",
  name = "Clusters",
  resolution = 0.8, 
  force = TRUE
)

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 14700
## Number of edges: 520040
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8967
## Number of communities: 20
## Elapsed time: 2 seconds

# Visualization of clustering as UMAP
proj_4iterations <- addUMAP(
  ArchRProj = proj_4iterations, 
  reducedDims = "IterativeLSI", 
  name = "UMAP", 
  nNeighbors = 30, 
  minDist = 0.5, 
  metric = "cosine", 
  force = TRUE
)

# Color by clusters
p_clusters_4iterations <- plotEmbedding(
  ArchRProj = proj_4iterations, 
  colorBy = "cellColData", 
  name = "Clusters", 
  embedding = "UMAP", 
  size = 0.5
  )

# Color by samples
p_samples_4iterations <- plotEmbedding(
  ArchRProj = proj_4iterations, 
  colorBy = "cellColData", 
  name = "Sample", 
  embedding = "UMAP", 
  size = 0.5
  )

# Plot
ggAlignPlots(p_clusters_4iterations, p_samples_4iterations, type = "h")

# Get marker features
markersGS_4iterations <- getMarkerFeatures(
  ArchRProj = proj_4iterations, 
  useMatrix = "GeneScoreMatrix", 
  groupBy = "Clusters",
  bias = c("TSSEnrichment", "log10(nFrags)"),
  testMethod = "wilcoxon"
)

# Add impute weights
proj_4iterations <- addImputeWeights(proj_4iterations)

# These impute weights can then be passed to plotEmbedding() when plotting gene 
# scores overlayed on the UMAP embedding
magic_genes_4iterations <- plotEmbedding(
  ArchRProj = proj_4iterations, 
  colorBy = "GeneScoreMatrix", 
  name = markerGenes, 
  embedding = "UMAP",
  plotAs = "points", 
  imputeWeights = getImputeWeights(proj_4iterations),
  size = 0.5
)

# Plot all genes in cowplot
magic_genes_cow_4iterations <- lapply(magic_genes_4iterations, function(x){
  x + guides(color = FALSE, fill = FALSE) + 
    theme_ArchR(baseSize = 6.5) +
    theme(plot.margin = unit(c(0, 0, 0, 0), "cm")) +
    theme(
      axis.text.x=element_blank(), 
      axis.ticks.x=element_blank(), 
      axis.text.y=element_blank(), 
      axis.ticks.y=element_blank()
    )
})
do.call(cowplot::plot_grid, c(list(ncol = 4), magic_genes_cow))

5.5.2 Less variable features

# LSI dimensionality reduction
proj_varFeatures <- addIterativeLSI( 
  ArchRProj = proj,
  useMatrix = "TileMatrix", 
  name = "IterativeLSI", 
  iterations = 2, 
  clusterParams = list( #See Seurat::FindClusters
    resolution = c(0.2), 
    sampleCells = 10000, 
    n.start = 10
  ), 
  varFeatures = 10000, 
  dimsToUse = 1:30,
  force = TRUE
)

# Clustering
proj_varFeatures <- addClusters(
  input = proj_varFeatures,
  reducedDims = "IterativeLSI",
  method = "Seurat",
  name = "Clusters",
  resolution = 0.8, 
  force = TRUE
)

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 14700
## Number of edges: 530573
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8955
## Number of communities: 20
## Elapsed time: 1 seconds

# Visualization of clustering as UMAP
proj_varFeatures <- addUMAP(
  ArchRProj = proj_varFeatures, 
  reducedDims = "IterativeLSI", 
  name = "UMAP", 
  nNeighbors = 30, 
  minDist = 0.5, 
  metric = "cosine", 
  force = TRUE
)

# Color by clusters
p_clusters_varFeatures <- plotEmbedding(
  ArchRProj = proj_varFeatures, 
  colorBy = "cellColData", 
  name = "Clusters", 
  embedding = "UMAP", 
  size = 0.5
  )

# Color by samples
p_samples_varFeatures <- plotEmbedding(
  ArchRProj = proj_varFeatures, 
  colorBy = "cellColData", 
  name = "Sample", 
  embedding = "UMAP", 
  size = 0.5
  )

# Plot
ggAlignPlots(p_clusters_varFeatures, p_samples_varFeatures, type = "h")

# Get marker features
markersGS_varFeatures <- getMarkerFeatures(
  ArchRProj = proj_varFeatures, 
  useMatrix = "GeneScoreMatrix", 
  groupBy = "Clusters",
  bias = c("TSSEnrichment", "log10(nFrags)"),
  testMethod = "wilcoxon"
)

# Add impute weights
proj_varFeatures <- addImputeWeights(proj_varFeatures)

# These impute weights can then be passed to plotEmbedding() when plotting gene 
# scores overlayed on the UMAP embedding
magic_genes_varFeatures <- plotEmbedding(
  ArchRProj = proj_varFeatures, 
  colorBy = "GeneScoreMatrix", 
  name = markerGenes, 
  embedding = "UMAP",
  plotAs = "points", 
  imputeWeights = getImputeWeights(proj_varFeatures),
  size = 0.5
)

# Plot all genes in cowplot
magic_genes_cow_varFeatures <- lapply(magic_genes_varFeatures, function(x){
  x + guides(color = FALSE, fill = FALSE) + 
    theme_ArchR(baseSize = 6.5) +
    theme(plot.margin = unit(c(0, 0, 0, 0), "cm")) +
    theme(
      axis.text.x=element_blank(), 
      axis.ticks.x=element_blank(), 
      axis.text.y=element_blank(), 
      axis.ticks.y=element_blank()
    )
})
do.call(cowplot::plot_grid, c(list(ncol = 4), magic_genes_cow_varFeatures))

5.5.3 dimsToUse 2:30

# LSI dimensionality reduction
proj_dimsToUse <- addIterativeLSI( 
  ArchRProj = proj,
  useMatrix = "TileMatrix", 
  name = "IterativeLSI", 
  iterations = 2, 
  clusterParams = list( #See Seurat::FindClusters
    resolution = c(0.2), 
    sampleCells = 10000, 
    n.start = 10
  ), 
  varFeatures = 25000, 
  dimsToUse = 2:30,
  force = TRUE
)

# Clustering
proj_dimsToUse <- addClusters(
  input = proj_dimsToUse,
  reducedDims = "IterativeLSI",
  method = "Seurat",
  name = "Clusters",
  resolution = 0.8, 
  force = TRUE
)

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 14700
## Number of edges: 517695
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8969
## Number of communities: 20
## Elapsed time: 2 seconds

# Visualization of clustering as UMAP
proj_dimsToUse <- addUMAP(
  ArchRProj = proj_dimsToUse, 
  reducedDims = "IterativeLSI", 
  name = "UMAP", 
  nNeighbors = 30, 
  minDist = 0.5, 
  metric = "cosine", 
  force = TRUE
)

# Color by clusters
p_clusters_dimsToUse <- plotEmbedding(
  ArchRProj = proj_dimsToUse, 
  colorBy = "cellColData", 
  name = "Clusters", 
  embedding = "UMAP", 
  size = 0.5
  )

# Color by samples
p_samples_dimsToUse <- plotEmbedding(
  ArchRProj = proj_dimsToUse, 
  colorBy = "cellColData", 
  name = "Sample", 
  embedding = "UMAP", 
  size = 0.5
  )

# Plot
ggAlignPlots(p_clusters_dimsToUse, p_samples_dimsToUse, type = "h")

# Get marker features
markersGS_dimsToUse <- getMarkerFeatures(
  ArchRProj = proj_dimsToUse, 
  useMatrix = "GeneScoreMatrix", 
  groupBy = "Clusters",
  bias = c("TSSEnrichment", "log10(nFrags)"),
  testMethod = "wilcoxon"
)

# Add impute weights
proj_dimsToUse <- addImputeWeights(proj_dimsToUse)

# These impute weights can then be passed to plotEmbedding() when plotting gene 
# scores overlayed on the UMAP embedding
magic_genes_dimsToUse <- plotEmbedding(
  ArchRProj = proj_dimsToUse, 
  colorBy = "GeneScoreMatrix", 
  name = markerGenes, 
  embedding = "UMAP",
  plotAs = "points", 
  imputeWeights = getImputeWeights(proj_dimsToUse),
  size = 0.5
)

# Plot all genes in cowplot
magic_genes_cow_dimsToUse <- lapply(magic_genes, function(x){
  x + guides(color = FALSE, fill = FALSE) + 
    theme_ArchR(baseSize = 6.5) +
    theme(plot.margin = unit(c(0, 0, 0, 0), "cm")) +
    theme(
      axis.text.x=element_blank(), 
      axis.ticks.x=element_blank(), 
      axis.text.y=element_blank(), 
      axis.ticks.y=element_blank()
    )
})
do.call(cowplot::plot_grid, c(list(ncol = 4), magic_genes_cow_dimsToUse))

# Visualisation of clustering as UMAP
proj <- addUMAP(
  ArchRProj = proj, 
  reducedDims = "IterativeLSI", 
  name = "UMAP", 
  nNeighbors = 30, 
  minDist = 0.5, 
  metric = "cosine", 
  force = TRUE
)

# Color by clusters
p_clusters <- plotEmbedding(
  ArchRProj = proj, 
  colorBy = "cellColData", 
  name = "Clusters", 
  embedding = "UMAP", 
  size = 0.5
  )

# Color by samples
p_samples <- plotEmbedding(
  ArchRProj = proj, 
  colorBy = "cellColData", 
  name = "Sample", 
  embedding = "UMAP", 
  size = 0.5
  )

# Plot
ggAlignPlots(p_clusters, p_samples, type = "h")

6 Filter doublets

6.1 Calculate doublet scores

# Calculate doublet scores on the ArchRProject
proj <- addDoubletScores(
  input = proj,
  k = 10, #Refers to how many cells near a "pseudo-doublet" to count
  knnMethod = "UMAP", #Refers to the embedding to use for nearest neighbor search
  LSIMethod = 1, 
  force = TRUE
)

## ArchR logging to : ArchRLogs/ArchR-addDoubletScores-12ad572f0853e-Date-2023-04-28_Time-16-37-56.log
## If there is an issue, please report to github with logFile!

## 2023-04-28 16:37:56 : Batch Execution w/ safelapply!, 0 mins elapsed.

## 2023-04-28 16:37:56 : scATAC_4 (1 of 5) :  Computing Doublet Statistics, 0 mins elapsed.

## scATAC_4 (1 of 5) : UMAP Projection R^2 = 0.99876

## scATAC_4 (1 of 5) : UMAP Projection R^2 = 0.99876

## 2023-04-28 16:42:25 : scATAC_1 (2 of 5) :  Computing Doublet Statistics, 4.482 mins elapsed.

## scATAC_1 (2 of 5) : UMAP Projection R^2 = 0.98144

## scATAC_1 (2 of 5) : UMAP Projection R^2 = 0.98144

## 2023-04-28 16:46:09 : scATAC_9 (3 of 5) :  Computing Doublet Statistics, 8.209 mins elapsed.

## Filtering 1 dims correlated > 0.75 to log10(depth + 1)

## scATAC_9 (3 of 5) : UMAP Projection R^2 = 0.99609

## scATAC_9 (3 of 5) : UMAP Projection R^2 = 0.99609

## 2023-04-28 16:48:25 : scATAC_8 (4 of 5) :  Computing Doublet Statistics, 10.476 mins elapsed.

## scATAC_8 (4 of 5) : UMAP Projection R^2 = 0.99388

## scATAC_8 (4 of 5) : UMAP Projection R^2 = 0.99388

## 2023-04-28 16:50:38 : scATAC_5 (5 of 5) :  Computing Doublet Statistics, 12.693 mins elapsed.

## scATAC_5 (5 of 5) : UMAP Projection R^2 = 0.99447

## scATAC_5 (5 of 5) : UMAP Projection R^2 = 0.99447

## ArchR logging successful to : ArchRLogs/ArchR-addDoubletScores-12ad572f0853e-Date-2023-04-28_Time-16-37-56.log

## class: ArchRProject 
## outputDirectory: /omics/groups/OE0436/internal/Regensburg/Mainz/03_archr/scripts/methods/paper/ArchRProject 
## samples(5): scATAC_4 scATAC_1 scATAC_9 scATAC_8 scATAC_5
## sampleColData names(1): ArrowFiles
## cellColData names(16): Sample TSSEnrichment ... DoubletScore
##   DoubletEnrichment
## numberOfCells(1): 14700
## medianTSS(1): 20.8665
## medianFrags(1): 29405.5

6.2 Test different doublet filter ratios

In the next step, we evaluate different filter ratios. The filter ratio determines how many cells will be filtered out as doublets. Doublets scores were calculated above.

6.2.1 FilterRatio = 0.5

# Create temporary project and filter doublets 
proj_tmp <- filterDoublets(
  proj,
  filterRatio = 0.5
  )

## Filtering 265 cells from ArchRProject!

##  scATAC_1 : 88 of 4199 (2.1%)

##  scATAC_4 : 124 of 4988 (2.5%)

##  scATAC_5 : 6 of 1159 (0.5%)

##  scATAC_8 : 20 of 2016 (1%)

##  scATAC_9 : 27 of 2338 (1.2%)

# Visualize (and compare to non-filtered sample)
# LSI dimensional reduction:
proj_tmp <- addIterativeLSI(
  ArchRProj = proj_tmp,
  useMatrix = "TileMatrix",
  name = "IterativeLSI",
  iterations = 2, 
  clusterParams = list( #See Seurat::FindClusters
    resolution = c(0.2), 
    sampleCells = 10000, 
    n.start = 10 
  ),
  varFeatures = 25000, 
  dimsToUse = 1:30,
  force = TRUE
)

# Clustering
proj_tmp <- addClusters(
  input = proj_tmp,
  reducedDims = "IterativeLSI",
  method = "Seurat",
  name = "Clusters",
  resolution = 0.8, 
  force = TRUE
)

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 14435
## Number of edges: 506353
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8952
## Number of communities: 20
## Elapsed time: 2 seconds

# Visualisation as UMAP
proj_tmp <- addUMAP(
  ArchRProj = proj_tmp, 
  reducedDims = "IterativeLSI", 
  name = "UMAP", 
  nNeighbors = 30, 
  minDist = 0.5, 
  metric = "cosine", 
  force = TRUE
)

# Color by clusters
p_clusters_tmp <- plotEmbedding(
  ArchRProj = proj_tmp, 
  colorBy = "cellColData", 
  name = "Clusters", 
  embedding = "UMAP", 
  size = 0.5
  )

# Color by samples
p_samples_tmp <- plotEmbedding(
  ArchRProj = proj_tmp, 
  colorBy = "cellColData", 
  name = "Sample", 
  embedding = "UMAP", 
  size = 0.5
)

# Plot
ggAlignPlots(p_clusters_tmp, p_samples_tmp, type = "h")

# Color by doublet enrichment score
p_doubscore_tmp <- plotEmbedding(
  ArchRProj = proj_tmp, 
  colorBy = "cellColData", 
  name = "DoubletScore", 
  embedding = "UMAP", 
  plotAs = "points", 
  size = 0.5
)

p_doubenr_tmp <- plotEmbedding(
  ArchRProj = proj_tmp, 
  colorBy = "cellColData", 
  name = "DoubletEnrichment", 
  embedding = "UMAP", 
  plotAs = "points", 
  size = 0.5
)

# Color by number of fragments per cell
p_nfrags_tmp <- plotEmbedding(
  ArchRProj = proj_tmp, 
  colorBy = "cellColData", 
  name = "nFrags", 
  embedding = "UMAP", 
  plotAs = "points", 
  size = 0.5
)

# Plot
plot_grid(p_doubscore_tmp, p_doubenr_tmp, p_nfrags_tmp, align = "h", ncol = 3, scale = 1.1)

# Check whether the biology makes sense
# Overlay gene scores on UMAP embedding of proj_tmp, use MAGIC smoothing

# Get marker features
markersGS_tmp <- getMarkerFeatures(
  ArchRProj = proj_tmp, 
  useMatrix = "GeneScoreMatrix", 
  groupBy = "Clusters",
  bias = c("TSSEnrichment", "log10(nFrags)"),
  testMethod = "wilcoxon"
)

# Add impute weights
proj_tmp <- addImputeWeights(proj_tmp)

# Plot
magic_genes_tmp <- plotEmbedding(
  ArchRProj = proj_tmp, 
  colorBy = "GeneScoreMatrix", 
  name = markerGenes, 
  embedding = "UMAP",
  plotAs = "points", 
  imputeWeights = getImputeWeights(proj_tmp),
  size = 0.5
)

# Plot all genes in cowplot
magic_genes_cow_tmp <- lapply(magic_genes_tmp, function(x){
  x + guides(color = FALSE, fill = FALSE) + 
    theme_ArchR(baseSize = 6.5) +
    theme(plot.margin = unit(c(0, 0, 0, 0), "cm")) +
    theme(
      axis.text.x=element_blank(), 
      axis.ticks.x=element_blank(), 
      axis.text.y=element_blank(), 
      axis.ticks.y=element_blank()
    )
})
do.call(cowplot::plot_grid, c(list(ncol = 4), magic_genes_cow_tmp))

# Sometimes it can also help to look at the gene scores visualized as violin
# plots for the single clusters
p_Cd4 <- plotGroups(
  ArchRProj = proj_tmp, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Cd4",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Cd4

p_Il2ra <- plotGroups(
  ArchRProj = proj_tmp, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Il2ra",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Il2ra

p_Foxp3 <- plotGroups(
  ArchRProj = proj_tmp, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Foxp3",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Foxp3

p_Batf <- plotGroups(
  ArchRProj = proj_tmp, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Batf",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Batf

p_Ccr8 <- plotGroups(
  ArchRProj = proj_tmp, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Ccr8",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Ccr8

p_Ikzf2 <- plotGroups(
  ArchRProj = proj_tmp, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Ikzf2",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Ikzf2

p_Rorc <- plotGroups(
  ArchRProj = proj_tmp, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Rorc",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Rorc

p_Tbx21 <- plotGroups(
  ArchRProj = proj_tmp, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Tbx21",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Tbx21

p_Ifng <- plotGroups(
  ArchRProj = proj_tmp, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Ifng",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Ifng

p_Gata3 <- plotGroups(
  ArchRProj = proj_tmp, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Gata3",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Gata3

p_Klrg1 <- plotGroups(
  ArchRProj = proj_tmp, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Klrg1",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Klrg1

p_Il2 <- plotGroups(
  ArchRProj = proj_tmp, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Il2",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Il2

# Cell numbers per cluster
cM_proj_cellsPass_tmp <- confusionMatrix(paste0(proj_tmp$Clusters), paste0(proj_tmp$Sample))
cM_proj_cellsPass_tmp

## 20 x 5 sparse Matrix of class "dgCMatrix"
##     scATAC_1 scATAC_4 scATAC_5 scATAC_8 scATAC_9
## C3       747        1        5        2        .
## C4      2798        .       53        .        .
## C1       181        6       40     1477        3
## C14       31        5       22      479        8
## C12      176       40      546        2       14
## C7        39        .        3        2        1
## C17       10        2      117       32        4
## C9         1        1        6        .      902
## C5       125       40      231        .       10
## C10        1        5       14        .      470
## C6         2     1290       18        .        .
## C11        .      458        6        .        1
## C15        .      776       25        .        9
## C16        .      511        6        .        .
## C13        .     1379        6        .        2
## C19        .      189       10        .       11
## C20        .       21        7        .       30
## C8         .      111       32        .        2
## C2         .       27        5        .        1
## C18        .        2        1        2      843

# Save
write.csv(as.data.frame(cM_proj_cellsPass_tmp), "./ArchRProject/Plots/cells_per_cluster_filterratio0-5.csv", row.names=FALSE)

6.2.2 FilterRatio = 1

# Create temporary project and filter doublets
proj_tmp <- filterDoublets(
  proj,
  filterRatio = 1 
  )

## Filtering 531 cells from ArchRProject!

##  scATAC_1 : 176 of 4199 (4.2%)

##  scATAC_4 : 248 of 4988 (5%)

##  scATAC_5 : 13 of 1159 (1.1%)

##  scATAC_8 : 40 of 2016 (2%)

##  scATAC_9 : 54 of 2338 (2.3%)

# Visualize (and compare to non-filtered sample):
# LSI dimensional reduction:
proj_tmp <- addIterativeLSI(
  ArchRProj = proj_tmp,
  useMatrix = "TileMatrix",
  name = "IterativeLSI",
  iterations = 2, 
  clusterParams = list( #See Seurat::FindClusters
    resolution = c(0.2), 
    sampleCells = 10000, 
    n.start = 10 
  ),
  varFeatures = 25000, 
  dimsToUse = 1:30,
  force = TRUE
)

# Clustering
proj_tmp <- addClusters(
  input = proj_tmp,
  reducedDims = "IterativeLSI",
  method = "Seurat",
  name = "Clusters",
  resolution = 0.8, 
  force = TRUE
)

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 14169
## Number of edges: 497811
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8958
## Number of communities: 20
## Elapsed time: 1 seconds

# Visualisation as UMAP
proj_tmp <- addUMAP(
  ArchRProj = proj_tmp, 
  reducedDims = "IterativeLSI", 
  name = "UMAP", 
  nNeighbors = 30, 
  minDist = 0.5, 
  metric = "cosine", 
  force = TRUE
)

# Color by clusters
p_clusters_tmp <- plotEmbedding(
  ArchRProj = proj_tmp, 
  colorBy = "cellColData", 
  name = "Clusters", 
  embedding = "UMAP", 
  size = 0.5
  )

# Color by samples
p_samples_tmp <- plotEmbedding(
  ArchRProj = proj_tmp, 
  colorBy = "cellColData", 
  name = "Sample", 
  embedding = "UMAP", 
  size = 0.5
)

# Plot
ggAlignPlots(p_clusters_tmp, p_samples_tmp, type = "h")

# Color by doublet enrichment score
p_doubscore_tmp <- plotEmbedding(
  ArchRProj = proj_tmp, 
  colorBy = "cellColData", 
  name = "DoubletScore", 
  embedding = "UMAP", 
  plotAs = "points", 
  size = 0.5
)

p_doubenr_tmp <- plotEmbedding(
  ArchRProj = proj_tmp, 
  colorBy = "cellColData", 
  name = "DoubletEnrichment", 
  embedding = "UMAP", 
  plotAs = "points", 
  size = 0.5
)

# Color by number of fragments per cell
p_nfrags_tmp <- plotEmbedding(
  ArchRProj = proj_tmp, 
  colorBy = "cellColData", 
  name = "nFrags", 
  embedding = "UMAP", 
  plotAs = "points", 
  size = 0.5
)

# Plot
plot_grid(p_doubscore_tmp, p_doubenr_tmp, p_nfrags_tmp, align = "h", ncol = 3, scale = 1.1)

# Get marker features
markersGS_tmp <- getMarkerFeatures(
  ArchRProj = proj_tmp, 
  useMatrix = "GeneScoreMatrix", 
  groupBy = "Clusters",
  bias = c("TSSEnrichment", "log10(nFrags)"),
  testMethod = "wilcoxon"
)

# Add impute weights
proj_tmp <- addImputeWeights(proj_tmp)

# Plot
magic_genes_tmp <- plotEmbedding(
  ArchRProj = proj_tmp, 
  colorBy = "GeneScoreMatrix", 
  name = markerGenes, 
  embedding = "UMAP", 
  plotAs = "points", 
  imputeWeights = getImputeWeights(proj_tmp),
  size = 0.5
)

# Plot all genes in cowplot
magic_genes_cow_tmp <- lapply(magic_genes_tmp, function(x){
  x + guides(color = FALSE, fill = FALSE) + 
    theme_ArchR(baseSize = 6.5) +
    theme(plot.margin = unit(c(0, 0, 0, 0), "cm")) +
    theme(
      axis.text.x=element_blank(), 
      axis.ticks.x=element_blank(), 
      axis.text.y=element_blank(), 
      axis.ticks.y=element_blank()
    )
})
do.call(cowplot::plot_grid, c(list(ncol = 4), magic_genes_cow_tmp))

# Sometimes it can also help to look at the gene scores visualized as violin
# plots for the single clusters
p_Cd4 <- plotGroups(
  ArchRProj = proj_tmp, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Cd4",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Cd4

p_Il2ra <- plotGroups(
  ArchRProj = proj_tmp, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Il2ra",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Il2ra

p_Foxp3 <- plotGroups(
  ArchRProj = proj_tmp, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Foxp3",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Foxp3

p_Batf <- plotGroups(
  ArchRProj = proj_tmp, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Batf",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Batf

p_Ccr8 <- plotGroups(
  ArchRProj = proj_tmp, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Ccr8",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Ccr8

p_Ikzf2 <- plotGroups(
  ArchRProj = proj_tmp, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Ikzf2",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Ikzf2

p_Rorc <- plotGroups(
  ArchRProj = proj_tmp, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Rorc",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Rorc

p_Tbx21 <- plotGroups(
  ArchRProj = proj_tmp, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Tbx21",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Tbx21

p_Ifng <- plotGroups(
  ArchRProj = proj_tmp, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Ifng",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Ifng

p_Gata3 <- plotGroups(
  ArchRProj = proj_tmp, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Gata3",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Gata3

p_Klrg1 <- plotGroups(
  ArchRProj = proj_tmp, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Klrg1",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Klrg1

p_Il2 <- plotGroups(
  ArchRProj = proj_tmp, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Il2",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Il2

# Cell numbers per cluster
cM_proj_cellsPass_tmp <- confusionMatrix(paste0(proj_tmp$Clusters), paste0(proj_tmp$Sample))
cM_proj_cellsPass_tmp

## 20 x 5 sparse Matrix of class "dgCMatrix"
##     scATAC_1 scATAC_4 scATAC_5 scATAC_8 scATAC_9
## C2       706        4        4        1        .
## C3      2651        .       49        .        .
## C10      179        7       44     1449        4
## C8        31        4       21      487        7
## C16      157       39      539        2       17
## C1        39        1        5        2        1
## C14       10        6       31       33      824
## C19        1        2        5        .      986
## C4       245       44      234        .       10
## C20        1        6       13        .      371
## C9         1      348        6        .        1
## C5         2     1284       18        .        .
## C12        .      800       23        .       10
## C17        .     1358        6        .        3
## C13        .      496        5        .        .
## C6         .      184        9        .       14
## C7         .       21        7        .       30
## C18        .      109       33        .        3
## C11        .       25        5        .        1
## C15        .        2       89        2        2

# Save
write.csv(as.data.frame(cM_proj_cellsPass_tmp), "./ArchRProject/Plots/cells_per_cluster_filterratio1.csv", row.names=FALSE)

6.2.3 FilterRatio = 2

# Create temporary project and filter doublets
proj_tmp <- filterDoublets(
  proj,
  filterRatio = 2 
  )

## Filtering 1065 cells from ArchRProject!

##  scATAC_1 : 352 of 4199 (8.4%)

##  scATAC_4 : 497 of 4988 (10%)

##  scATAC_5 : 26 of 1159 (2.2%)

##  scATAC_8 : 81 of 2016 (4%)

##  scATAC_9 : 109 of 2338 (4.7%)

# Visualize (and compare to non-filtered sample)
# LSI dimensional reduction
proj_tmp <- addIterativeLSI(
  ArchRProj = proj_tmp,
  useMatrix = "TileMatrix",
  name = "IterativeLSI",
  iterations = 2, 
  clusterParams = list( #See Seurat::FindClusters
    resolution = c(0.2), 
    sampleCells = 10000, 
    n.start = 10 
  ),
  varFeatures = 25000, 
  dimsToUse = 1:30,
  force = TRUE
)

# Clustering
proj_tmp <- addClusters(
  input = proj_tmp,
  reducedDims = "IterativeLSI",
  method = "Seurat",
  name = "Clusters",
  resolution = 0.8, 
  force = TRUE
)

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 13635
## Number of edges: 477400
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8949
## Number of communities: 21
## Elapsed time: 1 seconds

# Visualisation as UMAP
proj_tmp <- addUMAP(
  ArchRProj = proj_tmp, 
  reducedDims = "IterativeLSI", 
  name = "UMAP", 
  nNeighbors = 30, 
  minDist = 0.5, 
  metric = "cosine", 
  force = TRUE
)

# Color by clusters
p_clusters_tmp <- plotEmbedding(
  ArchRProj = proj_tmp, 
  colorBy = "cellColData", 
  name = "Clusters", 
  embedding = "UMAP", 
  size = 0.5
  )

# Color by samples
p_samples_tmp <- plotEmbedding(
  ArchRProj = proj_tmp, 
  colorBy = "cellColData", 
  name = "Sample", 
  embedding = "UMAP", 
  size = 0.5
)

# Plot
ggAlignPlots(p_clusters_tmp, p_samples_tmp, type = "h")

# Color by doublet enrichment score
p_doubscore_tmp <- plotEmbedding(
  ArchRProj = proj_tmp, 
  colorBy = "cellColData", 
  name = "DoubletScore", 
  embedding = "UMAP", 
  plotAs = "points", 
  size = 0.5
)

p_doubenr_tmp <- plotEmbedding(
  ArchRProj = proj_tmp, 
  colorBy = "cellColData", 
  name = "DoubletEnrichment", 
  embedding = "UMAP", 
  plotAs = "points", 
  size = 0.5
)

# Color by number of fragments per cell
p_nfrags_tmp <- plotEmbedding(
  ArchRProj = proj_tmp, 
  colorBy = "cellColData", 
  name = "nFrags", 
  embedding = "UMAP", 
  plotAs = "points", 
  size = 0.5
)

# Plot
plot_grid(p_doubscore_tmp, p_doubenr_tmp, p_nfrags_tmp, align = "h", ncol = 3, scale = 1.1)

# Get marker features
markersGS_tmp <- getMarkerFeatures(
  ArchRProj = proj_tmp, 
  useMatrix = "GeneScoreMatrix", 
  groupBy = "Clusters",
  bias = c("TSSEnrichment", "log10(nFrags)"),
  testMethod = "wilcoxon"
)

# Add impute weights
proj_tmp <- addImputeWeights(proj_tmp)

# Plot
magic_genes_tmp <- plotEmbedding(
  ArchRProj = proj_tmp, 
  colorBy = "GeneScoreMatrix", 
  name = markerGenes, 
  embedding = "UMAP",
  plotAs = "points", 
  imputeWeights = getImputeWeights(proj_tmp),
  size = 0.5
)

# Plot all genes in cowplot
magic_genes_cow_tmp <- lapply(magic_genes_tmp, function(x){
  x + guides(color = FALSE, fill = FALSE) + 
    theme_ArchR(baseSize = 6.5) +
    theme(plot.margin = unit(c(0, 0, 0, 0), "cm")) +
    theme(
      axis.text.x=element_blank(), 
      axis.ticks.x=element_blank(), 
      axis.text.y=element_blank(), 
      axis.ticks.y=element_blank()
    )
})
do.call(cowplot::plot_grid, c(list(ncol = 4), magic_genes_cow_tmp))

# Sometimes it can also help to look at the gene scores visualized as violin
# plots for the single clusters
p_Cd4 <- plotGroups(
  ArchRProj = proj_tmp, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Cd4",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Cd4

p_Il2ra <- plotGroups(
  ArchRProj = proj_tmp, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Il2ra",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Il2ra

p_Foxp3 <- plotGroups(
  ArchRProj = proj_tmp, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Foxp3",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Foxp3

p_Batf <- plotGroups(
  ArchRProj = proj_tmp, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Batf",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Batf

p_Ccr8 <- plotGroups(
  ArchRProj = proj_tmp, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Ccr8",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Ccr8

p_Ikzf2 <- plotGroups(
  ArchRProj = proj_tmp, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Ikzf2",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Ikzf2

p_Rorc <- plotGroups(
  ArchRProj = proj_tmp, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Rorc",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Rorc

p_Tbx21 <- plotGroups(
  ArchRProj = proj_tmp, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Tbx21",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Tbx21

p_Ifng <- plotGroups(
  ArchRProj = proj_tmp, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Ifng",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Ifng

p_Gata3 <- plotGroups(
  ArchRProj = proj_tmp, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Gata3",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Gata3

p_Klrg1 <- plotGroups(
  ArchRProj = proj_tmp, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Klrg1",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Klrg1

p_Il2 <- plotGroups(
  ArchRProj = proj_tmp, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Il2",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Il2

# Cell numbers per cluster
cM_proj_cellsPass_tmp <- confusionMatrix(paste0(proj_tmp$Clusters), paste0(proj_tmp$Sample))
cM_proj_cellsPass_tmp

## 21 x 5 sparse Matrix of class "dgCMatrix"
##     scATAC_1 scATAC_4 scATAC_5 scATAC_8 scATAC_9
## C15      472        .        2        1        .
## C16     2710        .       44        .        .
## C20        9        .        1      205        .
## C12        2      205        8        1        .
## C13      169       25      533        2       17
## C21      169        6       41     1216        3
## C11       44        .        4        2        1
## C5        10        7      113       33       12
## C2        29        4       20      474        4
## C9         1        1       10        .      984
## C17      230       40      241        .       10
## C10        1        6       16        .      356
## C18        1     1298       18        .        1
## C6         .      686       26        .        7
## C14        .     1387        4        .        2
## C1         .       23        7        .       30
## C3         .      502        5        .        1
## C7         .      186        7        .       11
## C8         .       87       27        .        1
## C19        .       24        3        .        1
## C4         .        4        3        1      788

# Save
write.csv(as.data.frame(cM_proj_cellsPass_tmp), "./ArchRProject/Plots/cells_per_cluster_filterratio2.csv", row.names=FALSE)

6.3 Filtering doublets

# Create new ArchRProject and filter doublets with the filter ratio chosen based
# on the test runs above
proj <- filterDoublets(
  proj,
  filterRatio = 0.5, 
  )

## Filtering 265 cells from ArchRProject!

##  scATAC_1 : 88 of 4199 (2.1%)

##  scATAC_4 : 124 of 4988 (2.5%)

##  scATAC_5 : 6 of 1159 (0.5%)

##  scATAC_8 : 20 of 2016 (1%)

##  scATAC_9 : 27 of 2338 (1.2%)

# Load new ArchRProject
proj <- loadArchRProject(
  path = "ArchRProject", 
  force = TRUE, 
  showLogo = FALSE
  )

# Get an overview of the project
proj

## class: ArchRProject 
## outputDirectory: /omics/groups/OE0436/internal/Regensburg/Mainz/03_archr/scripts/methods/paper/ArchRProject 
## samples(5): scATAC_4 scATAC_1 scATAC_9 scATAC_8 scATAC_5
## sampleColData names(1): ArrowFiles
## cellColData names(16): Sample TSSEnrichment ... DoubletScore
##   DoubletEnrichment
## numberOfCells(1): 14435
## medianTSS(1): 20.872
## medianFrags(1): 29176

7 Test batch effect correction using HARMONY

One possible analysis step is batch effect correction using HARMONY. Usually, this should help reduce the batch effect in the data while maintaining biological effects.

In our specific case, the batch effect correction did not improve the data, which is why we omitted this step and only show it for demonstration purposes.

# https://github.com/immunogenomics/harmony

# Create a new ArchRProject with a reducedDims object named “Harmony”
proj_harmonyTest <- addHarmony(
  ArchRProj = proj,
  reducedDims = "IterativeLSI",
  name = "Harmony",
  groupBy = "Sample",
  sigma = 0.1
)

# UMAP for Harmony reductedDims object
proj_harmonyTest <- addUMAP(
  ArchRProj = proj_harmonyTest, 
  reducedDims = "Harmony", 
  name = "UMAPHarmony", 
  nNeighbors = 30, 
  minDist = 0.5, 
  metric = "cosine",
  force = TRUE
)

# Color by clusters
p_clusters_Harmony <- plotEmbedding(
  ArchRProj = proj_harmonyTest, 
  colorBy = "cellColData", 
  name = "Clusters", 
  embedding = "UMAPHarmony", 
  size = 0.5
)

# Color by samples
p_samples_Harmony <- plotEmbedding(
  ArchRProj = proj_harmonyTest, 
  colorBy = "cellColData", 
  name = "Sample", 
  embedding = "UMAPHarmony", 
  size = 0.5
)

# Plot
ggAlignPlots(p_clusters_Harmony, p_samples_Harmony, type = "h")

# Color by doublet enrichment score
p_doubscore_Harmony <- plotEmbedding(
  ArchRProj = proj_harmonyTest, 
  colorBy = "cellColData", 
  name = "DoubletScore", 
  embedding = "UMAPHarmony", 
  plotAs = "points", 
  size = 0.5
)

p_doubenr_Harmony <- plotEmbedding(
  ArchRProj = proj_harmonyTest, 
  colorBy = "cellColData", 
  name = "DoubletEnrichment", 
  embedding = "UMAPHarmony", 
  plotAs = "points", 
  size = 0.5
)

# Color by number of fragments per cell
p_nfrags_Harmony <- plotEmbedding(
  ArchRProj = proj_harmonyTest, 
  colorBy = "cellColData", 
  name = "nFrags", 
  embedding = "UMAPHarmony", 
  plotAs = "points", 
  size = 0.5
)

# Plot
plot_grid(p_doubscore_Harmony, p_doubenr_Harmony, p_nfrags_Harmony, align = "h", ncol = 3, scale = 1.1)

# Add impute weights
proj_harmonyTest <- addImputeWeights(proj_harmonyTest)

# overlay gene scores on UMAPHarmony embedding with MAGIC smoothing
magic_genes_harmony <- plotEmbedding(
  ArchRProj = proj_harmonyTest, 
  colorBy = "GeneScoreMatrix", 
  name = markerGenes, 
  embedding = "UMAPHarmony",
  plotAs = "points", 
  imputeWeights = getImputeWeights(proj_harmonyTest),
  size = 0.5
)

# Plot all genes in cowplot
magic_genes_harmony_cow <- lapply(magic_genes_harmony, function(x){
  x + guides(color = FALSE, fill = FALSE) + 
    theme_ArchR(baseSize = 6.5) +
    theme(plot.margin = unit(c(0, 0, 0, 0), "cm")) +
    theme(
      axis.text.x=element_blank(), 
      axis.ticks.x=element_blank(), 
      axis.text.y=element_blank(), 
      axis.ticks.y=element_blank()
    )
})
do.call(cowplot::plot_grid, c(list(ncol = 4), magic_genes_harmony_cow))

8 Final filtered Dataset

# Clustering
proj <- addClusters(
  input = proj,
  reducedDims = "IterativeLSI",
  method = "Seurat",
  name = "Clusters",
  resolution = 0.9, 
  force = TRUE
)

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 14435
## Number of edges: 507495
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8886
## Number of communities: 21
## Elapsed time: 1 seconds

# Visualization in UMAP embedding
proj <- addUMAP(
  ArchRProj = proj, 
  reducedDims = "IterativeLSI", 
  name = "UMAP", 
  nNeighbors = 30, 
  minDist = 0.5, 
  metric = "cosine", 
  force = TRUE
)

# Color by clusters
p_clusters_final <- plotEmbedding(
  ArchRProj = proj, 
  colorBy = "cellColData", 
  name = "Clusters", 
  embedding = "UMAP", 
  size = 0.5
  )

# Color by samples
p_samples_final <- plotEmbedding(
  ArchRProj = proj, 
  colorBy = "cellColData", 
  name = "Sample", 
  embedding = "UMAP", 
  size = 0.5
  )

# Plot
ggAlignPlots(p_clusters_final, p_samples_final, type = "h")

# Color by doublet enrichment score
p_doubscore_final <- plotEmbedding(
  ArchRProj = proj, 
  colorBy = "cellColData", 
  name = "DoubletScore", 
  embedding = "UMAP", 
  plotAs = "points", 
  size = 0.5
)

p_doubenr_final <- plotEmbedding(
  ArchRProj = proj, 
  colorBy = "cellColData", 
  name = "DoubletEnrichment", 
  embedding = "UMAP", 
  plotAs = "points", 
  size = 0.5
)

# Color by number of fragments per cell
p_nfrags_final <- plotEmbedding(
  ArchRProj = proj, 
  colorBy = "cellColData", 
  name = "nFrags", 
  embedding = "UMAP", 
  plotAs = "points", 
  size = 0.5
)

# Plot
plot_grid(p_doubscore_final, p_doubenr_final, p_nfrags_final, align = "h", ncol = 3, scale = 1.1)

# Overlay gene scores on UMAP embedding of proj, use MAGIC smoothing
# Get marker features
markersGS_final <- getMarkerFeatures(
  ArchRProj = proj,
  useMatrix = "GeneScoreMatrix",
  groupBy = "Clusters",
  bias = c("TSSEnrichment", "log10(nFrags)"),
  testMethod = "wilcoxon"
)

# Add impute weights
proj <- addImputeWeights(proj)

# Plot
magic_genes_final <- plotEmbedding(
  ArchRProj = proj, 
  colorBy = "GeneScoreMatrix", 
  name = markerGenes, 
  embedding = "UMAP",
  plotAs = "points", 
  imputeWeights = getImputeWeights(proj),
  size = 0.5
)

# Plot all genes in cowplot
magic_genes_cow_final <- lapply(magic_genes_final, function(x){
  x + guides(color = FALSE, fill = FALSE) + 
    theme_ArchR(baseSize = 6.5) +
    theme(plot.margin = unit(c(0, 0, 0, 0), "cm")) +
    theme(
      axis.text.x=element_blank(), 
      axis.ticks.x=element_blank(), 
      axis.text.y=element_blank(), 
      axis.ticks.y=element_blank()
    )
})
do.call(cowplot::plot_grid, c(list(ncol = 4), magic_genes_cow_final))

# These are my clusters with cell numbers
cM_doubfilter <- confusionMatrix(paste0(proj$Clusters), paste0(proj$Sample))
cM_doubfilter

## 21 x 5 sparse Matrix of class "dgCMatrix"
##     scATAC_1 scATAC_4 scATAC_5 scATAC_8 scATAC_9
## C4       482        .        .        .        .
## C5       282        3        8        4        1
## C2      2707        .       53        .        .
## C18        8        .        .      291        .
## C16       32        4       19      475        8
## C14      174       40      548        2       13
## C19      170        9       45     1191        3
## C8        10        4      118       30        6
## C11        1        1        6        .      892
## C3       244       43      228        .       10
## C12        1        7       15        .      472
## C13        .      461        6        .        1
## C6         .      777       22        .        8
## C1         .     1276       20        .        .
## C7         .      513        6        .        .
## C15        .     1377        3        .        2
## C20        .       23        7        .       30
## C21        .      188       13        .       22
## C10        .      104       30        .        2
## C17        .       30        5        .        1
## C9         .        4        1        3      840

# Visualize confusion matrix
cM_doubfilter <- cM_doubfilter / Matrix::rowSums(cM_doubfilter)
p_cM_doubfilter <- pheatmap::pheatmap(
  mat = as.matrix(cM_doubfilter), 
  color = paletteContinuous("whiteBlue"), 
  border_color = "black"
)
p_cM_doubfilter

Using single-cell chromatin accessibility sequencing to characterize CD4+ T cells from murine tissues

Kathrin Luise Braband, Annekathrin Ludt, Sara Salome Helbich, Malte Simon, Niklas Beumer, Benedikt Brors, Federico Marini and Michael Delacher

2023-04-28