Using single-cell chromatin accessibility sequencing to characterize CD4+ T cells from murine tissues
Kathrin Luise Braband, Annekathrin Ludt, Sara Salome Helbich, Malte Simon, Niklas Beumer, Benedikt Brors, Federico Marini and Michael Delacher
# Set seeds and initial parameters for ArchR
set.seed(1)
# Number of cores to use (adapt to your machine, but has to be > 1 otherwise plotTSSenrichment does not work properly)
addArchRThreads(threads = 8)
## Setting default number of Parallel threads to 8.
# Reference genome (adapt to your data, use the same reference that was used for the alignment)
addArchRGenome("mm10")
## Setting default genome to Mm10.
We are analyzing CD4+ T cells from healthy murine tissue (spleen,
colon, fat and skin). For spleen we have an additional sample of CD25+
regulatory T cells. Overall we have 5 samples in this analysis, which
were previously aligned using Cell Ranger ATAC (v2.0.0) count.
scATAC_1 spleen CD4+ T cells
scATAC_8 spleen CD25+ T cells
scATAC_4 colon CD4+ T cells
scATAC_5 fat CD4+ T cells
scATAC_9
skin CD4+ T cells
From the fragments.tsv files output by Cell Ranger ATAC count, generate ArrowFiles for subsequent analysis with ArchR.
# Read in fragments files
inputFiles = vector()
n = 1
for(X in c(1,4,5,8,9)) {
inputFiles[n] = paste0("data/MD_scATAC_",X,"/fragments.tsv.gz")
n = n+1
}
# Name input files
names(inputFiles) = vector()
n = 1
for(X in c(1,4,5,8,9)) {
names(inputFiles)[n] = paste0("scATAC_",X,"")
n = n+1
}
# Create arrow files
# Evaluate different thresholds depending on your data:
# - minTSS: Start with 0 to see all cells, afterwards evaluate which threshold
# works for all of the samples
# - minFrags: Recommended to set >= 1000, otherwise the analysis might not be
# robust enough
# Check different parameters to set with ?createArrowFiles
ArrowFiles <- createArrowFiles(
inputFiles = inputFiles,
sampleNames = names(inputFiles),
minTSS = 0,
minFrags = 1000,
addTileMat = TRUE,
addGeneScoreMat = TRUE,
force = TRUE
)
After you created the ArrowFiles, check the folder “QualityControl” for each of the samples to evaluate which minTSS threshold to use for the analysis. In this analysis, we decided on minTSS = 10. We will filter the data accordingly later on.
Now combine the individual ArrowFiles into an ArchRProject.
# Define arrow files
ArrowFiles = vector()
n = 1
for(X in c(1,4,5,8,9)) {
ArrowFiles[n] = paste0("scATAC_",X,".arrow")
n = n+1
}
# Create ArchRProject
# It is recommended to set copyArrows = TRUE to maintain an unaltered copy for
# later usage.
proj = ArchRProject(
ArrowFiles = ArrowFiles,
copyArrows = TRUE
)
saveArchRProject(proj,
outputDirectory="ArchRProject",
load=F
)
# Load ArchRProject
proj = loadArchRProject(
path = "ArchRProject",
force = FALSE,
showLogo = FALSE
)
# After checking the plots in "QualityControl", we decided to use minTSS = 10.
# Now, we filter out all cells with minTSS <= 10.
proj = proj[proj@cellColData$TSSEnrichment >= 10, ]
# Save
saveArchRProject(proj,
outputDirectory="ArchRProject",
load = TRUE,
overwrite = TRUE
)
## class: ArchRProject
## outputDirectory: /omics/groups/OE0436/internal/Regensburg/Mainz/03_archr/scripts/methods/paper/ArchRProject
## samples(5): scATAC_4 scATAC_1 scATAC_9 scATAC_8 scATAC_5
## sampleColData names(1): ArrowFiles
## cellColData names(13): Sample TSSEnrichment ... nDiFrags BlacklistRatio
## numberOfCells(1): 17931
## medianTSS(1): 20.751
## medianFrags(1): 26426
# As part of the quality control, we plot the TSS enrichment against the number
# of fragments. As we only have one project now, we get one plot for all of the
# data.
qc_metrics = getCellColData(proj, select = c("log10(nFrags)", "TSSEnrichment"))
p_qc_metrics = ggPoint(
x = qc_metrics[,1],
y = qc_metrics[,2],
colorDensity = TRUE,
continuousSet = "sambaNight",
xlabel = "Log10 Unique Fragments",
ylabel = "TSS Enrichment",
xlim = c(log10(500), quantile(qc_metrics[,1], probs = 0.99)),
ylim = c(0, quantile(qc_metrics[,2], probs = 0.99))
) + geom_hline(yintercept = 10, lty = "dashed") + geom_vline(xintercept = 3, lty = "dashed")
p_qc_metrics
# We use additional Cell Ranger information to filter out low-quality cells.
# For this we use the "is_cell_barcode" column of the "singlecell.csv" file,
# which maps high-quality cells to 1.
singlecell = list()
for (x in c("1","4","5","8","9")){
# Define filenames and read files
filename = paste("data/MD_scATAC_",x,"/singlecell.csv",sep = "")
data = read.csv(filename)
# To match quality information to cells, we need the barcodes to match the
# ones in our ArchRProject. For this we:
# 1) create a vector of "scATAC_x#"
# 2) add vector as a column to the data
# 3) create column containing ArchRProj-style barcodes
bc = c(rep(paste("scATAC_",x,"#",sep = ""),nrow(data)))
data_barcode = cbind(bc,data)
data_fullbc = data_barcode %>% unite("full_barcode", bc:barcode, remove = FALSE, sep = "")
singlecell[[x]] = data_fullbc
}
# Combine the dataframes
singlecell_fullbc = rbindlist(singlecell, use.names = FALSE, fill = FALSE)
# Extract rownames that are also in the ArchRProject
rownames_archr = rownames(proj@cellColData)
subset_singlecell_fullbc = singlecell_fullbc[singlecell_fullbc$full_barcode %in% rownames_archr, ]
# Extract is__cell_barcode column from singlecell.csv and give it barcodes as rownames
df_is_cell_barcode = as.data.frame(subset_singlecell_fullbc$is__cell_barcode)
rownames(df_is_cell_barcode) = subset_singlecell_fullbc$full_barcode
# Order is_cell_barcode the way the ArchRProject is ordered and create filter
is_cell_barcode = df_is_cell_barcode[order(match(rownames(df_is_cell_barcode), rownames_archr)), ]
filter_archr = is_cell_barcode==1
# Filter ArchRProject for is_cell_barcode==1
proj = proj[filter_archr, ]
proj <- loadArchRProject(
path = "ArchRProject",
force = FALSE,
showLogo = FALSE
)
# Get an overview of the project:
proj
## class: ArchRProject
## outputDirectory: /omics/groups/OE0436/internal/Regensburg/Mainz/03_archr/scripts/methods/paper/ArchRProject
## samples(5): scATAC_4 scATAC_1 scATAC_9 scATAC_8 scATAC_5
## sampleColData names(1): ArrowFiles
## cellColData names(13): Sample TSSEnrichment ... nDiFrags BlacklistRatio
## numberOfCells(1): 14700
## medianTSS(1): 20.8665
## medianFrags(1): 29405.5
qc_metrics2 = getCellColData(proj, select = c("log10(nFrags)", "TSSEnrichment"))
p_qc_metrics2 = ggPoint(
x = qc_metrics2[,1],
y = qc_metrics2[,2],
colorDensity = TRUE,
continuousSet = "sambaNight",
xlabel = "Log10 Unique Fragments",
ylabel = "TSS Enrichment",
xlim = c(log10(500), quantile(qc_metrics2[,1], probs = 0.99)),
ylim = c(0, quantile(qc_metrics2[,2], probs = 0.99))
) + geom_hline(yintercept = 10, lty = "dashed") + geom_vline(xintercept = 3, lty = "dashed")
p_qc_metrics2
# 1. Show TSS enrichment score for each sample in a ridge plot.
# The TSS enrichment score is a proxy for the quality of the sample and should
# be similar in all samples in the dataset.
p1 <- plotGroups(
ArchRProj = proj,
groupBy = "Sample",
colorBy = "cellColData",
name = "TSSEnrichment",
plotAs = "ridges"
)
p1
# 2. You can also display TSS enrichment scores in a violin plot.
# Lower and upper hinges correspond to the 25th and 75th percentiles, the middle
# corresponds to the median. Lower and upper whiskers extend from the hinge to
# the lowest or highest value or 1.5 times the IQR.
p2 <- plotGroups(
ArchRProj = proj,
groupBy = "Sample",
colorBy = "cellColData",
name = "TSSEnrichment",
plotAs = "violin",
alpha = 0.4,
addBoxPlot = TRUE
)
p2
# 3. Show log10(unique nuclear fragments) for each sample in a ridge plot.
# It would be ideal if all of the samples have a similar number of fragments
# per cell. If this is not the case, you can downsample the files by randomly
# removing fragments from the larger sample(s) in the fragments.tsv.gz file.
p3 <- plotGroups(
ArchRProj = proj,
groupBy = "Sample",
colorBy = "cellColData",
name = "log10(nFrags)",
plotAs = "ridges"
)
p3
# 4. You can also display log10(unique nuclear fragments) in a violin plot.
p4 <- plotGroups(
ArchRProj = proj,
groupBy = "Sample",
colorBy = "cellColData",
name = "log10(nFrags)",
plotAs = "violin",
alpha = 0.4,
addBoxPlot = TRUE
)
p4
# 5. plot sample fragment size distributios:
# Ideally, you should see a nucleosome profile with enrichment in
# periodicity of 150 bp (length of DNA wrapped around one nucleosome)
p5 <- plotFragmentSizes(ArchRProj = proj)
p5
# 6. plot TSS enrichment profiles:
p6 <- plotTSSEnrichment(ArchRProj = proj)
p6
# LSI dimensionality reduction
# Create a dimensionality reduction of the data as basis for visualisation as UMAP
proj <- addIterativeLSI(
ArchRProj = proj,
useMatrix = "TileMatrix",
name = "IterativeLSI",
iterations = 2,
clusterParams = list(
resolution = c(0.2),
sampleCells = 10000,
n.start = 10
),
varFeatures = 25000,
dimsToUse = 1:30,
force = TRUE
)
# Clustering
proj <- addClusters(
input = proj,
reducedDims = "IterativeLSI",
method = "Seurat",
name = "Clusters",
resolution = 0.8,
force = TRUE
)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 14700
## Number of edges: 515701
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8966
## Number of communities: 21
## Elapsed time: 1 seconds
# Visualization of clustering in UMAP embedding.
# This step is not deterministic, plots will look different to the figures in
# the manuscript
proj <- addUMAP(
ArchRProj = proj,
reducedDims = "IterativeLSI",
name = "UMAP",
nNeighbors = 30,
minDist = 0.5,
metric = "cosine",
force = TRUE
)
# Color by clusters
p_clusters <- plotEmbedding(
ArchRProj = proj,
colorBy = "cellColData",
name = "Clusters",
embedding = "UMAP",
size = 0.5
)
# Color by samples
p_samples <- plotEmbedding(
ArchRProj = proj,
colorBy = "cellColData",
name = "Sample",
embedding = "UMAP",
size = 0.5
)
# Plot
ggAlignPlots(p_clusters, p_samples, type = "h")
# Get marker features
# Determine marker genes of each cluster based on the gene score matrix.
# For this, each cluster is compared to all other clusters using wilcoxon
# rank-sum test.
markersGS <- getMarkerFeatures(
ArchRProj = proj,
useMatrix = "GeneScoreMatrix",
groupBy = "Clusters",
bias = c("TSSEnrichment", "log10(nFrags)"),
testMethod = "wilcoxon"
)
# Define which genes to highlight
markerGenes <- c("Cd4", "Il2ra", "Foxp3", "Batf", "Ccr8", "Ikzf2", "Rorc", "Tbx21", "Ifng", "Gata3", "Klrg1", "Il2")
# Get markers
markerList <- getMarkers(markersGS, cutOff = "FDR <= 0.01 & Log2FC >= 1.25")
# Overlay per-cell gene scores on our UMAP embedding
genes <- plotEmbedding(
ArchRProj = proj,
colorBy = "GeneScoreMatrix",
name = markerGenes,
embedding = "UMAP",
plotAs = "points",
quantCut = c(0.01, 0.95),
imputeWeights = NULL,
size = 0.5
)
# To plot all genes we can use cowplot to arrange the various marker genes into
# a single plot
genes_cow <- lapply(genes, function(x){
x + guides(color = FALSE, fill = FALSE) +
theme_ArchR(baseSize = 6.5) +
theme(plot.margin = unit(c(0, 0, 0, 0), "cm")) +
theme(
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank()
)
})
do.call(cowplot::plot_grid, c(list(ncol = 4), genes_cow))
# Some of the gene score overlays appear quite variable. This is due to the
# sparsity of scATAC-seq data. MAGIC can be used to impute gene scores by
# smoothing signal across nearby cells. This greatly improves the visual
# interpretation of gene scores.
#Wwe first add impute weights to our ArchRProject
proj <- addImputeWeights(proj)
# These impute weights can then be passed to plotEmbedding() when plotting gene
# scores overlayed on the UMAP embedding
magic_genes <- plotEmbedding(
ArchRProj = proj,
colorBy = "GeneScoreMatrix",
name = markerGenes,
embedding = "UMAP",
plotAs = "points",
imputeWeights = getImputeWeights(proj),
size = 0.5
)
#Plot all genes in cowplot
magic_genes_cow <- lapply(magic_genes, function(x){
x + guides(color = FALSE, fill = FALSE) +
theme_ArchR(baseSize = 6.5) +
theme(plot.margin = unit(c(0, 0, 0, 0), "cm")) +
theme(
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank()
)
})
do.call(cowplot::plot_grid, c(list(ncol = 4), magic_genes_cow))
Now we tweak different parameters to show their effect on the
results. For this dataset we previously selected the defaults
parameters. All of the following is not saved and only for demonstration
purposes.
This should give you an idea of which parameters to
tweak in your own data analysis.
# LSI dimensionality reduction
proj_4iterations <- addIterativeLSI(
ArchRProj = proj,
useMatrix = "TileMatrix",
name = "IterativeLSI",
iterations = 4,
clusterParams = list( #See Seurat::FindClusters
resolution = c(0.2),
sampleCells = 10000,
n.start = 10
),
varFeatures = 25000,
dimsToUse = 1:30,
force = TRUE
)
# Clustering
proj_4iterations <- addClusters(
input = proj_4iterations,
reducedDims = "IterativeLSI",
method = "Seurat",
name = "Clusters",
resolution = 0.8,
force = TRUE
)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 14700
## Number of edges: 520040
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8967
## Number of communities: 20
## Elapsed time: 2 seconds
# Visualization of clustering as UMAP
proj_4iterations <- addUMAP(
ArchRProj = proj_4iterations,
reducedDims = "IterativeLSI",
name = "UMAP",
nNeighbors = 30,
minDist = 0.5,
metric = "cosine",
force = TRUE
)
# Color by clusters
p_clusters_4iterations <- plotEmbedding(
ArchRProj = proj_4iterations,
colorBy = "cellColData",
name = "Clusters",
embedding = "UMAP",
size = 0.5
)
# Color by samples
p_samples_4iterations <- plotEmbedding(
ArchRProj = proj_4iterations,
colorBy = "cellColData",
name = "Sample",
embedding = "UMAP",
size = 0.5
)
# Plot
ggAlignPlots(p_clusters_4iterations, p_samples_4iterations, type = "h")
# Get marker features
markersGS_4iterations <- getMarkerFeatures(
ArchRProj = proj_4iterations,
useMatrix = "GeneScoreMatrix",
groupBy = "Clusters",
bias = c("TSSEnrichment", "log10(nFrags)"),
testMethod = "wilcoxon"
)
# Add impute weights
proj_4iterations <- addImputeWeights(proj_4iterations)
# These impute weights can then be passed to plotEmbedding() when plotting gene
# scores overlayed on the UMAP embedding
magic_genes_4iterations <- plotEmbedding(
ArchRProj = proj_4iterations,
colorBy = "GeneScoreMatrix",
name = markerGenes,
embedding = "UMAP",
plotAs = "points",
imputeWeights = getImputeWeights(proj_4iterations),
size = 0.5
)
# Plot all genes in cowplot
magic_genes_cow_4iterations <- lapply(magic_genes_4iterations, function(x){
x + guides(color = FALSE, fill = FALSE) +
theme_ArchR(baseSize = 6.5) +
theme(plot.margin = unit(c(0, 0, 0, 0), "cm")) +
theme(
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank()
)
})
do.call(cowplot::plot_grid, c(list(ncol = 4), magic_genes_cow))
# LSI dimensionality reduction
proj_varFeatures <- addIterativeLSI(
ArchRProj = proj,
useMatrix = "TileMatrix",
name = "IterativeLSI",
iterations = 2,
clusterParams = list( #See Seurat::FindClusters
resolution = c(0.2),
sampleCells = 10000,
n.start = 10
),
varFeatures = 10000,
dimsToUse = 1:30,
force = TRUE
)
# Clustering
proj_varFeatures <- addClusters(
input = proj_varFeatures,
reducedDims = "IterativeLSI",
method = "Seurat",
name = "Clusters",
resolution = 0.8,
force = TRUE
)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 14700
## Number of edges: 530573
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8955
## Number of communities: 20
## Elapsed time: 1 seconds
# Visualization of clustering as UMAP
proj_varFeatures <- addUMAP(
ArchRProj = proj_varFeatures,
reducedDims = "IterativeLSI",
name = "UMAP",
nNeighbors = 30,
minDist = 0.5,
metric = "cosine",
force = TRUE
)
# Color by clusters
p_clusters_varFeatures <- plotEmbedding(
ArchRProj = proj_varFeatures,
colorBy = "cellColData",
name = "Clusters",
embedding = "UMAP",
size = 0.5
)
# Color by samples
p_samples_varFeatures <- plotEmbedding(
ArchRProj = proj_varFeatures,
colorBy = "cellColData",
name = "Sample",
embedding = "UMAP",
size = 0.5
)
# Plot
ggAlignPlots(p_clusters_varFeatures, p_samples_varFeatures, type = "h")
# Get marker features
markersGS_varFeatures <- getMarkerFeatures(
ArchRProj = proj_varFeatures,
useMatrix = "GeneScoreMatrix",
groupBy = "Clusters",
bias = c("TSSEnrichment", "log10(nFrags)"),
testMethod = "wilcoxon"
)
# Add impute weights
proj_varFeatures <- addImputeWeights(proj_varFeatures)
# These impute weights can then be passed to plotEmbedding() when plotting gene
# scores overlayed on the UMAP embedding
magic_genes_varFeatures <- plotEmbedding(
ArchRProj = proj_varFeatures,
colorBy = "GeneScoreMatrix",
name = markerGenes,
embedding = "UMAP",
plotAs = "points",
imputeWeights = getImputeWeights(proj_varFeatures),
size = 0.5
)
# Plot all genes in cowplot
magic_genes_cow_varFeatures <- lapply(magic_genes_varFeatures, function(x){
x + guides(color = FALSE, fill = FALSE) +
theme_ArchR(baseSize = 6.5) +
theme(plot.margin = unit(c(0, 0, 0, 0), "cm")) +
theme(
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank()
)
})
do.call(cowplot::plot_grid, c(list(ncol = 4), magic_genes_cow_varFeatures))
# LSI dimensionality reduction
proj_dimsToUse <- addIterativeLSI(
ArchRProj = proj,
useMatrix = "TileMatrix",
name = "IterativeLSI",
iterations = 2,
clusterParams = list( #See Seurat::FindClusters
resolution = c(0.2),
sampleCells = 10000,
n.start = 10
),
varFeatures = 25000,
dimsToUse = 2:30,
force = TRUE
)
# Clustering
proj_dimsToUse <- addClusters(
input = proj_dimsToUse,
reducedDims = "IterativeLSI",
method = "Seurat",
name = "Clusters",
resolution = 0.8,
force = TRUE
)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 14700
## Number of edges: 517695
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8969
## Number of communities: 20
## Elapsed time: 2 seconds
# Visualization of clustering as UMAP
proj_dimsToUse <- addUMAP(
ArchRProj = proj_dimsToUse,
reducedDims = "IterativeLSI",
name = "UMAP",
nNeighbors = 30,
minDist = 0.5,
metric = "cosine",
force = TRUE
)
# Color by clusters
p_clusters_dimsToUse <- plotEmbedding(
ArchRProj = proj_dimsToUse,
colorBy = "cellColData",
name = "Clusters",
embedding = "UMAP",
size = 0.5
)
# Color by samples
p_samples_dimsToUse <- plotEmbedding(
ArchRProj = proj_dimsToUse,
colorBy = "cellColData",
name = "Sample",
embedding = "UMAP",
size = 0.5
)
# Plot
ggAlignPlots(p_clusters_dimsToUse, p_samples_dimsToUse, type = "h")
# Get marker features
markersGS_dimsToUse <- getMarkerFeatures(
ArchRProj = proj_dimsToUse,
useMatrix = "GeneScoreMatrix",
groupBy = "Clusters",
bias = c("TSSEnrichment", "log10(nFrags)"),
testMethod = "wilcoxon"
)
# Add impute weights
proj_dimsToUse <- addImputeWeights(proj_dimsToUse)
# These impute weights can then be passed to plotEmbedding() when plotting gene
# scores overlayed on the UMAP embedding
magic_genes_dimsToUse <- plotEmbedding(
ArchRProj = proj_dimsToUse,
colorBy = "GeneScoreMatrix",
name = markerGenes,
embedding = "UMAP",
plotAs = "points",
imputeWeights = getImputeWeights(proj_dimsToUse),
size = 0.5
)
# Plot all genes in cowplot
magic_genes_cow_dimsToUse <- lapply(magic_genes, function(x){
x + guides(color = FALSE, fill = FALSE) +
theme_ArchR(baseSize = 6.5) +
theme(plot.margin = unit(c(0, 0, 0, 0), "cm")) +
theme(
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank()
)
})
do.call(cowplot::plot_grid, c(list(ncol = 4), magic_genes_cow_dimsToUse))
# Visualisation of clustering as UMAP
proj <- addUMAP(
ArchRProj = proj,
reducedDims = "IterativeLSI",
name = "UMAP",
nNeighbors = 30,
minDist = 0.5,
metric = "cosine",
force = TRUE
)
# Color by clusters
p_clusters <- plotEmbedding(
ArchRProj = proj,
colorBy = "cellColData",
name = "Clusters",
embedding = "UMAP",
size = 0.5
)
# Color by samples
p_samples <- plotEmbedding(
ArchRProj = proj,
colorBy = "cellColData",
name = "Sample",
embedding = "UMAP",
size = 0.5
)
# Plot
ggAlignPlots(p_clusters, p_samples, type = "h")
# Calculate doublet scores on the ArchRProject
proj <- addDoubletScores(
input = proj,
k = 10, #Refers to how many cells near a "pseudo-doublet" to count
knnMethod = "UMAP", #Refers to the embedding to use for nearest neighbor search
LSIMethod = 1,
force = TRUE
)
## ArchR logging to : ArchRLogs/ArchR-addDoubletScores-12ad572f0853e-Date-2023-04-28_Time-16-37-56.log
## If there is an issue, please report to github with logFile!
## 2023-04-28 16:37:56 : Batch Execution w/ safelapply!, 0 mins elapsed.
## 2023-04-28 16:37:56 : scATAC_4 (1 of 5) : Computing Doublet Statistics, 0 mins elapsed.
## scATAC_4 (1 of 5) : UMAP Projection R^2 = 0.99876
## scATAC_4 (1 of 5) : UMAP Projection R^2 = 0.99876
## 2023-04-28 16:42:25 : scATAC_1 (2 of 5) : Computing Doublet Statistics, 4.482 mins elapsed.
## scATAC_1 (2 of 5) : UMAP Projection R^2 = 0.98144
## scATAC_1 (2 of 5) : UMAP Projection R^2 = 0.98144
## 2023-04-28 16:46:09 : scATAC_9 (3 of 5) : Computing Doublet Statistics, 8.209 mins elapsed.
## Filtering 1 dims correlated > 0.75 to log10(depth + 1)
## scATAC_9 (3 of 5) : UMAP Projection R^2 = 0.99609
## scATAC_9 (3 of 5) : UMAP Projection R^2 = 0.99609
## 2023-04-28 16:48:25 : scATAC_8 (4 of 5) : Computing Doublet Statistics, 10.476 mins elapsed.
## scATAC_8 (4 of 5) : UMAP Projection R^2 = 0.99388
## scATAC_8 (4 of 5) : UMAP Projection R^2 = 0.99388
## 2023-04-28 16:50:38 : scATAC_5 (5 of 5) : Computing Doublet Statistics, 12.693 mins elapsed.
## scATAC_5 (5 of 5) : UMAP Projection R^2 = 0.99447
## scATAC_5 (5 of 5) : UMAP Projection R^2 = 0.99447
## ArchR logging successful to : ArchRLogs/ArchR-addDoubletScores-12ad572f0853e-Date-2023-04-28_Time-16-37-56.log
## class: ArchRProject
## outputDirectory: /omics/groups/OE0436/internal/Regensburg/Mainz/03_archr/scripts/methods/paper/ArchRProject
## samples(5): scATAC_4 scATAC_1 scATAC_9 scATAC_8 scATAC_5
## sampleColData names(1): ArrowFiles
## cellColData names(16): Sample TSSEnrichment ... DoubletScore
## DoubletEnrichment
## numberOfCells(1): 14700
## medianTSS(1): 20.8665
## medianFrags(1): 29405.5
In the next step, we evaluate different filter ratios. The filter ratio determines how many cells will be filtered out as doublets. Doublets scores were calculated above.
# Create temporary project and filter doublets
proj_tmp <- filterDoublets(
proj,
filterRatio = 0.5
)
## Filtering 265 cells from ArchRProject!
## scATAC_1 : 88 of 4199 (2.1%)
## scATAC_4 : 124 of 4988 (2.5%)
## scATAC_5 : 6 of 1159 (0.5%)
## scATAC_8 : 20 of 2016 (1%)
## scATAC_9 : 27 of 2338 (1.2%)
# Visualize (and compare to non-filtered sample)
# LSI dimensional reduction:
proj_tmp <- addIterativeLSI(
ArchRProj = proj_tmp,
useMatrix = "TileMatrix",
name = "IterativeLSI",
iterations = 2,
clusterParams = list( #See Seurat::FindClusters
resolution = c(0.2),
sampleCells = 10000,
n.start = 10
),
varFeatures = 25000,
dimsToUse = 1:30,
force = TRUE
)
# Clustering
proj_tmp <- addClusters(
input = proj_tmp,
reducedDims = "IterativeLSI",
method = "Seurat",
name = "Clusters",
resolution = 0.8,
force = TRUE
)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 14435
## Number of edges: 506353
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8952
## Number of communities: 20
## Elapsed time: 2 seconds
# Visualisation as UMAP
proj_tmp <- addUMAP(
ArchRProj = proj_tmp,
reducedDims = "IterativeLSI",
name = "UMAP",
nNeighbors = 30,
minDist = 0.5,
metric = "cosine",
force = TRUE
)
# Color by clusters
p_clusters_tmp <- plotEmbedding(
ArchRProj = proj_tmp,
colorBy = "cellColData",
name = "Clusters",
embedding = "UMAP",
size = 0.5
)
# Color by samples
p_samples_tmp <- plotEmbedding(
ArchRProj = proj_tmp,
colorBy = "cellColData",
name = "Sample",
embedding = "UMAP",
size = 0.5
)
# Plot
ggAlignPlots(p_clusters_tmp, p_samples_tmp, type = "h")
# Color by doublet enrichment score
p_doubscore_tmp <- plotEmbedding(
ArchRProj = proj_tmp,
colorBy = "cellColData",
name = "DoubletScore",
embedding = "UMAP",
plotAs = "points",
size = 0.5
)
p_doubenr_tmp <- plotEmbedding(
ArchRProj = proj_tmp,
colorBy = "cellColData",
name = "DoubletEnrichment",
embedding = "UMAP",
plotAs = "points",
size = 0.5
)
# Color by number of fragments per cell
p_nfrags_tmp <- plotEmbedding(
ArchRProj = proj_tmp,
colorBy = "cellColData",
name = "nFrags",
embedding = "UMAP",
plotAs = "points",
size = 0.5
)
# Plot
plot_grid(p_doubscore_tmp, p_doubenr_tmp, p_nfrags_tmp, align = "h", ncol = 3, scale = 1.1)
# Check whether the biology makes sense
# Overlay gene scores on UMAP embedding of proj_tmp, use MAGIC smoothing
# Get marker features
markersGS_tmp <- getMarkerFeatures(
ArchRProj = proj_tmp,
useMatrix = "GeneScoreMatrix",
groupBy = "Clusters",
bias = c("TSSEnrichment", "log10(nFrags)"),
testMethod = "wilcoxon"
)
# Add impute weights
proj_tmp <- addImputeWeights(proj_tmp)
# Plot
magic_genes_tmp <- plotEmbedding(
ArchRProj = proj_tmp,
colorBy = "GeneScoreMatrix",
name = markerGenes,
embedding = "UMAP",
plotAs = "points",
imputeWeights = getImputeWeights(proj_tmp),
size = 0.5
)
# Plot all genes in cowplot
magic_genes_cow_tmp <- lapply(magic_genes_tmp, function(x){
x + guides(color = FALSE, fill = FALSE) +
theme_ArchR(baseSize = 6.5) +
theme(plot.margin = unit(c(0, 0, 0, 0), "cm")) +
theme(
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank()
)
})
do.call(cowplot::plot_grid, c(list(ncol = 4), magic_genes_cow_tmp))
# Sometimes it can also help to look at the gene scores visualized as violin
# plots for the single clusters
p_Cd4 <- plotGroups(
ArchRProj = proj_tmp,
groupBy = "Clusters",
colorBy = "GeneScoreMatrix",
name = "Cd4",
plotAs = "violin",
alpha = 0.4,
addBoxPlot = TRUE
)
p_Cd4
p_Il2ra <- plotGroups(
ArchRProj = proj_tmp,
groupBy = "Clusters",
colorBy = "GeneScoreMatrix",
name = "Il2ra",
plotAs = "violin",
alpha = 0.4,
addBoxPlot = TRUE
)
p_Il2ra
p_Foxp3 <- plotGroups(
ArchRProj = proj_tmp,
groupBy = "Clusters",
colorBy = "GeneScoreMatrix",
name = "Foxp3",
plotAs = "violin",
alpha = 0.4,
addBoxPlot = TRUE
)
p_Foxp3
p_Batf <- plotGroups(
ArchRProj = proj_tmp,
groupBy = "Clusters",
colorBy = "GeneScoreMatrix",
name = "Batf",
plotAs = "violin",
alpha = 0.4,
addBoxPlot = TRUE
)
p_Batf
p_Ccr8 <- plotGroups(
ArchRProj = proj_tmp,
groupBy = "Clusters",
colorBy = "GeneScoreMatrix",
name = "Ccr8",
plotAs = "violin",
alpha = 0.4,
addBoxPlot = TRUE
)
p_Ccr8
p_Ikzf2 <- plotGroups(
ArchRProj = proj_tmp,
groupBy = "Clusters",
colorBy = "GeneScoreMatrix",
name = "Ikzf2",
plotAs = "violin",
alpha = 0.4,
addBoxPlot = TRUE
)
p_Ikzf2
p_Rorc <- plotGroups(
ArchRProj = proj_tmp,
groupBy = "Clusters",
colorBy = "GeneScoreMatrix",
name = "Rorc",
plotAs = "violin",
alpha = 0.4,
addBoxPlot = TRUE
)
p_Rorc
p_Tbx21 <- plotGroups(
ArchRProj = proj_tmp,
groupBy = "Clusters",
colorBy = "GeneScoreMatrix",
name = "Tbx21",
plotAs = "violin",
alpha = 0.4,
addBoxPlot = TRUE
)
p_Tbx21
p_Ifng <- plotGroups(
ArchRProj = proj_tmp,
groupBy = "Clusters",
colorBy = "GeneScoreMatrix",
name = "Ifng",
plotAs = "violin",
alpha = 0.4,
addBoxPlot = TRUE
)
p_Ifng
p_Gata3 <- plotGroups(
ArchRProj = proj_tmp,
groupBy = "Clusters",
colorBy = "GeneScoreMatrix",
name = "Gata3",
plotAs = "violin",
alpha = 0.4,
addBoxPlot = TRUE
)
p_Gata3
p_Klrg1 <- plotGroups(
ArchRProj = proj_tmp,
groupBy = "Clusters",
colorBy = "GeneScoreMatrix",
name = "Klrg1",
plotAs = "violin",
alpha = 0.4,
addBoxPlot = TRUE
)
p_Klrg1
p_Il2 <- plotGroups(
ArchRProj = proj_tmp,
groupBy = "Clusters",
colorBy = "GeneScoreMatrix",
name = "Il2",
plotAs = "violin",
alpha = 0.4,
addBoxPlot = TRUE
)
p_Il2
# Cell numbers per cluster
cM_proj_cellsPass_tmp <- confusionMatrix(paste0(proj_tmp$Clusters), paste0(proj_tmp$Sample))
cM_proj_cellsPass_tmp
## 20 x 5 sparse Matrix of class "dgCMatrix"
## scATAC_1 scATAC_4 scATAC_5 scATAC_8 scATAC_9
## C3 747 1 5 2 .
## C4 2798 . 53 . .
## C1 181 6 40 1477 3
## C14 31 5 22 479 8
## C12 176 40 546 2 14
## C7 39 . 3 2 1
## C17 10 2 117 32 4
## C9 1 1 6 . 902
## C5 125 40 231 . 10
## C10 1 5 14 . 470
## C6 2 1290 18 . .
## C11 . 458 6 . 1
## C15 . 776 25 . 9
## C16 . 511 6 . .
## C13 . 1379 6 . 2
## C19 . 189 10 . 11
## C20 . 21 7 . 30
## C8 . 111 32 . 2
## C2 . 27 5 . 1
## C18 . 2 1 2 843
# Save
write.csv(as.data.frame(cM_proj_cellsPass_tmp), "./ArchRProject/Plots/cells_per_cluster_filterratio0-5.csv", row.names=FALSE)
# Create temporary project and filter doublets
proj_tmp <- filterDoublets(
proj,
filterRatio = 1
)
## Filtering 531 cells from ArchRProject!
## scATAC_1 : 176 of 4199 (4.2%)
## scATAC_4 : 248 of 4988 (5%)
## scATAC_5 : 13 of 1159 (1.1%)
## scATAC_8 : 40 of 2016 (2%)
## scATAC_9 : 54 of 2338 (2.3%)
# Visualize (and compare to non-filtered sample):
# LSI dimensional reduction:
proj_tmp <- addIterativeLSI(
ArchRProj = proj_tmp,
useMatrix = "TileMatrix",
name = "IterativeLSI",
iterations = 2,
clusterParams = list( #See Seurat::FindClusters
resolution = c(0.2),
sampleCells = 10000,
n.start = 10
),
varFeatures = 25000,
dimsToUse = 1:30,
force = TRUE
)
# Clustering
proj_tmp <- addClusters(
input = proj_tmp,
reducedDims = "IterativeLSI",
method = "Seurat",
name = "Clusters",
resolution = 0.8,
force = TRUE
)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 14169
## Number of edges: 497811
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8958
## Number of communities: 20
## Elapsed time: 1 seconds
# Visualisation as UMAP
proj_tmp <- addUMAP(
ArchRProj = proj_tmp,
reducedDims = "IterativeLSI",
name = "UMAP",
nNeighbors = 30,
minDist = 0.5,
metric = "cosine",
force = TRUE
)
# Color by clusters
p_clusters_tmp <- plotEmbedding(
ArchRProj = proj_tmp,
colorBy = "cellColData",
name = "Clusters",
embedding = "UMAP",
size = 0.5
)
# Color by samples
p_samples_tmp <- plotEmbedding(
ArchRProj = proj_tmp,
colorBy = "cellColData",
name = "Sample",
embedding = "UMAP",
size = 0.5
)
# Plot
ggAlignPlots(p_clusters_tmp, p_samples_tmp, type = "h")
# Color by doublet enrichment score
p_doubscore_tmp <- plotEmbedding(
ArchRProj = proj_tmp,
colorBy = "cellColData",
name = "DoubletScore",
embedding = "UMAP",
plotAs = "points",
size = 0.5
)
p_doubenr_tmp <- plotEmbedding(
ArchRProj = proj_tmp,
colorBy = "cellColData",
name = "DoubletEnrichment",
embedding = "UMAP",
plotAs = "points",
size = 0.5
)
# Color by number of fragments per cell
p_nfrags_tmp <- plotEmbedding(
ArchRProj = proj_tmp,
colorBy = "cellColData",
name = "nFrags",
embedding = "UMAP",
plotAs = "points",
size = 0.5
)
# Plot
plot_grid(p_doubscore_tmp, p_doubenr_tmp, p_nfrags_tmp, align = "h", ncol = 3, scale = 1.1)
# Get marker features
markersGS_tmp <- getMarkerFeatures(
ArchRProj = proj_tmp,
useMatrix = "GeneScoreMatrix",
groupBy = "Clusters",
bias = c("TSSEnrichment", "log10(nFrags)"),
testMethod = "wilcoxon"
)
# Add impute weights
proj_tmp <- addImputeWeights(proj_tmp)
# Plot
magic_genes_tmp <- plotEmbedding(
ArchRProj = proj_tmp,
colorBy = "GeneScoreMatrix",
name = markerGenes,
embedding = "UMAP",
plotAs = "points",
imputeWeights = getImputeWeights(proj_tmp),
size = 0.5
)
# Plot all genes in cowplot
magic_genes_cow_tmp <- lapply(magic_genes_tmp, function(x){
x + guides(color = FALSE, fill = FALSE) +
theme_ArchR(baseSize = 6.5) +
theme(plot.margin = unit(c(0, 0, 0, 0), "cm")) +
theme(
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank()
)
})
do.call(cowplot::plot_grid, c(list(ncol = 4), magic_genes_cow_tmp))
# Sometimes it can also help to look at the gene scores visualized as violin
# plots for the single clusters
p_Cd4 <- plotGroups(
ArchRProj = proj_tmp,
groupBy = "Clusters",
colorBy = "GeneScoreMatrix",
name = "Cd4",
plotAs = "violin",
alpha = 0.4,
addBoxPlot = TRUE
)
p_Cd4
p_Il2ra <- plotGroups(
ArchRProj = proj_tmp,
groupBy = "Clusters",
colorBy = "GeneScoreMatrix",
name = "Il2ra",
plotAs = "violin",
alpha = 0.4,
addBoxPlot = TRUE
)
p_Il2ra
p_Foxp3 <- plotGroups(
ArchRProj = proj_tmp,
groupBy = "Clusters",
colorBy = "GeneScoreMatrix",
name = "Foxp3",
plotAs = "violin",
alpha = 0.4,
addBoxPlot = TRUE
)
p_Foxp3
p_Batf <- plotGroups(
ArchRProj = proj_tmp,
groupBy = "Clusters",
colorBy = "GeneScoreMatrix",
name = "Batf",
plotAs = "violin",
alpha = 0.4,
addBoxPlot = TRUE
)
p_Batf
p_Ccr8 <- plotGroups(
ArchRProj = proj_tmp,
groupBy = "Clusters",
colorBy = "GeneScoreMatrix",
name = "Ccr8",
plotAs = "violin",
alpha = 0.4,
addBoxPlot = TRUE
)
p_Ccr8
p_Ikzf2 <- plotGroups(
ArchRProj = proj_tmp,
groupBy = "Clusters",
colorBy = "GeneScoreMatrix",
name = "Ikzf2",
plotAs = "violin",
alpha = 0.4,
addBoxPlot = TRUE
)
p_Ikzf2
p_Rorc <- plotGroups(
ArchRProj = proj_tmp,
groupBy = "Clusters",
colorBy = "GeneScoreMatrix",
name = "Rorc",
plotAs = "violin",
alpha = 0.4,
addBoxPlot = TRUE
)
p_Rorc
p_Tbx21 <- plotGroups(
ArchRProj = proj_tmp,
groupBy = "Clusters",
colorBy = "GeneScoreMatrix",
name = "Tbx21",
plotAs = "violin",
alpha = 0.4,
addBoxPlot = TRUE
)
p_Tbx21
p_Ifng <- plotGroups(
ArchRProj = proj_tmp,
groupBy = "Clusters",
colorBy = "GeneScoreMatrix",
name = "Ifng",
plotAs = "violin",
alpha = 0.4,
addBoxPlot = TRUE
)
p_Ifng
p_Gata3 <- plotGroups(
ArchRProj = proj_tmp,
groupBy = "Clusters",
colorBy = "GeneScoreMatrix",
name = "Gata3",
plotAs = "violin",
alpha = 0.4,
addBoxPlot = TRUE
)
p_Gata3
p_Klrg1 <- plotGroups(
ArchRProj = proj_tmp,
groupBy = "Clusters",
colorBy = "GeneScoreMatrix",
name = "Klrg1",
plotAs = "violin",
alpha = 0.4,
addBoxPlot = TRUE
)
p_Klrg1
p_Il2 <- plotGroups(
ArchRProj = proj_tmp,
groupBy = "Clusters",
colorBy = "GeneScoreMatrix",
name = "Il2",
plotAs = "violin",
alpha = 0.4,
addBoxPlot = TRUE
)
p_Il2
# Cell numbers per cluster
cM_proj_cellsPass_tmp <- confusionMatrix(paste0(proj_tmp$Clusters), paste0(proj_tmp$Sample))
cM_proj_cellsPass_tmp
## 20 x 5 sparse Matrix of class "dgCMatrix"
## scATAC_1 scATAC_4 scATAC_5 scATAC_8 scATAC_9
## C2 706 4 4 1 .
## C3 2651 . 49 . .
## C10 179 7 44 1449 4
## C8 31 4 21 487 7
## C16 157 39 539 2 17
## C1 39 1 5 2 1
## C14 10 6 31 33 824
## C19 1 2 5 . 986
## C4 245 44 234 . 10
## C20 1 6 13 . 371
## C9 1 348 6 . 1
## C5 2 1284 18 . .
## C12 . 800 23 . 10
## C17 . 1358 6 . 3
## C13 . 496 5 . .
## C6 . 184 9 . 14
## C7 . 21 7 . 30
## C18 . 109 33 . 3
## C11 . 25 5 . 1
## C15 . 2 89 2 2
# Save
write.csv(as.data.frame(cM_proj_cellsPass_tmp), "./ArchRProject/Plots/cells_per_cluster_filterratio1.csv", row.names=FALSE)
# Create temporary project and filter doublets
proj_tmp <- filterDoublets(
proj,
filterRatio = 2
)
## Filtering 1065 cells from ArchRProject!
## scATAC_1 : 352 of 4199 (8.4%)
## scATAC_4 : 497 of 4988 (10%)
## scATAC_5 : 26 of 1159 (2.2%)
## scATAC_8 : 81 of 2016 (4%)
## scATAC_9 : 109 of 2338 (4.7%)
# Visualize (and compare to non-filtered sample)
# LSI dimensional reduction
proj_tmp <- addIterativeLSI(
ArchRProj = proj_tmp,
useMatrix = "TileMatrix",
name = "IterativeLSI",
iterations = 2,
clusterParams = list( #See Seurat::FindClusters
resolution = c(0.2),
sampleCells = 10000,
n.start = 10
),
varFeatures = 25000,
dimsToUse = 1:30,
force = TRUE
)
# Clustering
proj_tmp <- addClusters(
input = proj_tmp,
reducedDims = "IterativeLSI",
method = "Seurat",
name = "Clusters",
resolution = 0.8,
force = TRUE
)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 13635
## Number of edges: 477400
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8949
## Number of communities: 21
## Elapsed time: 1 seconds
# Visualisation as UMAP
proj_tmp <- addUMAP(
ArchRProj = proj_tmp,
reducedDims = "IterativeLSI",
name = "UMAP",
nNeighbors = 30,
minDist = 0.5,
metric = "cosine",
force = TRUE
)
# Color by clusters
p_clusters_tmp <- plotEmbedding(
ArchRProj = proj_tmp,
colorBy = "cellColData",
name = "Clusters",
embedding = "UMAP",
size = 0.5
)
# Color by samples
p_samples_tmp <- plotEmbedding(
ArchRProj = proj_tmp,
colorBy = "cellColData",
name = "Sample",
embedding = "UMAP",
size = 0.5
)
# Plot
ggAlignPlots(p_clusters_tmp, p_samples_tmp, type = "h")
# Color by doublet enrichment score
p_doubscore_tmp <- plotEmbedding(
ArchRProj = proj_tmp,
colorBy = "cellColData",
name = "DoubletScore",
embedding = "UMAP",
plotAs = "points",
size = 0.5
)
p_doubenr_tmp <- plotEmbedding(
ArchRProj = proj_tmp,
colorBy = "cellColData",
name = "DoubletEnrichment",
embedding = "UMAP",
plotAs = "points",
size = 0.5
)
# Color by number of fragments per cell
p_nfrags_tmp <- plotEmbedding(
ArchRProj = proj_tmp,
colorBy = "cellColData",
name = "nFrags",
embedding = "UMAP",
plotAs = "points",
size = 0.5
)
# Plot
plot_grid(p_doubscore_tmp, p_doubenr_tmp, p_nfrags_tmp, align = "h", ncol = 3, scale = 1.1)
# Get marker features
markersGS_tmp <- getMarkerFeatures(
ArchRProj = proj_tmp,
useMatrix = "GeneScoreMatrix",
groupBy = "Clusters",
bias = c("TSSEnrichment", "log10(nFrags)"),
testMethod = "wilcoxon"
)
# Add impute weights
proj_tmp <- addImputeWeights(proj_tmp)
# Plot
magic_genes_tmp <- plotEmbedding(
ArchRProj = proj_tmp,
colorBy = "GeneScoreMatrix",
name = markerGenes,
embedding = "UMAP",
plotAs = "points",
imputeWeights = getImputeWeights(proj_tmp),
size = 0.5
)
# Plot all genes in cowplot
magic_genes_cow_tmp <- lapply(magic_genes_tmp, function(x){
x + guides(color = FALSE, fill = FALSE) +
theme_ArchR(baseSize = 6.5) +
theme(plot.margin = unit(c(0, 0, 0, 0), "cm")) +
theme(
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank()
)
})
do.call(cowplot::plot_grid, c(list(ncol = 4), magic_genes_cow_tmp))
# Sometimes it can also help to look at the gene scores visualized as violin
# plots for the single clusters
p_Cd4 <- plotGroups(
ArchRProj = proj_tmp,
groupBy = "Clusters",
colorBy = "GeneScoreMatrix",
name = "Cd4",
plotAs = "violin",
alpha = 0.4,
addBoxPlot = TRUE
)
p_Cd4
p_Il2ra <- plotGroups(
ArchRProj = proj_tmp,
groupBy = "Clusters",
colorBy = "GeneScoreMatrix",
name = "Il2ra",
plotAs = "violin",
alpha = 0.4,
addBoxPlot = TRUE
)
p_Il2ra
p_Foxp3 <- plotGroups(
ArchRProj = proj_tmp,
groupBy = "Clusters",
colorBy = "GeneScoreMatrix",
name = "Foxp3",
plotAs = "violin",
alpha = 0.4,
addBoxPlot = TRUE
)
p_Foxp3
p_Batf <- plotGroups(
ArchRProj = proj_tmp,
groupBy = "Clusters",
colorBy = "GeneScoreMatrix",
name = "Batf",
plotAs = "violin",
alpha = 0.4,
addBoxPlot = TRUE
)
p_Batf
p_Ccr8 <- plotGroups(
ArchRProj = proj_tmp,
groupBy = "Clusters",
colorBy = "GeneScoreMatrix",
name = "Ccr8",
plotAs = "violin",
alpha = 0.4,
addBoxPlot = TRUE
)
p_Ccr8
p_Ikzf2 <- plotGroups(
ArchRProj = proj_tmp,
groupBy = "Clusters",
colorBy = "GeneScoreMatrix",
name = "Ikzf2",
plotAs = "violin",
alpha = 0.4,
addBoxPlot = TRUE
)
p_Ikzf2
p_Rorc <- plotGroups(
ArchRProj = proj_tmp,
groupBy = "Clusters",
colorBy = "GeneScoreMatrix",
name = "Rorc",
plotAs = "violin",
alpha = 0.4,
addBoxPlot = TRUE
)
p_Rorc
p_Tbx21 <- plotGroups(
ArchRProj = proj_tmp,
groupBy = "Clusters",
colorBy = "GeneScoreMatrix",
name = "Tbx21",
plotAs = "violin",
alpha = 0.4,
addBoxPlot = TRUE
)
p_Tbx21
p_Ifng <- plotGroups(
ArchRProj = proj_tmp,
groupBy = "Clusters",
colorBy = "GeneScoreMatrix",
name = "Ifng",
plotAs = "violin",
alpha = 0.4,
addBoxPlot = TRUE
)
p_Ifng
p_Gata3 <- plotGroups(
ArchRProj = proj_tmp,
groupBy = "Clusters",
colorBy = "GeneScoreMatrix",
name = "Gata3",
plotAs = "violin",
alpha = 0.4,
addBoxPlot = TRUE
)
p_Gata3
p_Klrg1 <- plotGroups(
ArchRProj = proj_tmp,
groupBy = "Clusters",
colorBy = "GeneScoreMatrix",
name = "Klrg1",
plotAs = "violin",
alpha = 0.4,
addBoxPlot = TRUE
)
p_Klrg1
p_Il2 <- plotGroups(
ArchRProj = proj_tmp,
groupBy = "Clusters",
colorBy = "GeneScoreMatrix",
name = "Il2",
plotAs = "violin",
alpha = 0.4,
addBoxPlot = TRUE
)
p_Il2
# Cell numbers per cluster
cM_proj_cellsPass_tmp <- confusionMatrix(paste0(proj_tmp$Clusters), paste0(proj_tmp$Sample))
cM_proj_cellsPass_tmp
## 21 x 5 sparse Matrix of class "dgCMatrix"
## scATAC_1 scATAC_4 scATAC_5 scATAC_8 scATAC_9
## C15 472 . 2 1 .
## C16 2710 . 44 . .
## C20 9 . 1 205 .
## C12 2 205 8 1 .
## C13 169 25 533 2 17
## C21 169 6 41 1216 3
## C11 44 . 4 2 1
## C5 10 7 113 33 12
## C2 29 4 20 474 4
## C9 1 1 10 . 984
## C17 230 40 241 . 10
## C10 1 6 16 . 356
## C18 1 1298 18 . 1
## C6 . 686 26 . 7
## C14 . 1387 4 . 2
## C1 . 23 7 . 30
## C3 . 502 5 . 1
## C7 . 186 7 . 11
## C8 . 87 27 . 1
## C19 . 24 3 . 1
## C4 . 4 3 1 788
# Save
write.csv(as.data.frame(cM_proj_cellsPass_tmp), "./ArchRProject/Plots/cells_per_cluster_filterratio2.csv", row.names=FALSE)
# Create new ArchRProject and filter doublets with the filter ratio chosen based
# on the test runs above
proj <- filterDoublets(
proj,
filterRatio = 0.5,
)
## Filtering 265 cells from ArchRProject!
## scATAC_1 : 88 of 4199 (2.1%)
## scATAC_4 : 124 of 4988 (2.5%)
## scATAC_5 : 6 of 1159 (0.5%)
## scATAC_8 : 20 of 2016 (1%)
## scATAC_9 : 27 of 2338 (1.2%)
# Load new ArchRProject
proj <- loadArchRProject(
path = "ArchRProject",
force = TRUE,
showLogo = FALSE
)
# Get an overview of the project
proj
## class: ArchRProject
## outputDirectory: /omics/groups/OE0436/internal/Regensburg/Mainz/03_archr/scripts/methods/paper/ArchRProject
## samples(5): scATAC_4 scATAC_1 scATAC_9 scATAC_8 scATAC_5
## sampleColData names(1): ArrowFiles
## cellColData names(16): Sample TSSEnrichment ... DoubletScore
## DoubletEnrichment
## numberOfCells(1): 14435
## medianTSS(1): 20.872
## medianFrags(1): 29176
One possible analysis step is batch effect correction using HARMONY.
Usually, this should help reduce the batch effect in the data while
maintaining biological effects.
In our specific case, the batch effect correction did not improve the data, which is why we omitted this step and only show it for demonstration purposes.
# https://github.com/immunogenomics/harmony
# Create a new ArchRProject with a reducedDims object named “Harmony”
proj_harmonyTest <- addHarmony(
ArchRProj = proj,
reducedDims = "IterativeLSI",
name = "Harmony",
groupBy = "Sample",
sigma = 0.1
)
# UMAP for Harmony reductedDims object
proj_harmonyTest <- addUMAP(
ArchRProj = proj_harmonyTest,
reducedDims = "Harmony",
name = "UMAPHarmony",
nNeighbors = 30,
minDist = 0.5,
metric = "cosine",
force = TRUE
)
# Color by clusters
p_clusters_Harmony <- plotEmbedding(
ArchRProj = proj_harmonyTest,
colorBy = "cellColData",
name = "Clusters",
embedding = "UMAPHarmony",
size = 0.5
)
# Color by samples
p_samples_Harmony <- plotEmbedding(
ArchRProj = proj_harmonyTest,
colorBy = "cellColData",
name = "Sample",
embedding = "UMAPHarmony",
size = 0.5
)
# Plot
ggAlignPlots(p_clusters_Harmony, p_samples_Harmony, type = "h")
# Color by doublet enrichment score
p_doubscore_Harmony <- plotEmbedding(
ArchRProj = proj_harmonyTest,
colorBy = "cellColData",
name = "DoubletScore",
embedding = "UMAPHarmony",
plotAs = "points",
size = 0.5
)
p_doubenr_Harmony <- plotEmbedding(
ArchRProj = proj_harmonyTest,
colorBy = "cellColData",
name = "DoubletEnrichment",
embedding = "UMAPHarmony",
plotAs = "points",
size = 0.5
)
# Color by number of fragments per cell
p_nfrags_Harmony <- plotEmbedding(
ArchRProj = proj_harmonyTest,
colorBy = "cellColData",
name = "nFrags",
embedding = "UMAPHarmony",
plotAs = "points",
size = 0.5
)
# Plot
plot_grid(p_doubscore_Harmony, p_doubenr_Harmony, p_nfrags_Harmony, align = "h", ncol = 3, scale = 1.1)
# Add impute weights
proj_harmonyTest <- addImputeWeights(proj_harmonyTest)
# overlay gene scores on UMAPHarmony embedding with MAGIC smoothing
magic_genes_harmony <- plotEmbedding(
ArchRProj = proj_harmonyTest,
colorBy = "GeneScoreMatrix",
name = markerGenes,
embedding = "UMAPHarmony",
plotAs = "points",
imputeWeights = getImputeWeights(proj_harmonyTest),
size = 0.5
)
# Plot all genes in cowplot
magic_genes_harmony_cow <- lapply(magic_genes_harmony, function(x){
x + guides(color = FALSE, fill = FALSE) +
theme_ArchR(baseSize = 6.5) +
theme(plot.margin = unit(c(0, 0, 0, 0), "cm")) +
theme(
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank()
)
})
do.call(cowplot::plot_grid, c(list(ncol = 4), magic_genes_harmony_cow))
# Clustering
proj <- addClusters(
input = proj,
reducedDims = "IterativeLSI",
method = "Seurat",
name = "Clusters",
resolution = 0.9,
force = TRUE
)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 14435
## Number of edges: 507495
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8886
## Number of communities: 21
## Elapsed time: 1 seconds
# Visualization in UMAP embedding
proj <- addUMAP(
ArchRProj = proj,
reducedDims = "IterativeLSI",
name = "UMAP",
nNeighbors = 30,
minDist = 0.5,
metric = "cosine",
force = TRUE
)
# Color by clusters
p_clusters_final <- plotEmbedding(
ArchRProj = proj,
colorBy = "cellColData",
name = "Clusters",
embedding = "UMAP",
size = 0.5
)
# Color by samples
p_samples_final <- plotEmbedding(
ArchRProj = proj,
colorBy = "cellColData",
name = "Sample",
embedding = "UMAP",
size = 0.5
)
# Plot
ggAlignPlots(p_clusters_final, p_samples_final, type = "h")
# Color by doublet enrichment score
p_doubscore_final <- plotEmbedding(
ArchRProj = proj,
colorBy = "cellColData",
name = "DoubletScore",
embedding = "UMAP",
plotAs = "points",
size = 0.5
)
p_doubenr_final <- plotEmbedding(
ArchRProj = proj,
colorBy = "cellColData",
name = "DoubletEnrichment",
embedding = "UMAP",
plotAs = "points",
size = 0.5
)
# Color by number of fragments per cell
p_nfrags_final <- plotEmbedding(
ArchRProj = proj,
colorBy = "cellColData",
name = "nFrags",
embedding = "UMAP",
plotAs = "points",
size = 0.5
)
# Plot
plot_grid(p_doubscore_final, p_doubenr_final, p_nfrags_final, align = "h", ncol = 3, scale = 1.1)
# Overlay gene scores on UMAP embedding of proj, use MAGIC smoothing
# Get marker features
markersGS_final <- getMarkerFeatures(
ArchRProj = proj,
useMatrix = "GeneScoreMatrix",
groupBy = "Clusters",
bias = c("TSSEnrichment", "log10(nFrags)"),
testMethod = "wilcoxon"
)
# Add impute weights
proj <- addImputeWeights(proj)
# Plot
magic_genes_final <- plotEmbedding(
ArchRProj = proj,
colorBy = "GeneScoreMatrix",
name = markerGenes,
embedding = "UMAP",
plotAs = "points",
imputeWeights = getImputeWeights(proj),
size = 0.5
)
# Plot all genes in cowplot
magic_genes_cow_final <- lapply(magic_genes_final, function(x){
x + guides(color = FALSE, fill = FALSE) +
theme_ArchR(baseSize = 6.5) +
theme(plot.margin = unit(c(0, 0, 0, 0), "cm")) +
theme(
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank()
)
})
do.call(cowplot::plot_grid, c(list(ncol = 4), magic_genes_cow_final))
# These are my clusters with cell numbers
cM_doubfilter <- confusionMatrix(paste0(proj$Clusters), paste0(proj$Sample))
cM_doubfilter
## 21 x 5 sparse Matrix of class "dgCMatrix"
## scATAC_1 scATAC_4 scATAC_5 scATAC_8 scATAC_9
## C4 482 . . . .
## C5 282 3 8 4 1
## C2 2707 . 53 . .
## C18 8 . . 291 .
## C16 32 4 19 475 8
## C14 174 40 548 2 13
## C19 170 9 45 1191 3
## C8 10 4 118 30 6
## C11 1 1 6 . 892
## C3 244 43 228 . 10
## C12 1 7 15 . 472
## C13 . 461 6 . 1
## C6 . 777 22 . 8
## C1 . 1276 20 . .
## C7 . 513 6 . .
## C15 . 1377 3 . 2
## C20 . 23 7 . 30
## C21 . 188 13 . 22
## C10 . 104 30 . 2
## C17 . 30 5 . 1
## C9 . 4 1 3 840
# Visualize confusion matrix
cM_doubfilter <- cM_doubfilter / Matrix::rowSums(cM_doubfilter)
p_cM_doubfilter <- pheatmap::pheatmap(
mat = as.matrix(cM_doubfilter),
color = paletteContinuous("whiteBlue"),
border_color = "black"
)
p_cM_doubfilter