scATAC-pro generates results in plain texts, tables and .rds objects. This tutorial will access original results module by module, which was done by running command lines. Also, we show how some modules can be re-run using different options or parameters. We will use scATAC-pro outputs of the PBMC 10x Genomics data as in the manuscript, except for the integrate module, where output from another study was used for illustration purpose.
library(data.table)
library(magrittr)
library(ggplot2)
library(Seurat)
PEAK_CALLER = 'COMBINED'
CELL_CALLER = 'FILTER'
output_dir = '/mnt/isilon/tan_lab/yuw1/run_scATAC-pro/PBMC10k/output/'
down_dir = paste0(output_dir, 'downstream_analysis/', PEAK_CALLER, '/',
CELL_CALLER, '/')
devtools::source_url("https://github.com/wbaopaul/scATAC-pro/blob/master/scripts/src/dsAnalysis_utilities.R?raw=TRUE")
mtx_path = paste0(output_dir, 'filtered_matrix/', PEAK_CALLER, '/', CELL_CALLER, '/matrix.mtx')
mtx = read_mtx_scATACpro(mtx_path)
rownames(mtx)[1:10]
## [1] "chr1-100028750-100029265" "chr1-100037792-100039050"
## [3] "chr1-100046917-100047167" "chr1-100047455-100047597"
## [5] "chr1-100064941-100065019" "chr1-1000654-1000992"
## [7] "chr1-100065422-100065540" "chr1-100132556-100133436"
## [9] "chr1-100152259-100152542" "chr1-100184588-100184845"
colnames(mtx)[1:10]
## [1] "AAACGAAAGACACGGT" "AAACGAAAGAGGTGGG" "AAACGAAAGCACGTAG" "AAACGAAAGCGCCTAC"
## [5] "AAACGAAAGCTTTCCC" "AAACGAAAGGCGTCCT" "AAACGAAAGGCTTTAC" "AAACGAAAGTGATATG"
## [9] "AAACGAACAAACGACG" "AAACGAACAATTGCCA"
Clustering result was saved in .rds file as seurat_obj.rds and clustering label as one of the metadata
seurat_obj = readRDS(paste0(down_dir, 'seurat_obj.rds'))
## cluster label was saved in metadata active_clusters
table(seurat_obj$active_clusters)
##
## 0 1 2 3 4 5 6 7 8 9
## 2139 1877 823 622 395 362 251 146 129 39
## plot umap
DimPlot(seurat_obj, reduction = 'umap')
The GO term enrichment result was saved in a .xlsx file.
library(xlsx)
group1 = 'one'
group2 = 'rest'
go_file = paste0(down_dir, 'enrichedGO_differential_accessible_features_', group1, '_vs_', group2, '.xlsx')
# get enriched terms for cluster0 for example and show top 20 terms
go_res = xlsx::read.xlsx(go_file, sheetName = 'cluster0')
go_res = data.table(go_res)
go_res[, 'score' := -log10(p.adjust)]
go_res = go_res[order(-score), ]
ngo = min(20, nrow(go_res))
go_res = go_res[1:ngo, ]
go_res = go_res[order(score), ]
go_res$Description = factor(go_res$Description, levels = go_res$Description)
p_go <- ggplot(go_res, aes(y = score, x = Description, fill = Count)) +
geom_bar(width = 0.7, stat = 'identity') +
ggtitle("Enriched terms: cluster_0") + theme_classic() +
theme(legend.position = 'bottom', legend.direction = "horizontal") +
coord_flip() + scale_fill_continuous(name = "#genes", type = "viridis") +
xlab('') + ylab('-log10(p.adjust)')
p_go
The analysis was done through chromVAR R package.
library(RColorBrewer)
library(viridis)
library(pheatmap)
GENOME_NAME = 'hg38'
metaData = seurat_obj@meta.data
chromVar.obj = readRDS(paste0(down_dir, '/chromVar_obj.rds'))
diff_tf_enrich_file = paste0(down_dir, '/differential_TF_motif_enriched_in_clusters.txt')
da.res = fread( paste0(down_dir, '/differential_TF_motif_enriched_in_clusters.txt'))
## plot enriched TFs in heatmap
sele.tfs = da.res$feature
zscores = deviationScores(chromVar.obj)
sele.zscores = zscores[sele.tfs, ]
## change rowname of zscores (tf name) to be readable
sele.zscores <- readable_tf(sele.zscores, GENOME_NAME)
metaData$active_clusters = as.character(metaData$active_clusters)
sele.zscores = sele.zscores[!duplicated(sele.zscores), ]
bc_clusters = data.table('barcode' = rownames(metaData),
'cluster' = metaData$active_clusters)
plot_enrich_tf(sele.zscores, bc_clusters)
da.res = do_DA_motif(deviations(chromVar.obj), bc_clusters,
topn = 5)
sele.zscores = zscores[da.res$feature, ]
## change rowname of zscores (tf name) to be readable
sele.zscores <- readable_tf(sele.zscores, GENOME_NAME)
plot_enrich_tf(sele.zscores, bc_clusters)
The differential bound TFs were saved in a table of plain text format
group1_fp = '0'
group2_fp = 'rest'
footprint_stats.file = paste0(down_dir, '/differential_TF_footprint_',
group1_fp, '_vs_', group2_fp, '.txt')
if(file.exists(footprint_stats.file)){
footprint_out = fread(footprint_stats.file)
if(length(unique(footprint_out$motif)) > 100){
footprint_out[, 'N' := .N, by = higher_in_cluster]
cls = unique(footprint_out[N > 10]$higher_in_cluster)
if(length(cls) >= 1){
res0 = NULL
for(cl0 in cls){
tmp = footprint_out[higher_in_cluster == cl0]
tmp = tmp[order(P_values)][1:10, ]
res0 = rbind(res0, tmp)
}
footprint_out = rbind(footprint_out[N < 10], res0)
}
}
mm = reshape2::acast(motif ~ higher_in_cluster, data = footprint_out,
value.var = "P_values")
mm = -log10(mm)
mm[is.na(mm)] = 0
cn = colnames(mm)
cn.new = sapply(cn, function(x) gsub('_higher', '', x))
colnames(mm) = cn.new
mm[mm > 3] = 3
pheatmap(mm, cluster_cols = F, fontsize = 13, fontsize_row = 9,
color = viridis::viridis(100))
}
The cis-interactions were saved in a plain text file. Here we gonna read the interactions and plot the cis-loop within an interested region. You can change parameter Cicero_Plot_Region below to your interested genomic region:
library(cicero)
cicero_conn.file = paste0(down_dir, '/cicero_interactions.txt')
Cicero_Plot_Region = 'chr5:140610000-140640000'
if(file.exists(cicero_conn.file)){
conns = fread(cicero_conn.file)
conns = data.frame(conns)
temp <- tempfile()
if(grepl(GENOME_NAME, pattern = 'mm10', ignore.case = T)) {
download.file('ftp://ftp.ensembl.org/pub/release-95/gtf/mus_musculus/Mus_musculus.GRCm38.95.gtf.gz', temp)
}
if(grepl(GENOME_NAME, pattern = 'mm9', ignore.case = T)) {
download.file('ftp://ftp.ensembl.org/pub/release-67/gtf/mus_musculus/Mus_musculus.NCBIM37.67.gtf.gz', temp)
}
if(grepl(GENOME_NAME, pattern = 'hg38', ignore.case = T)) {
download.file('ftp://ftp.ensembl.org/pub/release-95/gtf/homo_sapiens/Homo_sapiens.GRCh38.95.gtf.gz', temp)
}
if(grepl(GENOME_NAME, pattern = 'hg19', ignore.case = T)) {
download.file('ftp://ftp.ensembl.org/pub/release-67/gtf/homo_sapiens/Homo_sapiens.GRCh37.67.gtf.gz', temp)
}
gene_anno <- rtracklayer::readGFF(temp)
unlink(temp)
# rename some columns to match requirements
gene_anno$chromosome <- paste0("chr", gene_anno$seqid)
gene_anno$gene <- gene_anno$gene_id
gene_anno$transcript <- gene_anno$transcript_id
gene_anno$symbol <- gene_anno$gene_name
gene_anno = subset(gene_anno, select = c(chromosome, start, end, strand,
transcript, gene, symbol))
gene_anno = gene_anno[complete.cases(gene_anno), ]
chr0 = unlist(strsplit(Cicero_Plot_Region, ':'))[1] ## chr5:140610000-140640000
region0 = unlist(strsplit(Cicero_Plot_Region, ':'))[2]
start0 = as.integer(unlist(strsplit(region0, '-'))[1])
end0 = as.integer(unlist(strsplit(region0, '-'))[2])
cicero::plot_connections(conns, chr0, start0, end0,
gene_model = gene_anno,
coaccess_cutoff = .3,
connection_width = 1,
collapseTranscripts = "longest",
viewpoint_alpha = 0)
}
integrated_obj <- readRDS(paste0(output_dir, 'integrated/seurat_obj_VFACS.rds'))
DimPlot(integrated_obj, group.by = 'sample')
DimPlot(integrated_obj, group.by = 'active_clusters')
You can also re-cluster the cells with different number of reduced dimension or resolutions, for example
seurat_obj <- RunPCA(seurat_obj, npcs = 20)
seurat_obj <- FindNeighbors(seurat_obj, reduction = 'pca', dims = 1:20)
seurat_obj <- FindClusters(seurat_obj, resolution = 0.4)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 6783
## Number of edges: 257994
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9244
## Number of communities: 12
## Elapsed time: 0 seconds
seurat_obj$active_clusters = seurat_obj$seurat_clusters
DimPlot(seurat_obj)
If you want to cluster the data into k clusters, 8 for instance, we provided a query function which helps you looking for the corresponding resolution parameter.
resl <- queryResolution4Seurat(seurat_obj, k = 8, reduction = 'pca',
npc = 20, min_resl = 0.1, max_resl = 1,
max_iter = 15)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 6783
## Number of edges: 257994
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9719
## Number of communities: 8
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 6783
## Number of edges: 257994
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8580
## Number of communities: 18
## Elapsed time: 0 seconds
seurat_obj <- FindClusters(seurat_obj, resolution = resl)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 6783
## Number of edges: 257994
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9719
## Number of communities: 8
## Elapsed time: 1 seconds
seurat_obj$active_clusters = seurat_obj$seurat_clusters
DimPlot(seurat_obj)
You can recluster the data by different methods, such as kmeans ( generalCluster function), cisTopic (run_cisTopic), scABC (run_scABC), SCRAT (run_scrat) and chromVAR (run_chromVAR).
## further filter matrix
mtx <- filterMat(mtx, minFrac_in_cell = 0.01, min_depth = 1000, max_depth = 50000)
## create a new seurat obj for visualize and other analysis
seurat_obj.new <- runSeurat_Atac(mtx, npc = 20, norm_by = 'tf-idf',
top_variable_features = 5000, reg.var = 'nCount_ATAC')
seurat_obj.new <- RunUMAP(seurat_obj.new, dims = 1:20)
cl.labels <- run_scrat(mtx, reduction = 'pca', max_pc = 20, k = 8)
seurat_obj.new$active_clusters = cl.labels
DimPlot(seurat_obj.new, group.by = 'active_clusters')
This method will take a while.
cisTopic.obj <- run_cisTopic(mtx, nCores = 5, topic = c(10, 20, 30, 50, 100),
frac_in_cell = 0.05)
## select the best model and lda as a new dimension reduction the seurat obj
sele.model <- cisTopic::selectModel(cisTopic.obj, select = nREDUCTION,
keepBinaryMatrix = F, keepModels = F)
cell_topic <- t(modelMatSelection(sele.model, 'cell', 'Probability'))
cl.labels = generalCluster(cell_topic, method = 'hclust', k = 8)
seurat_obj.new[['lda']] <- CreateDimReducObject(embeddings = cell_topic,
key = '_Topic', assay = DefaultAssay(seurat_obj.new))
## You can run umap or clustering on lda by specifying reduction='lda'
seurat_obj.new <- RunUMAP(seurat_obj.new, dims = 1:ncol(cell_topic), reduction = 'lda')
seurat_obj.new <- FindNeighbors(seurat_obj.new, reduction = 'lda', dims = 1:ncol(cell_topic))
seurat_obj.new <- FindClusters(seurat_obj.new, resolution = 0.4)
seurat_obj.new$active_clusters = seurat_obj.new$seurat_cluster
DimPlot(seurat_obj.new)
cl.labels <- generalCluster(seurat_obj.new@reductions$pca@cell.embeddings,
method = 'kmeans', k = 8)
## if you have already run motif analysis, chromvar obj was saved and
chromVar.obj <- readRDS(paste0(down_dir, '/chromVar_obj.rds'))
## otherwise
#chromVar.obj <- run_chromVAR(mtx, genomeName = 'BSgenome.Hsapiens.UCSC.hg38', ncore = 4)
zscore = chromVar.obj@assays@data$z
zscore = dev.score[, colnames(dev.score) %in% colnames(mtx)]
pca_coords = doDimReduction4mat(zscore, max_pc = 20)[[1]]
cl.labels = cutree(hclust(dist(pca_coords)), k = 8)
This method is also pretty slow.
cl.labels <- run_scABC(mtx, k = 8)
This is the original LSI method (the first PC was discarded), and you can specify different number of PCs and filter peaks
cl.labels <- run_LSI(mtx, ncell.peak = 150, max_pc = 10, k = 8)
seurat_obj.new$active_clusters = cl.labels
DimPlot(seurat_obj.new, group.by = 'active_clusters')