cg <- DimPlot(ss, label = T, reduction = 'umap', do.return = F)
cg + scale_color_manual(values = hue_pal()(6))
cg
cg <- DimPlot(ss, label = T, reduction = 'umap')
cg + scale_color_brewer(palette="Dark2")
cg + scale_color_brewer(palette="Set2")
cg + scale_color_brewer(palette="Set3")
cg + scale_color_brewer(palette="Paired")
cg + scale_color_brewer(palette="Set1")
cg + scale_color_brewer(palette="Dark2")
fg <- FeaturePlot(ss, features = 'nCount_ATAC', reduction = 'umap') + ggtitle('')
#ggplotly(fg)
fg
#ggplotly(fg)
fg + scale_color_brewer(palette = "Blues")
#ggplotly(fg)
fg + scale_color_brewer(palette = "Dark2")
#ggplotly(fg)
fg + scale_color_brewer(palette = "Dark2")
#ggplotly(fg)
fg
help("FeaturePlot")
ctypes = c(levels(metaData@seurat_cluster))
ss
ctypes = c(levels(metaData$seurat_cluster))
ctypes
ctypes = brewer.pal(n=6, name = 'Dark2')
ctypes
names(ctypes) = level(metaData$seurat_cluster)
names(ctypes) = levels(metaData$seurat_cluster)
pheatmap::pheatmap(sele.zscores, cluster_cols = F, show_colnames = F,
annotation_col = ann_col, color = viridis(100), annotation_colors = list(ctypes))
ann_col
cluster = brewer.pal(n=6, name = 'Dark2')
names(cluster) = levels(metaData$seurat_cluster)
ann_colors = list(cluster)
pheatmap::pheatmap(sele.zscores, cluster_cols = F, show_colnames = F,
annotation_col = ann_col, color = viridis(100),
annotation_colors = ann_colors)
ann_colors
ann_colors = list('cluster' = cluster)
pheatmap::pheatmap(sele.zscores, cluster_cols = F, show_colnames = F,
annotation_col = ann_col, color = viridis(100),
annotation_colors = ann_colors)
rids = sample(1:ncol(sele.zscores), 1000)
ann_col = ann_col[rids, ]
pheatmap::pheatmap(sele.zscores, cluster_cols = F, show_colnames = F,
annotation_col = ann_col, color = viridis(100),
annotation_colors = ann_colors)
# resample to reduce memory used
rids = sample(1:ncol(sele.zscores), floor(ncol(sele.zscores)/4))
tail(zscores)
zscores
dim(zscores)
library(flexdashboard)
library(data.table)
library(magrittr)
library(kableExtra)
library(RColorBrewer)
library(ggplot2)
library(plotly)
ss = readRDS(paste0(params$downstream_dir, '/seurat_obj_withCluster.rds'))
library(viridis)
library(chromVAR)
library(BiocParallel)
register(SerialParam())
metaData = ss@meta.data
rm(ss)
chromVar.obj = readRDS(paste0(params$downstream_dir, '/chromVar_obj.rds'))
variability <- computeVariability(chromVar.obj)
variability = data.table(variability, stringsAsFactors = F)
variability = variability[order(-variability), ]
## plot enriched TFs in heatmap
sele.tfs = as.character(variability$name[1:50])
zscores = chromVar.obj@assays$data$z
rnames = rownames(zscores)
rownames(zscores) = sapply(rnames, function(x) unlist(strsplit(x, '_'))[2])
sele.zscores = zscores[rownames(zscores) %in% sele.tfs, ]
metaData = data.table(metaData, keep.rownames = T)
setkey(metaData, seurat_cluster)
sele.zscores = sele.zscores[, metaData$rn]
ann_col = data.table(cluster = metaData$seurat_cluster)
colnames(ann_col) = 'cluster'
rownames(ann_col) = metaData$rn
sele.zscores[sele.zscores > 10] = 10
sele.zscores[sele.zscores < -10] = -10
cluster = brewer.pal(n=6, name = 'Dark2')
names(cluster) = levels(metaData$seurat_cluster)
ann_colors = list('cluster' = cluster)
# resample to reduce memory used
rids = sample(1:ncol(sele.zscores), floor(ncol(sele.zscores)/4))
ann_col = ann_col[rids, ]
pheatmap::pheatmap(sele.zscores[, rids], cluster_cols = F, show_colnames = F,
annotation_col = ann_col, color = viridis(100),
annotation_colors = ann_colors)
ann_col
zscores = chromVar.obj@assays$data$z
rnames = rownames(zscores)
rownames(zscores) = sapply(rnames, function(x) unlist(strsplit(x, '_'))[2])
sele.zscores = zscores[rownames(zscores) %in% sele.tfs, ]
metaData = data.table(metaData, keep.rownames = T)
setkey(metaData, seurat_cluster)
sele.zscores = sele.zscores[, metaData$rn]
ann_col = data.frame(cluster = metaData$seurat_cluster)
colnames(ann_col) = 'cluster'
rownames(ann_col) = metaData$rn
sele.zscores[sele.zscores > 10] = 10
sele.zscores[sele.zscores < -10] = -10
cluster = brewer.pal(n=6, name = 'Dark2')
names(cluster) = levels(metaData$seurat_cluster)
ann_colors = list('cluster' = cluster)
# resample to reduce memory used
rids = sample(1:ncol(sele.zscores), floor(ncol(sele.zscores)/4))
ann_col = ann_col[rids, ]
pheatmap::pheatmap(sele.zscores[, rids], cluster_cols = F, show_colnames = F,
annotation_col = ann_col, color = viridis(100),
annotation_colors = ann_colors)
ann_col
rownames(ann_col)
metaData$rn
ann_col = data.frame('cluster' = metaData$seurat_cluster)
rownames(ann_col) = metaData$rn
ann_col
sele.zscores[sele.zscores > 10] = 10
sele.zscores[sele.zscores < -10] = -10
cluster = brewer.pal(n=6, name = 'Dark2')
names(cluster) = levels(metaData$seurat_cluster)
ann_colors = list('cluster' = cluster)
# resample to reduce memory used
rids = sample(1:ncol(sele.zscores), floor(ncol(sele.zscores)/4))
ann_col = ann_col[rids, ]
pheatmap::pheatmap(sele.zscores[, rids], cluster_cols = F, show_colnames = F,
annotation_col = ann_col, color = viridis(100),
annotation_colors = ann_colors)
dim(sele.zscores[, rids])
dim(ann_col)
class(ann_col)
rids
ann_col = data.frame('cluster' = metaData$seurat_cluster)
rownames(ann_col) = metaData$rn
ann_col
ann_col = ann_col[rids, ]
ann_col
length(rids)
ann_col = data.frame('cluster' = metaData$seurat_cluster)
rownames(ann_col) = metaData$rn
ann_col
dim(ann_col)
ann_col[rids, ]
ann_col[1:10, ]
ann_col
class(ann_col)
rids
vlass(rids)
class(rids)
rownames(ann_col)
ann_col = data.table('cluster' = metaData$seurat_cluster)
rownames(ann_col) = metaData$rn
rownames(ann_col)
sele.zscores[sele.zscores > 10] = 10
sele.zscores[sele.zscores < -10] = -10
cluster = brewer.pal(n=6, name = 'Dark2')
names(cluster) = levels(metaData$seurat_cluster)
ann_colors = list('cluster' = cluster)
# resample to reduce memory used
rids = sample(1:ncol(sele.zscores), floor(ncol(sele.zscores)/4))
ann_col = ann_col[rids, ]
ann_col
pheatmap::pheatmap(sele.zscores[, rids], cluster_cols = F, show_colnames = F,
annotation_col = ann_col, color = viridis(100),
annotation_colors = ann_colors)
rownames(ann_col) = colnames(sele.zscores)[rids]
pheatmap::pheatmap(sele.zscores[, rids], cluster_cols = F, show_colnames = F,
annotation_col = ann_col, color = viridis(100),
annotation_colors = ann_colors)
rownames(ann_col)
ann_col = data.table('cluster' = metaData$seurat_cluster)
rownames(ann_col) = metaData$rn
pheatmap::pheatmap(sele.zscores, cluster_cols = F, show_colnames = F,
annotation_col = ann_col, color = viridis(100),
annotation_colors = ann_colors)
dim(sele.zscores)
sele.zscores[, 1:3]
# resample to reduce memory used
rids = sort(sample(1:ncol(sele.zscores), floor(ncol(sele.zscores)/4)))
ann_col = ann_col[rids, ]
rownames(ann_col) = colnames(sele.zscores)[rids]
pheatmap::pheatmap(sele.zscores[, rids], cluster_cols = F, show_colnames = F,
annotation_col = ann_col, color = viridis(100),
annotation_colors = ann_colors)
ann_col = data.table('cluster' = metaData$seurat_cluster)
rownames(ann_col) = metaData$rn
rids = sort(sample(1:ncol(sele.zscores), floor(ncol(sele.zscores)/4)))
ann_col0 = ann_col[rids, ]
rownames(ann_col) = colnames(sele.zscores)[rids]
rownames(ann_col0) = colnames(sele.zscores)[rids]
pheatmap::pheatmap(sele.zscores[, rids], cluster_cols = F, show_colnames = F,
annotation_col = ann_col0, color = viridis(100),
annotation_colors = ann_colors)
pheatmap::pheatmap(sele.zscores, cluster_cols = F, show_colnames = F,
annotation_col = ann_col, color = viridis(100),
annotation_colors = ann_colors)
help(pheatmap)
pheatmap::pheatmap(sele.zscores[, rids], cluster_cols = F, show_colnames = F,
annotation_col = ann_col0, color = viridis(100),
annotation_colors = ann_colors, fontsize = 10)
pheatmap::pheatmap(sele.zscores[, rids], cluster_cols = F, show_colnames = F,
annotation_col = ann_col0, color = viridis(100),
annotation_colors = ann_colors, fontsize = 8)
pheatmap::pheatmap(sele.zscores[, rids], cluster_cols = F, show_colnames = F,
annotation_col = ann_col0, color = viridis(100),
annotation_colors = ann_colors, fontsize_row = 8)
pheatmap::pheatmap(sele.zscores[, rids], cluster_cols = F, show_colnames = F,
annotation_col = ann_col0, color = viridis(100),
annotation_colors = ann_colors, fontsize_row = 2)
pheatmap::pheatmap(sele.zscores[, rids], cluster_cols = F, show_colnames = F,
annotation_col = ann_col0, color = viridis(100),
annotation_colors = ann_colors, fontsize_row = 7)
library(data.table)
library(magrittr)
library(Rcpp)
## get reverse compliment of a sequence
rev.comp <- function(x, rev=TRUE){
x <- toupper(x)
y <- rep("N", nchar(x))
xx <- unlist(strsplit(x, NULL))
for (bbb in 1:nchar(x))
{
if(xx[bbb]=="A") y[bbb]<-"T"
if(xx[bbb]=="C") y[bbb]<-"G"
if(xx[bbb]=="G") y[bbb]<-"C"
if(xx[bbb]=="T") y[bbb]<-"A"
}
if(!rev)
{
for(ccc in (1:nchar(x)))
{
if(ccc==1) yy <- y[ccc] else yy <- paste(yy, y[ccc], sep="")
}
}
if(rev)
{
zz<-rep(NA, nchar(x))
for(ccc in (1:nchar(x)))
{
zz[ccc] <- y[nchar(x)+1-ccc]
if(ccc == 1) yy <- zz[ccc] else yy <- paste(yy, zz[ccc],sep="")
}
}
return(yy)
}
setwd("~/yuw1/scATAC-pro/scripts")
frag.file = '~/yuw1/run_scATAC-pro/output/fragments/fragments_MAPQ30.bed'
whitelist.file ='~/yuw1/run_scATAC-pro/737K-cratac-v1.txt'
frags = fread(frag.file, header = F)
names(frags) = c('chr', 'start', 'end', 'bc', 'ndup')
white.list = fread(whitelist.file, header = F)
names(white.list) = 'wbc'
white.list$wbc = sapply(white.list$wbc, rev.comp)
setkey(frags, bc)
setkey(white.list, wbc)
frags1 = frags[bc %in% white.list$wbc]
frags0 = frags[!bc %in% white.list$wbc]
rm(frags)
freq.white = subset(frags1, select = 'bc')
freq.white[, 'N' := .N, by = bc]
freq.white %<>% .[!duplicated(.)]
freq.white %<>% .[order(-N), ]
# for each non white barcode, get best matched bc in white list
freq.nonwhite = subset(frags0,  select = 'bc')
nbc = unique(freq.nonwhite$bc)
fraq.nonwhite
fr3q.nonwhite
freq.nonwhite
freq.nonwhite[, 'N' := .N, by = bc]
freq.nonwhite %<>% .[!duplicated(.)]
freq.nonwhite
x=nbc[1]
stringdist::amatch(x, freq.white$bc,
maxDist=1, method = 'hamming'))
stringdist::amatch(x, freq.white$bc,
maxDist=1, method = 'hamming')
stringdist::amatch(x, white.list$wbc,
maxDist=1, method = 'hamming')
white.list$wbc
sort(white.list$wbc)[1:20]
tail(sort(white.list$wbc))
nbc2.wbc = sapply(nbc, function(x) stringdist::amatch(x, freq.white$bc,
maxDist=1, method = 'hamming'))
qc_per_bc_file = '~/yuw1/run_scATAC-pro/output/qc_result/ss.bwa.qc_per_barcode.bed'
qc_stat = fread(qc_per_bc_file)
qc_stat
aa = CGACGTACTCCAACGC
aa = 'CGACGTACTCCAACGC'
aa
aa %in% freq.white$bc
aa %in% whitelist$wbc
aa %in% white.list$wbc
bcs = qc_stat$bc
all(bcs%in%white.list$wbc)
which(!bcs%in%white.list$wbc)
aa = which(!bcs%in%white.list$wbc)
qc_per_bc_file[aa]
qc_per_bc_file[aa,]
qc_per_bc_file
qc_stat[aa,]
summary(qc_stat[aa,]$total_frags)
summary(qc_stat[!aa,]$total_frags)
tmp = qc_stat[aa, ]
tmp = tmp[order(total_frags)]
tmp
stringdist::amatch('GGGGGGGGGGGGGGGG', white.list$wbc, maxDist = 1)
stringdist::amatch('GGGGGGGGGGGGGGGG', white.list$wbc, maxDist = 2)
stringdist::amatch('GGGGGGGGGGGGGGGG', white.list$wbc, maxDist = 3)
stringdist::amatch('GGGGGGGGGGGGGGGG', white.list$wbc, maxDist = 4)
stringdist::amatch('GGGGGGGGGGGGGGGG', white.list$wbc, maxDist = 5)
stringdist::amatch('GGGGGGGGGGGGGGGG', white.list$wbc, maxDist = 6)
tmp
stringdist::amatch('TGGGGGGGGGGGGGGG', white.list$wbc, maxDist = 6)
stringdist::amatch('TGGGGGGGGGGGGGGG', white.list$wbc, maxDist = 5)
stringdist::amatch('TGGGGGGGGGGGGGGG', white.list$wbc, maxDist = 4)
input_mtx_file = '~/yuw1/run_scATAC-pro/output/raw_matrix/ss.bwa.peak.barcode.mtx'
mat = readMM(input_mtx_file)
library(Matrix)
mat = readMM(input_mtx_file)
## read matrix data
input_mtx_dir = dirname(input_mtx_file)
barcodes = fread(paste0(input_mtx_dir, '/barcodes.txt'), header = F)
colnames(mat) = barcodes$V1
message(paste0("The barcodes correspond to raw matrix:", input_mtx_dir, '/barcodes.txt'))
## filter barcodes by frac_in_peak
qc_per_bc = fread(qc_per_bc_file)
peak_cov_frac = 0.03
qc_sele_bc = qc_per_bc[frac_peak >= peak_cov_frac]
qc_sele_bc
## substract counts due to contamination (rate 0.02)
CN = round(sum(qc_sele_bc$total_frags)/nrow(qc_sele_bc) * 0.02)
qc_sele_bc[, 'total_frags' := total_frags -CN]
qc_sele_bc = qc_sele_bc[total_frags >= 0]
qc_sele_bc
CN
median(qc_sele_bc$total_frags)
median(qc_sele_bc$total_frags)*0.02
qc_per_bc = fread(qc_per_bc_file)
qc_sele_bc = qc_per_bc[frac_peak >= peak_cov_frac]
## substract counts due to contamination (rate 0.02)
CN = max(1, round(meian(qc_sele_bc$total_frags)* 0.02))
qc_sele_bc[, 'total_frags' := total_frags -CN]
qc_sele_bc = qc_sele_bc[total_frags >= 0]
CN = max(1, round(median(qc_sele_bc$total_frags)* 0.02))
qc_sele_bc[, 'total_frags' := total_frags -CN]
qc_sele_bc = qc_sele_bc[total_frags >= 0]
qc_sele_bc
qc_per_bc = fread(qc_per_bc_file)
qc_sele_bc = qc_per_bc[frac_peak >= peak_cov_frac]
## substract counts due to contamination (rate 0.02)
CN = max(1, round(median(qc_sele_bc$total_frags)* 0.02))
qc_sele_bc[, 'total_frags' := total_frags -CN]
qc_sele_bc = qc_sele_bc[total_frags >= 0]
CN
qc_sele_bc
mean(qc_sele_bc$total_frags)
mean(qc_sele_bc$total_frags) * 0.02
qc_sele_bc
mat[, 'AGGGCGTCTGCTGAGA']
aa = mat[, 'AGGGCGTCTGCTGAGA']
flexmix(aa ~ 1, k = 2)
library(flexmix)
library(countreg)
flexmix(aa ~ 1, k = 2)
fm0 =  flexmix(aa ~ 1, k = 2, model = FLXMRnegbin())
flexmix(qc_sele_bc$total_frags[1:10000] ~ 1, k = 2)
fm0 <- flexmix(qc_sele_bc$total_frags[1:10000] ~ 1, k = 2, model = FLXMRnegbin())
prob1 = posterior(fm0)[, 1]
prob2 = posterior(fm0)[, 2]
parameters(fm0)
fm0 <- flexmix(qc_sele_bc$total_frags[1:100000] ~ 1, k = 2, model = FLXMRnegbin())
prob1 = posterior(fm0)[, 1]
prob2 = posterior(fm0)[, 2]
parameters(fm0)
prob1
plot(prob1, prob2)
qc_sele_bc
prob1[2]
porsterior(fm0)
possterior(fm0)
posterior(fm0)
flexmix(qc_sele_bc$total_frags[1:10000] ~ 1, k = 2)
flexmix(qc_sele_bc$total_frags[1:100000] ~ 1, k = 2)
fm0 <- flexmix(qc_sele_bc$total_frags[1:100000] ~ 1, k = 2, model = FLXMRnegbin())
prob1 = posterior(fm0)[, 1]
prob2 = posterior(fm0)[, 2]
parameters(fm0)
prob1[1:10]
aa = which(prob1/prob2 > 100000)
aa
aa = which(prob1/prob2 < 1/100000)
aa
length(aa)
parameters(fm0)
## fit two NB mixture model & using signal to noisy ratio to select cells
n_in_peak = Matrix::colSums(mat)
head(n_in_peak)
n_in_peak = n_in_peak[names(n_in_peak) %in% qc_sele_bc$bc]
length(n_in_peak)
qc_sele_bc
dim(mat)
flexmix(n_in_peak ~ 1, k = 2)
fm0 <- flexmix(n_in_peak ~ 1, k = 2, model = FLXMRnegbin())
prob1 = posterior(fm0)[, 1]
prob2 = posterior(fm0)[, 2]
mus = parameters(fm0)[1, ]
mus
if(mus[1] > mus[2]){
odd = prob1/prob2
}else{
odd = prob2/prob1
}
aa = which(odd > 100000)
aa
qc_per_bc[aa]
qc_per_bc[bc %in% names(n_in_peak)[aa]]
summary(qc_per_bc[bc %in% names(n_in_peak)[aa]]$total_frags)
aa = which(odd > 1000000)
length(aa)
aa = which(odd > 10000000)
length(aa)
qc_sele_bc = qc_sele_bc[order(-total_frags), ]
qc_sele_bc
xx = qc_sele_bc$total_frags
yy = round(qc_sele_bc$total_frags * qc_sele_bc$frac_peak)
plot(xx, yy, log = 'x')
plot(1:length(xx), yy, log = 'y')
ss = sort(sample(1:length(xx), 10000))
plot(ss, yy[ss], log='y')
plot(ss, yy[ss], log='y', log='x')
plot(ss, yy[ss], log='xy')
aa = which(odd > 100000)
length(aa)
flexmix(n_in_peak ~ 1, k = 2, model = FLXMRnegbin(theta = 1))
fm0 <- flexmix(n_in_peak ~ 1, k = 2, model = FLXMRnegbin())
prob1 = posterior(fm0)[, 1]
prob2 = posterior(fm0)[, 2]
mus = parameters(fm0)[1, ]
parameters(fm0)
if(mus[1] > mus[2]){
odd = prob1/prob2
}else{
odd = prob2/prob1
}
aa = which(odd > 100000)
length(aa)
mean(odd[aa]) / mean(odd[-aa])
mean(odd[aa])
odd[aa]
odd[-aa]
median(odd[aa])
median(odd[-aa])
str(fm0)
table(fm0$cluster)
table(fm0@cluster)
length(which(prob1>0.5))
length(prob1 > 0.999)
length(prob1 > 0.99999)
length(which(prob1 > 0.99999))
length(which(prob1 > 0.999999))
length(which(prob1 > 0.9999999))
length(which(prob1 > 0.99999999))
length(which(prob1 > 0.999999999))
length(which(prob1 > 0.9999999999))
length(which(prob1 > 0.99999999999))
length(which(prob1 > 0.999999999999))
length(which(prob1 > 0.9999999999999))
length(which(prob1 > 0.99999999999999))
length(which(prob1 > 0.999999999999999))
length(which(prob1 > 0.9999999999999999))
length(which(prob1 > 0.99999999999999999))
length(which(prob1 > 0.9999999999999999))
min(prob2)
length(which(prob1 == 1))
length(which(prob1 > 0.9999))
aa = which(odd == 1)
select.cells = names(n_in_peak)[aa]
qc_sele_bc[bc%in%select.cells]
select.cells
aa = which(odd == 1)
if(mus[1] > mus[2]){
#odd = prob1/prob2
odd = prob1
}else{
#odd = prob2/prob1
odd = prob2
}
aa = which(odd == 1)
select.cells = names(n_in_peak)[aa]
length(select.cells)
qc_sele_bc[bc%in%select.cells]
qc_sele_bc[bc%in%select.cells & total_frags > 500]
qc_sele_bc[bc%in%select.cells & total_frags > 100]
nrow(mat) * 4000
nrow(mat) * 4000/3*10^9
nrow(mat) * 4000/(3*10^9)
genome_size=3*10^9
peak_cov_frac = nrow(mat) * 4000/genome_size
peak_cov_frac
qc_sele_bc = qc_per_bc[frac_peak >= peak_cov_frac]
qc_sele_bc
peak_cov_frac = nrow(mat) * 1000/genome_size
peak_cov_frac
peak_cov_frac = min(0.05, nrow(mat) * 1000/genome_size)
qc_sele_bc = qc_per_bc[frac_peak >= peak_cov_frac]
qc_sele_bc
