names(gene_ann)[4] <- "gene"
input_cds <- annotate_cds_by_site(input_cds, gene_ann)
# generate unnormalized gene activity matrix
unnorm_ga <- build_gene_activity_matrix(input_cds, conns)
# make a list of num_genes_expressed
num_genes <- pData(input_cds)$num_genes_expressed
names(num_genes) <- row.names(pData(input_cds))
# normalize
cicero_gene_activities <- normalize_gene_activities(unnorm_ga, num_genes)
dim(unnorm_ga)
dim(num_genes)
length(num_genes)
normalize_gene_activities
num_genes
length(num_genes)
length(unique(num_genes))
pData(input_cds)
input_cds
normalize_gene_activities
help("normalize_gene_activities")
class(unnorm_ga)
dim(unnorm_ga)
# normalize
cicero_gene_activities <- normalize_gene_activities(unnorm_ga, num_genes)
normalize_gene_activities
activity_matrices = unnorm_ga
cell_num_genes
cell_num_genes = num_genes
if (!is.list(activity_matrices)) {
scores <- activity_matrices
normalization_df <- data.frame(cell = colnames(activity_matrices),
cell_group = 1)
}
scores
dim(scores)
dim(activity_matrices)
length(cell_num_genes)
rownames(scores)[1:10]
scores <- scores[Matrix::rowSums(scores) != 0, Matrix::colSums(scores) !=
0]
dim(scores)
length(num_genes)
normalization_df$cell_group <- factor(normalization_df$cell_group)
normalization_df$total_activity <- Matrix::colSums(scores)
dim(normalization_df)
dim(scores)
normalization_df = subset(normalization_df, cell %in% colnames(scores))
normalization_df$cell_group <- factor(normalization_df$cell_group)
normalization_df$total_activity <- Matrix::colSums(scores)
cell_num_genes = cell_num_genes[cell %in% colnames(scores)]
if (!is.list(activity_matrices)) {
scores <- activity_matrices
normalization_df <- data.frame(cell = colnames(activity_matrices),
cell_group = 1)
}
scores <- scores[Matrix::rowSums(scores) != 0, Matrix::colSums(scores) !=
0]
cell_num_genes = cell_num_genes[normalization_df$cell %in% colnames(scores)]
normalization_df = subset(normalization_df, cell %in% colnames(scores))
normalization_df$cell_group <- factor(normalization_df$cell_group)
normalization_df$total_activity <- Matrix::colSums(scores)
normalization_df$total_sites <- cell_num_genes[as.character(normalization_df$cell)]
if (!is.list(activity_matrices)) {
activity_model <- stats::lm(log(total_activity) ~ log(total_sites),
data = normalization_df)
}
else {
activity_model <- stats::lm(log(total_activity) ~ log(total_sites) *
cell_group, data = normalization_df)
}
normalization_df$fitted_curve <- exp(as.vector(predict(activity_model,
type = "response")))
size_factors <- log(normalization_df$fitted_curve)/mean(log(normalization_df$fitted_curve))
size_factors <- Matrix::Diagonal(x = 1/size_factors)
row.names(size_factors) <- normalization_df$cell
colnames(size_factors) <- row.names(size_factors)
scores <- Matrix::t(size_factors %*% Matrix::t(scores))
scores@x <- pmin(1e+09, exp(scores@x) - 1)
sum_activity_scores <- Matrix::colSums(scores)
scale_factors <- Matrix::Diagonal(x = 1/sum_activity_scores)
row.names(scale_factors) <- normalization_df$cell
colnames(scale_factors) <- row.names(scale_factors)
scores <- Matrix::t(scale_factors %*% Matrix::t(scores))
if (!is.list(activity_matrices)) {
ret <- scores[row.names(activity_matrices), colnames(activity_matrices)]
}
ret <- scores[row.names(activity_matrices), colnames(activity_matrices)]
scores <- Matrix::t(size_factors %*% Matrix::t(scores))
dim(scores)
scores@x <- pmin(1e+09, exp(scores@x) - 1)
sum_activity_scores <- Matrix::colSums(scores)
scale_factors <- Matrix::Diagonal(x = 1/sum_activity_scores)
row.names(scale_factors) <- normalization_df$cell
colnames(scale_factors) <- row.names(scale_factors)
scores <- Matrix::t(scale_factors %*% Matrix::t(scores))
rownames(scores)
dim(scores)
ret <- scores[row.names(activity_matrices), colnames(activity_matrices)]
if (!is.list(activity_matrices)) {
scores <- activity_matrices
normalization_df <- data.frame(cell = colnames(activity_matrices),
cell_group = 1)
}
else {
scores <- do.call(cbind, activity_matrices)
normalization_df <- do.call(rbind, lapply(seq_along(activity_matrices),
function(x) {
data.frame(cell = colnames(activity_matrices[[x]]),
cell_group = rep(x, ncol(activity_matrices[[x]])))
}))
}
if (!is.list(activity_matrices)) {
scores <- activity_matrices
normalization_df <- data.frame(cell = colnames(activity_matrices),
cell_group = 1)
}
else {
scores <- do.call(cbind, activity_matrices)
normalization_df <- do.call(rbind, lapply(seq_along(activity_matrices),
function(x) {
data.frame(cell = colnames(activity_matrices[[x]]),
cell_group = rep(x, ncol(activity_matrices[[x]])))
}))
}
scores <- scores[Matrix::rowSums(scores) != 0, Matrix::colSums(scores) !=
0]
if (!is.list(activity_matrices)) {
scores <- activity_matrices
normalization_df <- data.frame(cell = colnames(activity_matrices),
cell_group = 1)
}else {
scores <- do.call(cbind, activity_matrices)
normalization_df <- do.call(rbind, lapply(seq_along(activity_matrices),
function(x) {
data.frame(cell = colnames(activity_matrices[[x]]),
cell_group = rep(x, ncol(activity_matrices[[x]])))
}))
}
scores <- scores[Matrix::rowSums(scores) != 0, Matrix::colSums(scores) !=
0]
## correct by adding following 3 lines
activity_matrices <- scores
cell_num_genes = cell_num_genes[normalization_df$cell %in% colnames(scores)]
normalization_df = subset(normalization_df, cell %in% colnames(scores))
normalization_df$cell_group <- factor(normalization_df$cell_group)
normalization_df$total_activity <- Matrix::colSums(scores)
normalization_df$total_sites <- cell_num_genes[as.character(normalization_df$cell)]
if (!is.list(activity_matrices)) {
activity_model <- stats::lm(log(total_activity) ~ log(total_sites),
data = normalization_df)
}
else {
activity_model <- stats::lm(log(total_activity) ~ log(total_sites) *
cell_group, data = normalization_df)
}
if (!is.list(activity_matrices)) {
activity_model <- stats::lm(log(total_activity) ~ log(total_sites),
data = normalization_df)
} else {
activity_model <- stats::lm(log(total_activity) ~ log(total_sites) *
cell_group, data = normalization_df)
}
normalization_df$fitted_curve <- exp(as.vector(predict(activity_model,
type = "response")))
size_factors <- log(normalization_df$fitted_curve)/mean(log(normalization_df$fitted_curve))
activity_matrices
activity_matrices = unnorm_ga
cell_num_genes = num_genes
if (!is.list(activity_matrices)) {
scores <- activity_matrices
normalization_df <- data.frame(cell = colnames(activity_matrices),
cell_group = 1)
}else {
scores <- do.call(cbind, activity_matrices)
normalization_df <- do.call(rbind, lapply(seq_along(activity_matrices),
function(x) {
data.frame(cell = colnames(activity_matrices[[x]]),
cell_group = rep(x, ncol(activity_matrices[[x]])))
}))
}
scores <- scores[Matrix::rowSums(scores) != 0, Matrix::colSums(scores) !=
0]
dim(scores)
cell_num_genes = cell_num_genes[normalization_df$cell %in% colnames(scores)]
normalization_df = subset(normalization_df, cell %in% colnames(scores))
##
normalization_df$cell_group <- factor(normalization_df$cell_group)
normalization_df$total_activity <- Matrix::colSums(scores)
normalization_df$total_sites <- cell_num_genes[as.character(normalization_df$cell)]
if (!is.list(activity_matrices)) {
activity_model <- stats::lm(log(total_activity) ~ log(total_sites),
data = normalization_df)
} else {
activity_model <- stats::lm(log(total_activity) ~ log(total_sites) *
cell_group, data = normalization_df)
}
activity_matrices
normalization_df$fitted_curve <- exp(as.vector(predict(activity_model,
type = "response")))
size_factors <- log(normalization_df$fitted_curve)/mean(log(normalization_df$fitted_curve))
size_factors <- Matrix::Diagonal(x = 1/size_factors)
row.names(size_factors) <- normalization_df$cell
colnames(size_factors) <- row.names(size_factors)
dim(Matrix::t(size_factors %*% Matrix::t(scores)))
dim(scores)
scores <- Matrix::t(size_factors %*% Matrix::t(scores))
scores@x <- pmin(1e+09, exp(scores@x) - 1)
dim(scors)
dim(scores)
sum_activity_scores <- Matrix::colSums(scores)
scale_factors <- Matrix::Diagonal(x = 1/sum_activity_scores)
row.names(scale_factors) <- normalization_df$cell
colnames(scale_factors) <- row.names(scale_factors)
dim(Matrix::t(scale_factors %*% Matrix::t(scores)))
scores <- Matrix::t(scale_factors %*% Matrix::t(scores))
ret <- scores[row.names(activity_matrices),
colnames(activity_matrices)]
row.names(activity_matrices)
dim(scores)
dim(activity_matrices)
rn = row.names(activity_matrices)
cn = colnames(activity_matrices)
rn = rn[rn %in% row.names(scores)]
cn = rn[cn %in% colnames(scores)]
rn = row.names(activity_matrices)
cn = colnames(activity_matrices)
rn = rn[rn %in% row.names(scores)]
cn = cn[cn %in% colnames(scores)]
ret <- scores[rn, cn]
normalize_gene_activities.corrected <- function (activity_matrices, cell_num_genes)
{
if (!is.list(activity_matrices)) {
scores <- activity_matrices
normalization_df <- data.frame(cell = colnames(activity_matrices),
cell_group = 1)
}else {
scores <- do.call(cbind, activity_matrices)
normalization_df <- do.call(rbind, lapply(seq_along(activity_matrices),
function(x) {
data.frame(cell = colnames(activity_matrices[[x]]),
cell_group = rep(x, ncol(activity_matrices[[x]])))
}))
}
scores <- scores[Matrix::rowSums(scores) != 0, Matrix::colSums(scores) !=
0]
## correct by adding following lines
cell_num_genes = cell_num_genes[normalization_df$cell %in% colnames(scores)]
normalization_df = subset(normalization_df, cell %in% colnames(scores))
##
normalization_df$cell_group <- factor(normalization_df$cell_group)
normalization_df$total_activity <- Matrix::colSums(scores)
normalization_df$total_sites <- cell_num_genes[as.character(normalization_df$cell)]
if (!is.list(activity_matrices)) {
activity_model <- stats::lm(log(total_activity) ~ log(total_sites),
data = normalization_df)
} else {
activity_model <- stats::lm(log(total_activity) ~ log(total_sites) *
cell_group, data = normalization_df)
}
normalization_df$fitted_curve <- exp(as.vector(predict(activity_model,
type = "response")))
size_factors <- log(normalization_df$fitted_curve)/mean(log(normalization_df$fitted_curve))
size_factors <- Matrix::Diagonal(x = 1/size_factors)
row.names(size_factors) <- normalization_df$cell
colnames(size_factors) <- row.names(size_factors)
scores <- Matrix::t(size_factors %*% Matrix::t(scores))
scores@x <- pmin(1e+09, exp(scores@x) - 1)
sum_activity_scores <- Matrix::colSums(scores)
scale_factors <- Matrix::Diagonal(x = 1/sum_activity_scores)
row.names(scale_factors) <- normalization_df$cell
colnames(scale_factors) <- row.names(scale_factors)
scores <- Matrix::t(scale_factors %*% Matrix::t(scores))
if (!is.list(activity_matrices)) {
rn = row.names(activity_matrices)
cn = colnames(activity_matrices)
rn = rn[rn %in% row.names(scores)]
cn = cn[cn %in% colnames(scores)]
ret <- scores[rn, cn]
}
else {
ret <- lapply(activity_matrices, function(x) {
rn = row.names(x)
cn = colnames(x)
rn = rn[rn %in% row.names(x)]
cn = cn[cn %in% colnames(x)]
scores[rn, cn]
})
}
return(ret)
}
# normalize
cicero_gene_activities <- normalize_gene_activities.corrected(unnorm_ga, num_genes)
# if you had two datasets to normalize, you would pass both:
# num_genes should then include all cells from both sets
#unnorm_ga2 <- unnorm_ga
#cicero_gene_activities <- normalize_gene_activities(list(unnorm_ga, unnorm_ga2), num_genes)
conns = data.table(conns)
conns = conns[coaccess > coaccess_thr, ]
conns = conns[coaccess > 0.4, ]
res = list('conns' = conns, 'ga_score' = cicero_gene_activities)
connes
conns
library(chromVAR)
library(BiocParallel)
register(SerialParam())
seurat_file ='~/yuw1/run_scATAC-pro/buenrostro2018/output_trim/downstream_analysis/MACS2/FILTER/seurat_obj.rds'
ss = readRDS(seurat_file)
metaData = ss@meta.data
rm(ss)
chromVAR.obj = readRDS('~/yuw1/run_scATAC-pro/buenrostro2018/output_trim/downstream_analysis/MACS2/FILTER/chromVar_obj.rds')
library(chromVAR)
seuratObj_file = seurat_file
group1
group1='0;1'
group2='2'
seurat.obj = readRDS(seuratObj_file)
seurat.obj$active_clusters = as.character(seurat.obj$active_clusters)
group1 = tolower(group1)
group2 = tolower(group2)
confVar = 'nCount_ATAC'
if(test_use == 'wilcox' || test_use == 'DESeq2') confVar = NULL
test_use = 'wilcox'
mtx = seurat.obj@assays$ATAC@data
if(test_use %in% c('DESeq2', 'negbinom')){
mtx = seurat.obj@assays$ATAC@counts
}
## features that are only expressed less than 15% in any of the cluster
cls = unique(seurat.obj$active_clusters)
mean_frac_cls = sapply(cls, function(x){
dat0 = mtx[, seurat.obj$active_clusters == x]
Matrix::rowMeans(dat0 > 0)
})
sele.features = rownames(mean_frac_cls)[rowSums(mean_frac_cls > 0.15) > 1]
cls
length(sele.features)
## select variable features
#seurat.obj <- FindVariableFeatures(object = seurat.obj,
#                                         selection.method = 'vst',
#                                         nfeatures = floor(nrow(mtx) * 0.5))
#vFeatures = VariableFeatures(seurat.obj)
rnames = rownames(mtx)
mtx = mtx[rnames %in% sele.features, ]
group1 = unlist(strsplit(group1, ';'))
group2 = unlist(strsplit(group2, ';'))
group1
group2
if(group1[1] == 'all') {
cls = sort(unique(seurat.obj$active_clusters))
markers = NULL
for(cluster0 in cls){
cells1 = names(which(seurat.obj$active_clusters == cluster0))
if(group2[1] == 'rest') {
cells2 = names(which(seurat.obj$active_clusters != cluster0))
}else{
cells2 = names(which(seurat.obj$active_clusters %in% group2))
}
if(length(cells1) <= 10 || length(cells2) <= 10) next
mm = FindMarkers(mtx,
cells.1 = cells1, cells.2 = cells2,
test.use = test_use, logfc.threshold = 0.0,
max.cells.per.ident = 500,
only.pos = T, latent.vars = confVar)
mm$cluster = cluster0
mm$fdr = p.adjust(mm$p_val, method = 'fdr')
mm$peak = rownames(mm)
markers = rbind(markers, mm)
}
}else{
cells1 = names(which(seurat.obj$active_clusters %in% group1))
if(length(cells1) <= 10) stop('Not enough cells in group1')
if(group2[1] == 'rest'){
cells2 = names(which(!seurat.obj$active_clusters %in% group1))
markers = FindMarkers(mtx,
cells.1 = cells1, cells.2 = cells2, test.use = test_use,
logfc.threshold = 0.0, max.cells.per.ident = 500,
only.pos = F, latent.vars = confVar)
markers$cluster = ifelse(markers$avg_logFC > 0, group1, group2)
markers$fdr = p.adjust(markers$p_val, method = 'fdr')
}else{
cells2 = names(which(seurat.obj$active_clusters %in% group2))
if(length(cells2) <= 10) stop('Not enough cells in group2')
markers = FindMarkers(mtx,
cells.1 = cells1, cells.2 = cells2, test.use = test_use,
max.cells.per.ident = 500, logfc.threshold = 0.0,
only.pos = F, latent.vars = confVar)
markers$cluster = ifelse(markers$avg_logFC > 0, group1, group2)
markers$fdr = p.adjust(markers$p_val, method = 'fdr')
}
markers$peak = rownames(markers)
}
length(cells1)
group1
which(seurat.obj$active_clusters %in% group1)
cn = colnames(seurat.obj$active_clusters)
cn = colnames(seurat.obj)
cn
cells1 = cn(which(seurat.obj$active_clusters %in% group1))
cn = colnames(seurat.obj)
cells1 = cn(which(seurat.obj$active_clusters %in% group1))
cn = colnames(seurat.obj)
cells1 = cn[which(seurat.obj$active_clusters %in% group1)]
cells1
length(cells1)
if(group1[1] == 'all') {
cls = sort(unique(seurat.obj$active_clusters))
markers = NULL
for(cluster0 in cls){
cells1 = cn[which(seurat.obj$active_clusters == cluster0)]
if(group2[1] == 'rest') {
cells2 = cn[which(seurat.obj$active_clusters != cluster0)]
}else{
cells2 = cn[which(seurat.obj$active_clusters %in% group2)]
}
if(length(cells1) <= 10 || length(cells2) <= 10) next
mm = FindMarkers(mtx,
cells.1 = cells1, cells.2 = cells2,
test.use = test_use, logfc.threshold = 0.0,
max.cells.per.ident = 500,
only.pos = T, latent.vars = confVar)
mm$cluster = cluster0
mm$fdr = p.adjust(mm$p_val, method = 'fdr')
mm$peak = rownames(mm)
markers = rbind(markers, mm)
}
}else{
cells1 = cn[which(seurat.obj$active_clusters %in% group1)]
if(length(cells1) <= 10) stop('Not enough cells in group1')
if(group2[1] == 'rest'){
cells2 = cn[which(!seurat.obj$active_clusters %in% group1)]
markers = FindMarkers(mtx,
cells.1 = cells1, cells.2 = cells2, test.use = test_use,
logfc.threshold = 0.0, max.cells.per.ident = 500,
only.pos = F, latent.vars = confVar)
markers$cluster = ifelse(markers$avg_logFC > 0, group1, group2)
markers$fdr = p.adjust(markers$p_val, method = 'fdr')
}else{
cells2 = cn[which(seurat.obj$active_clusters %in% group2)]
if(length(cells2) <= 10) stop('Not enough cells in group2')
markers = FindMarkers(mtx,
cells.1 = cells1, cells.2 = cells2, test.use = test_use,
max.cells.per.ident = 500, logfc.threshold = 0.0,
only.pos = F, latent.vars = confVar)
markers$cluster = ifelse(markers$avg_logFC > 0, group1, group2)
markers$fdr = p.adjust(markers$p_val, method = 'fdr')
}
markers$peak = rownames(markers)
}
length(cells1)
if(test_use == 'wilcox' || test_use == 'DESeq2') confVar = NULL
confVar = 'nCount_ATAC'
if(test_use == 'wilcox' || test_use == 'DESeq2') confVar = NULL
if(group1[1] == 'all') {
cls = sort(unique(seurat.obj$active_clusters))
markers = NULL
for(cluster0 in cls){
cells1 = cn[which(seurat.obj$active_clusters == cluster0)]
if(group2[1] == 'rest') {
cells2 = cn[which(seurat.obj$active_clusters != cluster0)]
}else{
cells2 = cn[which(seurat.obj$active_clusters %in% group2)]
}
if(length(cells1) <= 10 || length(cells2) <= 10) next
mm = FindMarkers(mtx,
cells.1 = cells1, cells.2 = cells2,
test.use = test_use, logfc.threshold = 0.0,
max.cells.per.ident = 500,
only.pos = T, latent.vars = confVar)
mm$cluster = cluster0
mm$fdr = p.adjust(mm$p_val, method = 'fdr')
mm$peak = rownames(mm)
markers = rbind(markers, mm)
}
}else{
cells1 = cn[which(seurat.obj$active_clusters %in% group1)]
if(length(cells1) <= 10) stop('Not enough cells in group1')
if(group2[1] == 'rest'){
cells2 = cn[which(!seurat.obj$active_clusters %in% group1)]
markers = FindMarkers(mtx,
cells.1 = cells1, cells.2 = cells2, test.use = test_use,
logfc.threshold = 0.0, max.cells.per.ident = 500,
only.pos = F, latent.vars = confVar)
markers$cluster = ifelse(markers$avg_logFC > 0, group1, group2)
markers$fdr = p.adjust(markers$p_val, method = 'fdr')
}else{
cells2 = cn[which(seurat.obj$active_clusters %in% group2)]
if(length(cells2) <= 10) stop('Not enough cells in group2')
markers = FindMarkers(mtx,
cells.1 = cells1, cells.2 = cells2, test.use = test_use,
max.cells.per.ident = 500, logfc.threshold = 0.0,
only.pos = F, latent.vars = confVar)
markers$cluster = ifelse(markers$avg_logFC > 0, group1, group2)
markers$fdr = p.adjust(markers$p_val, method = 'fdr')
}
markers$peak = rownames(markers)
}
markers = data.table(markers)
markers[, 'peak0' := unlist(strsplit(peak, ','))[1], by = peak]
markers[, 'chr' := unlist(strsplit(peak0, '-'))[1], by = peak0]
markers[, 'start' := unlist(strsplit(peak0, '-'))[2], by = peak0]
markers[, 'end' := unlist(strsplit(peak0, '-'))[3], by = peak0]
setcolorder(markers, c('chr', 'start', 'end', 'p_val','avg_logFC','pct.1','pct.2',
'p_val_adj', 'fdr', 'cluster', 'peak', 'peak0'))
#markers = markers[abs(avg_logFC) > 0, ]
markers = markers[fdr <= 0.05, ]
markers
markers[abs(avg_logFC) > 0.1, ]
markers[abs(avg_logFC) > 0.01, ]
markers[abs(avg_logFC) > 0.0001, ]
ss = readRdS('/home/yuw1/yuw1/run_scATAC-pro/cusanovich2018_mouse/output/downstream_analysis/MACS2/FILTER/seurat_obj.rds')
ss = readRDA('/home/yuw1/yuw1/run_scATAC-pro/cusanovich2018_mouse/output/downstream_analysis/MACS2/FILTER/seurat_obj.rds')
ss = readRDS('/home/yuw1/yuw1/run_scATAC-pro/cusanovich2018_mouse/output/downstream_analysis/MACS2/FILTER/seurat_obj.rds')
table(ss$sample)
