options(stringsAsFactors=FALSE) args = commandArgs(trailingOnly=TRUE) def.args = c("/avicenna/guest_mehran/ATAC_20210615", "/avicenna/mkarimzadeh/cisTransBreastProject/archrRunV3") if(length(args) < 2){ cat("Usage: Rscript process_atac.R \n") cat("Using default aruments\n") args = def.args } print(args) # this is a 2-column file mapping cell types to their colours # CellType,Color # LUM_HR-pos,#44aa99 # LUM_HR-neg,#88ccee # Basal,#117733 # Fibroblast,#332288 # Adipocyte,#ddcc77 # Blood_EC,#882255 # Lymph_EC,#aa4499 # Vasc.Acc.,#cc6677 # Myeloid,#999933 # Lymphoid,#dddddd cols = read.csv("/avicenna/mkarimzadeh/cisTransBreastProject/rnaObjs/Colors_CellType.csv", header=T) require(ArchR) require(reshape2) require(scales) require(ggplot2) require(cowplot) indir = args[1] outdir = args[2] setwd(outdir) set.seed(42) addArchRThreads(threads = 16) addArchRGenome("hg38") samplenames = dir(indir)[grep("ATAC", dir(indir))] samplenames = samplenames[!grepl("(CF-249-347|CF-19301)", samplenames)] inpaths = lapply(samplenames, function(samplename){ fullpath = sprintf("%s/%s/outs/fragments.tsv.gz", indir, samplename) if(file.exists(fullpath)){ names(fullpath) = samplename return(fullpath) }else{ cat(sprintf("Didn't find %s\n", fullpath)) } }) ArrowFiles = createArrowFiles( inputFiles=unlist(inpaths), sampleNames=samplenames, minTSS=2, minFrags=500, maxFrags=1e8, addTileMat=TRUE, addGeneScoreMat = TRUE) projCisTrans <- ArchRProject( ArrowFiles = ArrowFiles, outputDirectory = outdir, copyArrows = TRUE #This is recommened so that if you modify the Arrow files you have an original copy for later usage. ) saveArchRProject(ArchRProj = projCisTrans, outputDirectory = outdir, load = FALSE) doubScores <- addDoubletScores( input = projCisTrans, k = 10, #Refers to how many cells near a "pseudo-doublet" to count. knnMethod = "UMAP", #Refers to the embedding to use for nearest neighbor search with doublet projection. ) ## Investigate the cells with different cutoffs peakobj = doubScores require(reshape2) newdf = melt(as.data.frame(peakobj@cellColData[,c("TSSEnrichment", "NucleosomeRatio", "BlacklistRatio", "DoubletEnrichment")])) P = ggplot(newdf, aes(x=value, fill=variable), colour="black") + geom_histogram() + facet_wrap(~variable, scales="free", nrow=1) + scale_fill_brewer(name="", palette="Set1") + xlab("") + theme_bw(base_size=14) + theme(legend.position="bottom") create_plot(P, sprintf("%s/Plots/01_basicStats", outdir), width=15, height=5) filtered.obj <- filterDoublets(doubScores) newdf = melt(as.data.frame(filtered.obj@cellColData[,c("PromoterRatio", "NucleosomeRatio", "BlacklistRatio", "DoubletEnrichment")])) P = ggplot(newdf, aes(x=value, fill=variable), colour="black") + geom_histogram() + facet_wrap(~variable, scales="free", nrow=1) + scale_fill_brewer(name="", palette="Set1") + xlab("") + theme_bw(base_size=14) + theme(legend.position="bottom") create_plot(P, sprintf("%s/Plots/01_basicStats", outdir), width=15, height=5) idx_cells = which(filtered.obj@cellColData$PromoterRatio >= 0.1 & filtered.obj@cellColData$PromoterRatio <= 0.8 & filtered.obj@cellColData$nFrags >= 1e3 & filtered.obj@cellColData$nFrags <= 5e4 & filtered.obj@cellColData$NucleosomeRatio < 4) # filtered.obj@cellColData$DoubletEnrichment < 3.5) filtered.obj = filtered.obj[idx_cells, ] saveArchRProject(ArchRProj = filtered.obj, outputDirectory = outdir, load = FALSE) # LSI # What if we use peak matrix instead of TileMatrix filtered.obj <- addIterativeLSI( ArchRProj = filtered.obj, useMatrix = "TileMatrix", name = "IterativeLSI", iterations = 5, clusterParams = list( #See Seurat::FindClusters resolution = c(0.2), sampleCells = 10000, n.start = 10 ), varFeatures = 25000, dimsToUse = 1:30, force=TRUE ) ## Batch correction with Harmony # this file has the following information # Sample BatchID Date Prep. Method Index Conc.(nM) load. conc. Cells CellRanger # TM-3937 BATCH_1 43971 sort 150k, resus. 20ul SI-NA-A1 23.7 800 2359 # CF-2797 BATCH_1 43971 sort 150k, resus. 20ul SI-NA-A2 30.5 600 378 # TM-8249 BATCH_2 43998 sort 150k, resus. 10ul SI-NA-A3 68.5 880 2089 # CF-318-813 BATCH_2 43998 sort 150k, resus. 10ul SI-NA-A4 52.9 700 1424 # TM-7567 BATCH_3 44001 sort 196k, resus. 5ul SI-NA-A5 83.8 3500 12244 # CF-0404 BATCH_3 44001 sort 268k, resus. 5ul SI-NA-A6 160.3 2500 6815 # CF-19301 BATCH_4 44004 sort 340k, resus. 5ul SI-NA-A7 119.2 5000 10457 # CF-3920 BATCH_5 44007 sort 107k, resus. 5ul SI-NA-A8 27.7 1950 1969 # TM-9469 BATCH_5 44007 sort 341k, resus. 7.5ul SI-NA-A9 49.0 1260 3058 # TM-6544 BATCH_6 44008 sort 100k, resus. 5ul SI-NA-A10 18.0 1250 2183 # CF-428-112 BATCH_6 44008 sort 328k, resus. 7.5ul SI-NA-A11 67.8 2140 2445 # CF-7780 BATCH_7 44011 sort 317k, resus. 7.5ul SI-NA-B1 38.3 1400 3727 # TM-6477 BATCH_7 44011 sort 300k, resus. 7.5ul SI-NA-B2 66.3 2140 5799 # TM-1956 BATCH_8 44012 sort 350k, resus. 7.5ul SI-NA-B3 63.4 3260 8242 # CF-4014 BATCH_8 44012 sort 443k, resus. 7.5ul SI-NA-B4 98.8 2870 6835 # TM-2768 BATCH_8 44012 sort 445k, resus. 7.5ul SI-NA-B5 91.5 1750 4361 # CF-249-347 BATCH_9 44013 sort 184k, resus. 5ul SI-NA-B8 150.5 980 2888 # CF-2797-2 BATCH_9 44013 sort 439k, resus. 7.5ul SI-NA-B9 76.4 1150 1180 # CF-1380 BATCH_10 44342 sort 250k, resus. 10ul SI-NA-B10 102.1 4560 7439 # CF-2099 BATCH_10 44342 sort 90k, resus. 10ul SI-NA-B11 122.4 2280 9501 batchpath = "/avicenna/mkarimzadeh/cisTransBreastProject/batchinfo_v2.tsv" batchdf = read.csv(batchpath, header=T, sep="\t", row.names=1) filtered.obj@cellColData$SampleName = gsub("_ATAC", "", filtered.obj@cellColData$Sample) filtered.obj@cellColData$BatchID = batchdf[as.character(filtered.obj@cellColData$SampleName), 1] filtered.obj <- addHarmony( ArchRProj = filtered.obj, reducedDims = "IterativeLSI", name = "Harmony", groupBy = "BatchID", force=TRUE ) saveArchRProject(ArchRProj = filtered.obj, outputDirectory = outdir, load = FALSE) filtered.obj = loadArchRProject(outdir) minDist = 0.2 nNeighbors = 60 filtered.obj <- addUMAP( ArchRProj = filtered.obj, reducedDims = "Harmony", name = "AfterHarmony", nNeighbors = nNeighbors, minDist = minDist, metric = "cosine", force=TRUE ) metadf = as.data.frame(filtered.obj@cellColData) saveRDS(metadf, file=sprintf("%s/metadata_archr.RDS", outdir), compress=TRUE) ## Compare a few harmony variations peakobj = filtered.obj # scaleDims = True peakobj = addHarmony( ArchRProj = peakobj, reducedDims = "IterativeLSI", name = "HarmonyScaled", scaleDims = TRUE, groupBy = "BatchID", force=TRUE ) minDist = 0.2 nNeighbors = 60 peakobj <- addUMAP( ArchRProj = peakobj, reducedDims = "HarmonyScaled", name = "AfterHarmonyScaled", nNeighbors = nNeighbors, minDist = minDist, metric = "cosine", force=TRUE ) ## Plot umap cols[,1] = gsub("Vasc_Accessory", "Vasc.Acc.", cols[,1]) cols[,1] = gsub("Lymphatic_EC", "Lymph_EC", cols[,1]) metadf = as.data.frame(peakobj@cellColData) umapdf = as.data.frame(peakobj@embeddings$AfterHarmonyScaled@listData$df) colnames(umapdf) = c("UMAP1", "UMAP2") umapdf$CellType = metadf[rownames(umapdf), "predictedCellType"] textdf = do.call("rbind", lapply(unique(umapdf$CellType), function(x){ tempdf = umapdf[umapdf$CellType==x, ] outdf = data.frame(UMAP1=mean(tempdf[,1]), UMAP2=mean(tempdf[,2]), CellType=x) return(outdf)})) umapdf$CellType = factor(umapdf$CellType, levels=cols[,1]) umapdf$SampleType = ifelse(grepl("CF", metadf[rownames(umapdf), "SampleName"]), "Cis female", "Trans male") textdf$SampleType = "Cis female" outdir_temp = paste(outdir, "Plots/temp", sep="/") P = ggplot(umapdf, aes(x=UMAP1, y=UMAP2, colour=CellType)) + geom_point(alpha=0.8) + scale_colour_manual(name="", values=cols[,2]) + geom_label_repel(data=textdf, aes(label=CellType)) + theme_bw(base_size=14) create_plot(P, sprintf("%s/%s_umap_atac_scaledHarmony", outdir_temp, "CellType"), 9, 6) P = ggplot(umapdf, aes(x=UMAP1, y=UMAP2, colour=CellType)) + geom_point(alpha=0.8) + facet_wrap(~SampleType, nrow=1) + scale_colour_manual(name="", values=cols[,2]) + geom_label_repel(data=textdf, aes(label=CellType)) + theme_bw(base_size=14) create_plot(P, sprintf("%s/%s_umap_atac_scaledHarmony", outdir_temp, "CellTypeSampleType"), 12, 6) ## Add a few measures: Promoter ratio /doublet/etc. umapdf = as.data.frame(filtered.obj@embeddings$AfterHarmony@listData$df) colnames(umapdf) = c("UMAP1", "UMAP2") list.data = lapply(c("PromoterRatio", "NucleosomeRatio", "nFrags", "DoubletEnrichment"), function(variable){ addf = umapdf addf$Value = metadf[rownames(addf), variable] / max(metadf[,variable]) addf$Variable = variable return(addf)}) plotdf = do.call("rbind", list.data) P = ggplot(plotdf, aes(x=UMAP1, y=UMAP2, colour=Value)) + geom_point(alpha=0.5, size=2) + facet_wrap(~Variable, scales="free", nrow=2) + scale_colour_distiller(palette="RdYlBu") + theme_bw(base_size=14) + theme(legend.position="bottom") create_plot(P, sprintf("%s/umap_atac_quality", outdir_temp), 12, 12) umapdf[,"nFrags"] = metadf[rownames(umapdf), "nFrags"] #umapdf$GroupsNfrag = as.character(cut(umapdf$nFrags, 10)) #umapdf$GroupsNfrag[umapdf$nFrags < 75000] = "< 75000" umapdf$GroupsNfrag = ifelse(umapdf$nFrags < 40000, "nFrags < 40000", "nFrags > 40000") P = ggplot(umapdf, aes(x=UMAP1, y=UMAP2, colour=nFrags)) + geom_point(alpha=0.5, size=2) + facet_wrap(~GroupsNfrag, scales="free", nrow=1) + scale_colour_distiller(palette="RdYlBu") + theme_bw(base_size=14) + theme(legend.position="bottom") create_plot(P, sprintf("%s/umap_atac_nFrags", outdir_temp), 12, 6) filtered.obj <- addClusters( input = filtered.obj, reducedDims = "Harmony", method = "Seurat", name = "Clusters", resolution = 0.8 ) umapdf = as.data.frame(filtered.obj@embeddings$AfterHarmony@listData$df) colnames(umapdf) = c("UMAP1", "UMAP2") metadf = as.data.frame(filtered.obj@cellColData) umapdf$Clusters = metadf[rownames(umapdf), "Clusters"] textdf = do.call("rbind", lapply(unique(umapdf$Clusters), function(x){ tempdf = umapdf[umapdf$Clusters==x, ] outdf = data.frame(UMAP1=mean(tempdf[,1]), UMAP2=mean(tempdf[,2]), Clusters=x) return(outdf)})) require(ggrepel) P = ggplot(umapdf, aes(x=UMAP1, y=UMAP2, colour=Clusters)) + geom_point() + geom_label_repel(data=textdf, aes(label=Clusters)) + theme_bw(base_size=14) create_plot(P, sprintf("%s/%s_umap_atac", outdir_temp, "clusters"), 12, 6) umapdf$Group = ifelse(umapdf$Clusters %in% paste("C", c(21:25), sep=""), "Exclude", "Keep") # umapdf$Group = ifelse(umapdf$Clusters %in% "C24", "Exclude", "Keep") P = ggplot(umapdf, aes(x=UMAP1, y=UMAP2, colour=Clusters)) + geom_point() + facet_wrap(~Group, nrow=1) + geom_label_repel(data=textdf, aes(label=Clusters)) + theme_bw(base_size=14) # create_plot(P, sprintf("%s/%s_umap_atac", outdir_temp, "clusters"), 15, 6) ## For each cluster, re-perform LSI/UMAP and save it saveArchRProject(ArchRProj = filtered.obj, outputDirectory = outdir, load = FALSE) setwd(outdir) require(Seurat) # tilemat = getMatrixFromProject(filtered.obj2, "TileMatrix", binarize = TRUE) # saveRDS(tilemat, sprintf("%s/scAtacSeqTileMat.RDS", outdir), compress=TRUE) # this file is available on GEO rnaobj = readRDS("/avicenna/mkarimzadeh/cisTransBreastProject/rnaObjs/Sobj_Final_Scaled.rds") rnaobj@meta.data$ExcludeCfs = ifelse(rnaobj@meta.data$orig.ident %in% c("CF-249-347", "CF-19301"), "Exclude", "Keep") rnaobj = subset(x=rnaobj, subset=ExcludeCfs=="Keep") seRNA = CreateSeuratObject(counts=rnaobj@assays$RNA@counts, min.cells = 3, min.features = 200, meta.data=rnaobj@meta.data) peakobj <- addGeneIntegrationMatrix( ArchRProj = filtered.obj, useMatrix = "GeneScoreMatrix", matrixName = "GeneIntegrationMatrix", reducedDims = "Harmony", seRNA = seRNA, addToArrow = TRUE, groupRNA = "CellType", nameCell = "predictedCell", nameGroup = "predictedCellType", force = TRUE ) peakobj <- addImputeWeights(peakobj) saveArchRProject(ArchRProj = peakobj, outputDirectory = outdir, load = FALSE) ## Extract joint umap geneintegmat = getMatrixFromProject(peakobj, "GeneIntegrationMatrix") ## Plot umap cols[,1] = gsub("Vasc_Accessory", "Vasc.Acc.", cols[,1]) cols[,1] = gsub("Lymphatic_EC", "Lymph_EC", cols[,1]) umapdf = as.data.frame(peakobj@embeddings$AfterHarmony@listData$df) umapdf = na.omit(umapdf) colnames(umapdf) = c("UMAP1", "UMAP2") metadf = as.data.frame(peakobj@cellColData) umapdf$CellType = metadf[rownames(umapdf), "predictedCellType"] textdf = do.call("rbind", lapply(unique(umapdf$CellType), function(x){ tempdf = umapdf[umapdf$CellType==x, ] outdf = data.frame(UMAP1=mean(tempdf[,1]), UMAP2=mean(tempdf[,2]), CellType=x) return(outdf)})) umapdf$CellType = factor(umapdf$CellType, levels=cols[,1]) umapdf$SampleType = ifelse(grepl("CF", metadf[rownames(umapdf), "SampleName"]), "Cis female", "Trans male") textdf$SampleType = "Cis female" P = ggplot(umapdf, aes(x=UMAP1, y=UMAP2, colour=CellType)) + geom_point(alpha=0.8) + scale_colour_manual(name="", values=cols[,2]) + geom_label_repel(data=textdf, aes(label=CellType)) + theme_bw(base_size=14) create_plot(P, sprintf("%s/%s_umap_atac2", outdir_temp, "CellType"), 9, 6) P = ggplot(umapdf, aes(x=UMAP1, y=UMAP2, colour=CellType)) + geom_point(alpha=0.8) + facet_wrap(~SampleType, nrow=1) + scale_colour_manual(name="", values=cols[,2]) + geom_label_repel(data=textdf, aes(label=CellType)) + theme_bw(base_size=14) create_plot(P, sprintf("%s/%s_umap_atac2", outdir_temp, "CellTypeSampleType"), 12, 6) P1 = ggplot(umapdf, aes(x=UMAP1, y=UMAP2, colour=SampleType)) + geom_point(alpha=0.8) + scale_colour_manual(name="", values=c("#86007D", "#FFA52C")) + geom_label_repel(data=textdf, aes(label=CellType), colour="black") + theme_bw(base_size=14) create_plot(P1, sprintf("%s/%s_umap_atac2", outdir_temp, "SampleType"), 9, 6) ## Call peaks peakobj@cellColData$SampleType = ifelse(grepl("CF", peakobj@cellColData$SampleName), "Cis female", "Trans male") peakobj@cellColData$CellTypeAndSampleType = paste(peakobj@cellColData$predictedCellType, peakobj@cellColData$SampleType) saveArchRProject(ArchRProj = peakobj, outputDirectory = outdir, load = FALSE) peakobj <- addGroupCoverages(ArchRProj = peakobj, groupBy = "CellTypeAndSampleType", force=TRUE) pathToMacs2 = "/avicenna/mkarimzadeh/anaconda3/envs/bio/bin/macs2" peakobj <- addReproduciblePeakSet( ArchRProj = peakobj, peakMethod="Macs2", additionalParams="--nomodel --nolambda --bdg --SPMR --call-summits", groupBy = "CellTypeAndSampleType", pathToMacs2 = pathToMacs2 ) peakset = getPeakSet(peakobj) peakobj = addPeakMatrix(peakobj) saveArchRProject(ArchRProj = peakobj, outputDirectory = outdir, load = FALSE) peakmat = getMatrixFromProject(peakobj, "PeakMatrix") saveRDS(peakmat, file=sprintf("%s/peakMatrix.RDS", outdir), compress=TRUE) ## Perform umap and etc. with peak matrix peakobj <- addIterativeLSI( ArchRProj = peakobj, useMatrix = "PeakMatrix", name = "IterativeLSIPeak", iterations = 5, clusterParams = list( #See Seurat::FindClusters resolution = c(0.2), sampleCells = 10000, n.start = 10 ), varFeatures = 25000, dimsToUse = 1:30, force=TRUE ) peakobj <- addHarmony( ArchRProj = peakobj, reducedDims = "IterativeLSIPeak", name = "HarmonyPeak", groupBy = "BatchID", force=TRUE ) minDist = 0.1 nNeighbors = 60 peakobj <- addUMAP( ArchRProj = peakobj, reducedDims = "HarmonyPeak", name = "AfterHarmonyPeak", nNeighbors = nNeighbors, minDist = minDist, metric = "cosine", force=TRUE ) peakobj <- addClusters( input = peakobj, reducedDims = "HarmonyPeak", method = "Seurat", name = "ClustersPeaks", resolution = 0.8, force=TRUE ) metadf = as.data.frame(peakobj@cellColData) umapdf = as.data.frame(peakobj@embeddings$AfterHarmonyPeak@listData$df) colnames(umapdf) = c("UMAP1", "UMAP2") umapdf$Group = cluster umapdf$nFrags = metadf[rownames(umapdf), "nFrags"] umapdf$Clusters = metadf[rownames(umapdf), "ClustersPeaks"] umapdf$CellType = metadf[rownames(umapdf), "predictedCellType"] umapdf$PromoterRatio = metadf[rownames(umapdf), "PromoterRatio"] umapdf$Sample = metadf[rownames(umapdf), "Sample"] textdf = do.call("rbind", lapply(unique(umapdf$Clusters), function(x){ tempdf = umapdf[umapdf$Clusters==x, ] outdf = tempdf[1, ] outdf$UMAP1 = mean(tempdf[,1]) outdf$UMAP2 = mean(tempdf[,2]) return(outdf)})) P1 = ggplot(umapdf, aes(x=UMAP1, y=UMAP2, colour=Clusters)) + geom_point(alpha=0.8) + theme_bw(base_size=14) + geom_label_repel(data=textdf, aes(label=Clusters)) + theme(legend.position="bottom") P2 = ggplot(umapdf, aes(x=UMAP1, y=UMAP2, colour=nFrags)) + geom_point(alpha=0.8) + theme_bw(base_size=14) + scale_colour_distiller(name="nFrags", palette="BrBG") + theme(legend.position="bottom") P3 = ggplot(umapdf, aes(x=UMAP1, y=UMAP2, colour=CellType)) + geom_point(alpha=0.8) + theme_bw(base_size=14) + #scale_colour_distiller(name="nFrags", palette="BrBG") + theme(legend.position="bottom") P4 = ggplot(umapdf, aes(x=UMAP1, y=UMAP2, colour=Sample)) + geom_point(alpha=0.8) + theme_bw(base_size=14) + # scale_colour_distiller(name="PromoterRatio", palette="BrBG") + theme(legend.position="bottom") P = plot_grid(P1, P2, P3, P4, nrow=2) create_plot(P, sprintf("%s/%s_umap", outdir_temp, "peakMatrix"), width=12, height=12) ## Add co-accessibility and gene-links markersPeaks <- getMarkerFeatures( ArchRProj = peakobj, useMatrix = "PeakMatrix", groupBy = "CellTypeAndSampleType", bias = c("TSSEnrichment", "log10(nFrags)"), testMethod = "wilcoxon" ) cutoff_condition = "FDR <= 0.05 & Log2FC >= 0.25" markerList <- getMarkers(markersPeaks, cutOff = cutoff_condition) dir.create(sprintf("%s/PeakCalls/peakTsvFiles", outdir)) for(clustername in names(markerList)){ peakdf = as.data.frame(markerList[[clustername]]) if(nrow(peakdf) > 0){ outpath = sprintf("%s/PeakCalls/peakTsvFiles/%s.tsv", outdir, gsub(" ", "_", clustername)) cat(sprintf("Saving to %s\n", outpath)) write.table(peakdf, file=outpath, sep="\t", quote=F, row.names=F) } } heatmapPeaks <- markerHeatmap( seMarker = markersPeaks, cutOff = cutoff_condition, transpose = TRUE ) plotPDF(heatmapPeaks, name = "Peak-Marker-Heatmap", width = 12, height = 8, ArchRProj = peakobj, addDOC = FALSE) system(sprintf("topng %s/Plots/Peak-Marker-Heatmap.pdf", outdir)) pma <- markerPlot(seMarker = markersPeaks, name = "LUM_HR-pos Cis female", cutOff = cutoff_condition, plotAs = "MA") pv <- markerPlot(seMarker = markersPeaks, name = "LUM_HR-pos Cis female", cutOff = cutoff_condition, plotAs = "Volcano") plotPDF(pma, pv, name = "LUM_HR-pos-CisFemale-Markers-MA-Volcano", width = 5, height = 5, ArchRProj = peakobj, addDOC = FALSE) system(sprintf("topng %s/Plots/LUM_HR-pos-CisFemale-Markers-MA-Volcano.pdf", outdir)) peakobj <- addMotifAnnotations(ArchRProj = peakobj, motifSet = "JASPAR2020", name = "Motif") peakobj <- addMotifAnnotations(ArchRProj = peakobj, motifSet = "cisbp", name = "Motif.cisBP") saveArchRProject(ArchRProj = peakobj, outputDirectory = outdir, load = FALSE) ## add chromVAR matrix peakobj <- addDeviationsMatrix( ArchRProj = peakobj, peakAnnotation = "Motif.cisBP", force = TRUE ) saveArchRProject(ArchRProj = peakobj, outputDirectory = outdir, load = FALSE)