# Set seeds and initial parameters for ArchR
set.seed(1)

# Number of cores to use (adapt to your machine, but has to be > 1 otherwise plotTSSenrichment does not work properly)
addArchRThreads(threads = 8)

## Setting default number of Parallel threads to 8.

# Reference genome (adapt to your data, use the same reference that was used for the alignment)
addArchRGenome("mm10")

## Setting default genome to Mm10.

# Load ArchRProject
proj = loadArchRProject(
  path = "ArchRProject", 
  force = TRUE, 
  showLogo = FALSE
  )

# Get an overview of the project
proj

## 
##            ___      .______        ______  __    __  .______      
##           /   \     |   _  \      /      ||  |  |  | |   _  \     
##          /  ^  \    |  |_)  |    |  ,----'|  |__|  | |  |_)  |    
##         /  /_\  \   |      /     |  |     |   __   | |      /     
##        /  _____  \  |  |\  \\___ |  `----.|  |  |  | |  |\  \\___.
##       /__/     \__\ | _| `._____| \______||__|  |__| | _| `._____|
##

## class: ArchRProject 
## outputDirectory: /omics/groups/OE0436/internal/Regensburg/Mainz/03_archr/scripts/methods/paper/ArchRProject 
## samples(5): scATAC_4 scATAC_1 scATAC_9 scATAC_8 scATAC_5
## sampleColData names(1): ArrowFiles
## cellColData names(34): Sample TSSEnrichment ... Treg_trajectory_skin
##   Treg_trajectory_colon
## numberOfCells(1): 14435
## medianTSS(1): 20.872
## medianFrags(1): 29176

1 Final filtered Dataset

1.1 Visualisation in UMAP Embedding

# Color by clusters
p_clusters = plotEmbedding(
  ArchRProj = proj, 
  colorBy = "cellColData", 
  name = "Clusters", 
  embedding = "UMAP", 
  size = 0.5
  )

# Color by samples
p_samples = plotEmbedding(
  ArchRProj = proj, 
  colorBy = "cellColData", 
  name = "Sample", 
  embedding = "UMAP", 
  size = 0.5
  )

plot_grid(p_clusters, p_samples, align = "h", ncol = 2)

# Color by doublet enrichment score
p_doubscore = plotEmbedding(
  ArchRProj = proj, 
  colorBy = "cellColData", 
  name = "DoubletScore", 
  embedding = "UMAP", 
  plotAs = "points", 
  size = 0.5
)

p_doubenr=plotEmbedding(
  ArchRProj = proj, 
  colorBy = "cellColData", 
  name = "DoubletEnrichment", 
  embedding = "UMAP", 
  plotAs = "points", 
  size = 0.5
)

# Color by number of fragments per cell
p_nfrags = plotEmbedding(
  ArchRProj = proj, 
  colorBy = "cellColData", 
  name = "nFrags", 
  embedding = "UMAP", 
  plotAs = "points", 
  size = 0.5
)

# Plot
plot_grid(p_doubscore, p_doubenr, p_nfrags, align = "h", ncol = 3, scale = 1.1)

1.2 Overlay Gene Scores on UMAP

# Overlay gene scores on UMAP embedding, use MAGIC smoothing
# Get marker features
markersGS = getMarkerFeatures(
  ArchRProj = proj, 
  useMatrix = "GeneScoreMatrix", 
  groupBy = "Clusters",
  bias = c("TSSEnrichment", "log10(nFrags)"),
  testMethod = "wilcoxon"
)

# Define which genes to plot
markerGenes = c("Cd4", "Il2ra", "Foxp3", "Batf", "Ccr8", "Ikzf2", "Rorc", "Tbx21", "Ifng", "Gata3", "Il2", "Sell")

# Add impute weights
proj = addImputeWeights(proj)

# Plot
magic_genes = plotEmbedding(
  ArchRProj = proj, 
  colorBy = "GeneScoreMatrix", 
  name = markerGenes, 
  embedding = "UMAP",
  plotAs = "points",
  imputeWeights = getImputeWeights(proj),
  size = 0.5
)

# Plot all genes in cowplot
magic_genes_cow = lapply(magic_genes, function(x){
  x + guides(color = FALSE, fill = FALSE) + 
    theme_ArchR(baseSize = 6.5) +
    theme(plot.margin = unit(c(0, 0, 0, 0), "cm")) +
    theme(
      axis.text.x=element_blank(), 
      axis.ticks.x=element_blank(), 
      axis.text.y=element_blank(), 
      axis.ticks.y=element_blank()
    )
})
do.call(cowplot::plot_grid, c(list(ncol = 4), magic_genes_cow))

# Save as PDF
plotPDF(magic_genes_cow,  
  name = "UMAP_genescores.pdf", 
  ArchRProj = proj, 
  addDOC = FALSE, 
  width = 5, 
  height = 5
)

p_Cd4 = plotGroups(
  ArchRProj = proj, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Cd4",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Cd4

p_Il2ra = plotGroups(
  ArchRProj = proj, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Il2ra",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Il2ra

p_Foxp3 = plotGroups(
  ArchRProj = proj, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Foxp3",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Foxp3

p_Batf = plotGroups(
  ArchRProj = proj, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Batf",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Batf

p_Ccr8 = plotGroups(
  ArchRProj = proj, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Ccr8",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Ccr8

p_Ikzf2 = plotGroups(
  ArchRProj = proj, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Ikzf2",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Ikzf2

p_Rorc = plotGroups(
  ArchRProj = proj, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Rorc",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Rorc

p_Tbx21 = plotGroups(
  ArchRProj = proj, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Tbx21",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Tbx21

p_Ifng = plotGroups(
  ArchRProj = proj, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Ifng",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Ifng

p_Gata3 = plotGroups(
  ArchRProj = proj, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Gata3",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Gata3

# Look at tissue Tregs and their precursor stages in the spleen
markerGenes_treg = c("Foxp3", "Batf", "Nfil3", "Klrg1", "Gata3", "Il1rl1", "Id3", "Id2", "Ikzf2")

# Plot
magic_genes_treg = plotEmbedding(
  ArchRProj = proj, 
  colorBy = "GeneScoreMatrix", 
  name = markerGenes_treg, 
  embedding = "UMAP",
  plotAs = "points", 
  imputeWeights = getImputeWeights(proj),
  size = 0.5
)

# Plot all genes in cowplot
magic_genes_cow_treg = lapply(magic_genes_treg, function(x){
  x + guides(color = FALSE, fill = FALSE) + 
    theme_ArchR(baseSize = 6.5) +
    theme(plot.margin = unit(c(0, 0, 0, 0), "cm")) +
    theme(
      axis.text.x=element_blank(), 
      axis.ticks.x=element_blank(), 
      axis.text.y=element_blank(), 
      axis.ticks.y=element_blank()
    )
})
do.call(cowplot::plot_grid, c(list(ncol = 3), magic_genes_cow_treg))

# To further characterize Treg cells in our dataset, we look at various Treg
# markers
p_Foxp3 = plotGroups(
  ArchRProj = proj, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Foxp3",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Foxp3

p_Batf = plotGroups(
  ArchRProj = proj, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Batf",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Batf

p_Nfil3 = plotGroups(
  ArchRProj = proj, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Nfil3",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Nfil3

p_Klrg1 = plotGroups(
  ArchRProj = proj, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Klrg1",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Klrg1

p_Gata3 = plotGroups(
  ArchRProj = proj, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Gata3",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Gata3

p_Il1rl1 = plotGroups(
  ArchRProj = proj, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Il1rl1",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Il1rl1

p_Id3 = plotGroups(
  ArchRProj = proj, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Id3",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Id3

p_Id2 = plotGroups(
  ArchRProj = proj, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Id2",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Id2

p_Ikzf2 = plotGroups(
  ArchRProj = proj, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Ikzf2",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Ikzf2

1.3 Define Clusters

# Define clusters for comparisons/ project subsetting (to be performed later)
tisTreg_cluster = c("C16", "C8", "C9", "C6") #C16 spleen, C8 fat, C9 skin, C6 colon
tTreg_cluster = c("C19", "C18") #C19 early precursor, C18 late precursor  
pTreg_cluster = "C7"

# Also scATAC_8 is from spleen, however this sample includes only CD25+ cells 
# whereas all other samples are CD4+
spleen = "scATAC_1" 
skin = "scATAC_9"
colon = "scATAC_4"
fat = "scATAC_5"
tissue = c(skin, colon, fat)

# Add cluster names to AchRProject
clusters = as.data.frame(proj@cellColData$Clusters)
colnames(clusters) = "V1"

clusters = clusters %>% mutate(across("V1", str_replace, "^C16$", "C16 tisTreg spleen")) 
clusters = clusters %>% mutate(across("V1", str_replace, "^C8$", "C8 tisTreg fat"))
clusters = clusters %>% mutate(across("V1", str_replace, "^C9$", "C9 tisTreg skin"))
clusters = clusters %>% mutate(across("V1", str_replace, "^C6$", "C6 tisTreg colon"))
clusters = clusters %>% mutate(across("V1", str_replace, "^C7$", "C7 pTreg colon")) 
clusters = clusters %>% mutate(across("V1", str_replace, "^C19$", "C19 early precursor spleen"))
clusters = clusters %>% mutate(across("V1", str_replace, "^C18$", "C18 late precursor spleen"))
clusters = clusters %>% mutate(across("V1", str_replace, "^C1$", "C1 naive CD4+ colon"))
clusters = clusters %>% mutate(across("V1", str_replace, "^C3$", "C3 naive CD4+ spleen/fat"))
clusters = clusters %>% mutate(across("V1", str_replace, "^C2$", "C2 naive CD4+ spleen"))
clusters = clusters %>% mutate(across("V1", str_replace, "^C4$", "C4 naive CD4+ spleen"))
clusters = clusters %>% mutate(across("V1", str_replace, "^C5$", "C5 naive CD4+ spleen"))
clusters = clusters %>% mutate(across("V1", str_replace, "^C14$", "C14 Th1 spleen/fat"))
clusters = clusters %>% mutate(across("V1", str_replace, "^C15$", "C15 Th1 colon"))
clusters = clusters %>% mutate(across("V1", str_replace, "^C11$", "C11 Th1 skin"))
clusters = clusters %>% mutate(across("V1", str_replace, "^C12$", "C12 Th17 skin"))
clusters = clusters %>% mutate(across("V1", str_replace, "^C10$", "C10 Th2 colon"))

clusters_vector = as.vector(clusters$V1)

proj@cellColData$Clusters2 = clusters_vector

2 Pseudo-bulk Replicates

# We have a look at the number of cells in the individual clusters
cM_doubfilter = confusionMatrix(paste0(proj$Clusters2), paste0(proj$Sample))
cM_doubfilter

## 21 x 5 sparse Matrix of class "dgCMatrix"
##                            scATAC_1 scATAC_4 scATAC_5 scATAC_8 scATAC_9
## C4 naive CD4+ spleen            482        .        .        .        .
## C5 naive CD4+ spleen            282        3        8        4        1
## C2 naive CD4+ spleen           2707        .       53        .        .
## C18 late precursor spleen         8        .        .      291        .
## C16 tisTreg spleen               32        4       19      475        8
## C14 Th1 spleen/fat              174       40      548        2       13
## C19 early precursor spleen      170        9       45     1191        3
## C8 tisTreg fat                   10        4      118       30        6
## C11 Th1 skin                      1        1        6        .      892
## C3 naive CD4+ spleen/fat        244       43      228        .       10
## C12 Th17 skin                     1        7       15        .      472
## C13                               .      461        6        .        1
## C6 tisTreg colon                  .      777       22        .        8
## C1 naive CD4+ colon               .     1276       20        .        .
## C7 pTreg colon                    .      513        6        .        .
## C15 Th1 colon                     .     1377        3        .        2
## C20                               .       23        7        .       30
## C21                               .      188       13        .       22
## C10 Th2 colon                     .      104       30        .        2
## C17                               .       30        5        .        1
## C9 tisTreg skin                   .        4        1        3      840

# The key parameter here is groupBy, which defines the groups for which 
# pseudo-bulk replicates should be made
proj = addGroupCoverages(
  ArchRProj = proj,
  groupBy = "Clusters2",
  minCells = 40, 
  maxCells = 500, 
  minReplicates = 2,
  maxReplicates = 5, 
  sampleRatio = 0.8,
  force = TRUE
  )

3 Peak-Calling

# You can use the following function to search the path to Macs2
# However, sometimes this might not work and you have to manually add the path 
# like it is shown in the second line. 
pathToMacs2 = findMacs2()
# If you manually add the path, you have to change this line!
pathToMacs2 = "../../../../Macs2/bin/macs2"

proj = addReproduciblePeakSet(
  ArchRProj = proj, 
  groupBy = "Clusters2", 
  pathToMacs2 = pathToMacs2
)

# Add peak matrix to ArchRProject
proj = addPeakMatrix(proj)

# Confirm that peak matrix has been added
getAvailableMatrices(proj)

3.1 Tissue Signatures

3.1.1 Import Tissue Signatures

# Check for enrichment of Treg signatures in Clusters
# signatures from PMID:33789089

signature_names <- c("early_progenitor_tisTreg","late_progenitor_tisTreg","tisTreg_core","tisTreg_skin","tisTreg_VAT","tisTreg_colon")
granges_list <- list()
n <- 1

for (x in signature_names){
  filename = paste("data/signatures/",x,".csv", sep = "")
  sig_csv = read.csv(filename, sep = "-", header = FALSE)
  sig_csv = data.frame(seqnames=sig_csv[,1], start=as.character(sig_csv[,2]), end=as.character(sig_csv[,3]))
  granges_list[[n]] = makeGRangesFromDataFrame(sig_csv, seqnames.field="seqnames", start.field="start", end.field="end")
  n <- n+1
}

early_progenitor_tisTreg_GR = granges_list[[1]]
late_progenitor_tisTreg_GR = granges_list[[2]]
tisTreg_core_GR = granges_list[[3]]
tisTreg_skin_GR = granges_list[[4]]
tisTreg_VAT_GR = granges_list[[5]]
tisTreg_colon_GR = granges_list[[6]]

3.1.2 Calculate Signatures Scores

# Signature z-scores can be calculated via the 'addDeviationsMatrix' function 
# of the chromVAR package. This wrapper function accepts the ArchRProject, the 
# GRanges objects with my signatures as a list, and the name for the signature 
# set as a string. The output is the ArchRProject with the z-scores and 
# deviation-scores added as additional columns to cellColData (one column for 
# each signature).

archr_add_peak_signatures = function(project, signature_list, signature_name){
  #signature_list: list of granges
  #signature_name: name string for the set of signatures
    add_df_to_cellcoldata = function(pro, pheno_df, force=FALSE){
      stopifnot(identical(rownames(pro@cellColData), rownames(pheno_df)))
      cnames = colnames(pheno_df)
      for(i in 1:ncol(pheno_df)){
        pro = addCellColData(ArchRProj = pro, data=pheno_df[, i], name = cnames[i],
                         cells = rownames(pro@cellColData), force = force)
      }
      return(pro)
    }
 if(length(signature_list)<2){
    stop('Currently, only works if at least two signatures are provided')
  }
  for(i in seq_along(signature_list)){
    names(signature_list[[i]]) = NULL
  }
  project <- addPeakAnnotations(ArchRProj = project, 
                             regions = signature_list,
                             name = signature_name,
                             force=T)

  method_use = "chromVAR" #does only work with fixed width peaks
  if(any(sapply(signature_list, function(x) length(unique(width(x)))) > 1)){
     method_use = 'ArchR'
  }
  
  project <- addBgdPeaks(project, force = T, method=method_use)
  
  project <- addDeviationsMatrix(
    ArchRProj = project, 
    peakAnnotation = signature_name,
    binarize=TRUE,
    bgdPeaks = getBgdPeaks(project, method = method_use),
    force = TRUE
  )
  
  dr_df = as.data.frame(project@cellColData)
  sig_se = getMatrixFromProject(project, paste0(signature_name, 'Matrix'))
  z_score_mat = t(assays(sig_se)[['z']])
  z_score_mat = z_score_mat[match(rownames(dr_df),rownames(z_score_mat)), ]
  colnames(z_score_mat) = paste0('z_', colnames(z_score_mat))
  stopifnot(identical(rownames(z_score_mat), rownames(dr_df)))
  dev_score_mat = t(assays(sig_se)[['deviations']])
  dev_score_mat = dev_score_mat[match(rownames(dr_df),rownames(dev_score_mat)), ]
  colnames(dev_score_mat) = paste0('dev_', colnames(dev_score_mat))
  stopifnot(identical(rownames(dev_score_mat), rownames(dr_df)))
  
  project = add_df_to_cellcoldata(project, z_score_mat, force=T)
  project = add_df_to_cellcoldata(project, dev_score_mat, force=T)
  return(project)
}

signature_list = list(early_progenitor_tisTreg_sig = early_progenitor_tisTreg_GR, 
  late_progenitor_tisTreg_sig = late_progenitor_tisTreg_GR, 
  tisTreg_core_sig = tisTreg_core_GR, 
  tisTreg_skin_sig = tisTreg_skin_GR, 
  tisTreg_VAT_sig = tisTreg_VAT_GR, 
  tisTreg_colon_sig = tisTreg_colon_GR 
)

proj <- archr_add_peak_signatures(proj, signature_list, "signatures")

# Overlay signatures on UMAP
p_early_progenitor_tisTreg_sig = plotEmbedding(
  ArchRProj = proj, 
  colorBy = "cellColData", 
  name = "z_early_progenitor_tisTreg_sig", 
  embedding = "UMAP",
  plotAs = "points",
  size = 0.5
  )

p_late_progenitor_tisTreg_sig = plotEmbedding(
  ArchRProj = proj, 
  colorBy = "cellColData", 
  name = "z_late_progenitor_tisTreg_sig", 
  embedding = "UMAP",
  plotAs = "points",
  size = 0.5
  )

p_tisTreg_core_sig = plotEmbedding(
  ArchRProj = proj, 
  colorBy = "cellColData", 
  name = "z_tisTreg_core_sig", 
  embedding = "UMAP",
  plotAs = "points",
  size = 0.5
  )

p_tisTreg_skin_sig = plotEmbedding(
  ArchRProj = proj, 
  colorBy = "cellColData", 
  name = "z_tisTreg_skin_sig", 
  embedding = "UMAP",
  plotAs = "points",
  size = 0.5
  )

p_tisTreg_VAT_sig = plotEmbedding(
  ArchRProj = proj, 
  colorBy = "cellColData", 
  name = "z_tisTreg_VAT_sig", 
  embedding = "UMAP",
  plotAs = "points",
  size = 0.5
  )

p_tisTreg_colon_sig = plotEmbedding(
  ArchRProj = proj, 
  colorBy = "cellColData", 
  name = "z_tisTreg_colon_sig", 
  embedding = "UMAP",
  plotAs = "points",
  size = 0.5
  )

# Plot
ggAlignPlots(p_early_progenitor_tisTreg_sig, p_late_progenitor_tisTreg_sig, p_tisTreg_core_sig, type = "h")

ggAlignPlots(p_tisTreg_skin_sig, p_tisTreg_VAT_sig, p_tisTreg_colon_sig, type = "h")

3.2 Identifying Marker Peaks

3.2.1 Identifying Marker Peaks grouped by Clusters

# Identify marker peaks (differential peaks) between clusters (cell types):
markersPeaks = getMarkerFeatures(
  ArchRProj = proj, 
  useMatrix = "PeakMatrix", 
  groupBy = "Clusters2",
  bias = c("TSSEnrichment", "log10(nFrags)"),
  testMethod = "wilcoxon"
)

# Retrieve particular slices of this SummarizedExperiment that we are interested
# in: The default behavior of this function is to return a list of DataFrame 
# objects, one for each cell group
markerList = getMarkers(markersPeaks, cutOff = "FDR <= 0.01 & Log2FC >= 1")

# Instead of a list of DataFrame objects, we can use getMarkers() to return a 
# GRangesList object by setting returnGR = TRUE
markerList2 = getMarkers(markersPeaks, cutOff = "FDR <= 0.01 & Log2FC >= 1", returnGR = TRUE)

# Plot marker peaks as heatmap
heatmapPeaks = plotMarkerHeatmap(
  seMarker = markersPeaks, 
  cutOff = "FDR <= 0.001 & Log2FC >= 3",
  transpose = FALSE,
)

# Plot this heatmap using draw()
heatmap_peaks = ComplexHeatmap::draw(heatmapPeaks, 
  heatmap_legend_side = "right"
)

4 Motif Enrichment

4.1 Add Motif Annotations

# Look for motifs that are enriched in peaks that are up or down in various cell
# types
# We must first add these motif annotations to our ArchRProject; this 
# effectively creates a binary matrix where the presence of a motif in each peak
# is indicated numerically
proj = addMotifAnnotations(ArchRProj = proj, motifSet = "cisbp", name = "Motif")

4.2 Motif Enrichment in Marker Peaks

4.2.1 Motif Enrichment in Marker Peaks grouped by Clusters

# We perform motif enrichment on our marker peaks 
# In this dataset we used more strict thresholds for the FDR and Log2FC 
# because of the size of the data; otherwise plots could not be generated.
enrichMotifs = peakAnnoEnrichment(
  seMarker = markersPeaks, 
  ArchRProj = proj,
  peakAnnotation = "Motif",
  cutOff = "FDR <= 0.001 & Log2FC >= 2"
)

# Plot these motif enrichments across all cell groups
heatmapEM = plotEnrichHeatmap(enrichMotifs, 
  n = 10, 
  transpose = TRUE
)

# Visualize
row_order = c("C1 naive CD4+ colon","C3 naive CD4+ spleen/fat","C2 naive CD4+ spleen","C4 naive CD4+ spleen","C5 naive CD4+ spleen","C19 early precursor spleen","C18 late precursor spleen","C16 tisTreg spleen","C8 tisTreg fat","C9 tisTreg skin","C6 tisTreg colon","C7 pTreg colon","C14 Th1 spleen/fat","C15 Th1 colon","C11 Th1 skin","C12 Th17 skin","C10 Th2 colon","C13","C17","C20","C21")

heatmapEM2 = ComplexHeatmap::draw(heatmapEM, 
  heatmap_legend_side = "bot", 
  annotation_legend_side = "bot",
  row_order = row_order
)

5 ChromVAR Deviations Enrichment

ChromVAR is designed for predicting enrichment of TF activity on a per-cell basis from sparse chromatin accessibility data

5.1 Add Motif Deviations

# Add a set of background peaks; sample peaks based on similarity in GC-content 
# and nFrags across all samples using the Mahalanobis distance
proj = addBgdPeaks(proj, force = T)

# Compute per-cell deviations across all of our motif annotations
proj = addDeviationsMatrix(
  ArchRProj = proj,
  peakAnnotation = "Motif",
  force = TRUE
)

# Access deviations
# If we want this function to return a ggplot object, we set plot = TRUE 
# otherwise, this function would return the DataFrame object.
plotVarDev = getVarDeviations(proj, name = "MotifMatrix", plot = TRUE)

## DataFrame with 6 rows and 6 columns
##      seqnames     idx        name combinedVars combinedMeans      rank
##         <Rle> <array>     <array>    <numeric>     <numeric> <integer>
## f104        z     104     Fos_104     117.3228  -0.008371192         1
## f843        z     843 Smarcc1_843     117.0174  -0.000772193         2
## f108        z     108   Bach1_108      89.5822   0.003612684         3
## f119        z     119   Bach2_119      71.1386   0.039212150         4
## f98         z      98     Fosb_98      69.8579  -0.008232795         5
## f135        z     135    Jund_135      62.7090   0.083707388         6

# plot variable deviations:
plotVarDev

5.2 Plot the Distribution of Motifs in the Clusters

# If we want to extract a subset of motifs for downstream analysis,
# we can do this using the getFeatures() function.
motifs = c("Batf")
markerMotifs = getFeatures(proj, select = paste(motifs, collapse="|"), useMatrix = "MotifMatrix")
markerMotifs

## [1] "z:Batf_790"           "z:Batf3_789"          "deviations:Batf_790" 
## [4] "deviations:Batf3_789"

# Exclude Batf3
markerMotifs = grep("z:", markerMotifs, value = TRUE)
markerMotifs = markerMotifs[markerMotifs %ni% "z:Batf3_789"]
markerMotifs

## [1] "z:Batf_790"

# We supply the impute weights that we calculated previously during our gene 
# score analyses (impute weights allow us to smooth the signal across nearby 
# cells, helpful in the context of our sparse scATAC-seq data).

# First, re-run addImputeWeights (subset cells from the original imputation)
proj = addImputeWeights(proj)

# Then, plot distribution of chromVAR deviation scores
p_zscore = plotGroups(ArchRProj = proj, 
  groupBy = "Clusters", 
  colorBy = "MotifMatrix", 
  name = markerMotifs,
  imputeWeights = getImputeWeights(proj)
)

p_zscore

5.3 Overlay Motif z-score and corresponding gene score on UMAP

# Instead of looking at the distributions of these z-scores, we can overlay the 
# z-scores on our UMAP embedding as we’ve done previously for gene scores.
# Overlay z-scores
p_zscore_UMAP_Batf = plotEmbedding(
  ArchRProj = proj, 
  colorBy = "MotifMatrix", 
  name = "z:Batf_790", 
  embedding = "UMAP",
  plotAs = "points",
  imputeWeights = getImputeWeights(proj),
  size = 0.5
)

p_gene_exp_UMAP_Batf = plotEmbedding(
  ArchRProj = proj, 
  colorBy = "GeneScoreMatrix", 
  name = "Batf", 
  embedding = "UMAP",
  plotAs = "points",
  imputeWeights = getImputeWeights(proj),
  size = 0.5
)

plot_grid(p_zscore_UMAP_Batf, p_gene_exp_UMAP_Batf, align = "h", ncol = 2)

6 Motif Footprinting

# Obtain the positions of the relevant motifs
motifPositions = getPositions(proj, name = "Motif") 
motifPositions

## GRangesList object of length 884:
## $Tcfap2a_1
## GRanges object with 13813 ranges and 1 metadata column:
##           seqnames              ranges strand |     score
##              <Rle>           <IRanges>  <Rle> | <numeric>
##       [1]     chr1     4808001-4808010      - |   7.48774
##       [2]     chr1     4857513-4857522      + |   7.02247
##       [3]     chr1     6214851-6214860      + |   6.96612
##       [4]     chr1     7395839-7395848      + |   7.11519
##       [5]     chr1     7395840-7395849      - |   7.56411
##       ...      ...                 ...    ... .       ...
##   [13809]     chrX 167203920-167203929      + |   7.26500
##   [13810]     chrX 167209093-167209102      - |   7.27392
##   [13811]     chrX 167212044-167212053      + |   7.77924
##   [13812]     chrX 168673410-168673419      - |   7.46782
##   [13813]     chrX 169319935-169319944      + |   7.70292
##   -------
##   seqinfo: 20 sequences from an unspecified genome; no seqlengths
## 
## ...
## <883 more elements>

# This creates a GRangesList object where each TF motif is represented by a 
# separate GRanges object. We can subset this GRangesList to a few TF motifs 
# that we are interested in.
motifs_fp = c("Tbx21", "Gata3", "Rorc", "Rora", "Foxp3", "Batf", "Ikzf2")
markerMotifs_fp = unlist(lapply(motifs_fp, function(x) grep(x, names(motifPositions), value = TRUE)))
markerMotifs_fp

## [1] "Tbx21_762" "Gata3_384" "Rorc_678"  "Rora_682"  "Foxp3_311" "Batf3_789"
## [7] "Batf_790"

# Exclude Batf3
markerMotifs_fp = markerMotifs_fp[markerMotifs_fp %ni% "Batf3_789"]
markerMotifs_fp

## [1] "Tbx21_762" "Gata3_384" "Rorc_678"  "Rora_682"  "Foxp3_311" "Batf_790"

# To accurately profile TF footprints, a large number of reads is required. 
# Therefore we will use the pseudobulk data stored as group coverages calculated above.

# Compute footprints for the subset of marker motifs defined above
seFoot = getFootprints(
  ArchRProj = proj, 
  positions = motifPositions[markerMotifs_fp], 
  groupBy = "Clusters2"
)

6.1 Motif Footprinting in Treg Cells

# Subset proj (with all matrices added, see "methods.Rmd") to only Treg cells
proj_treg = proj[(proj@cellColData@listData[["Clusters"]] == c(tisTreg_cluster,tTreg_cluster)), ]

# Load ArchRProject
proj_treg = loadArchRProject(
  path = "ArchRProject_Treg", 
  force = TRUE, 
  showLogo = FALSE
  )

# Add UMAP embedding
proj_treg = addUMAP(
  ArchRProj = proj_treg, 
  reducedDims = "IterativeLSI", 
  name = "UMAP", 
  nNeighbors = 30, 
  minDist = 0.5, 
  metric = "cosine", 
  force = TRUE
)

# The key parameter here is groupBy, which defines the groups for which 
# pseudo-bulk replicates should be made.
proj_treg = addGroupCoverages(
  ArchRProj = proj_treg,
  groupBy = "Clusters2",
  minCells = 40, 
  maxCells = 500, #standard
  minReplicates = 2, #standard
  maxReplicates = 5, #standard
  sampleRatio = 0.8,
  force = TRUE
  )

# Compute footprints for the subset of marker motifs defined above
seFoot = getFootprints(
  ArchRProj = proj_treg, 
  positions = motifPositions[markerMotifs_fp],
  groupBy = "Clusters2"
)

6.2 Normalization of Footprints

6.2.1 Normalization by Subtraction of the Tn5 Bias

plotFootprints(
  seFoot = seFoot,
  ArchRProj = proj, 
  normMethod = "Subtract",
  plotName = "Footprints-Subtract-Bias",
  addDOC = FALSE,
  smoothWindow = 5
)

# Plots are saved in the output directory of the ArchRProject by default
# (ggplot object would be extremely large).

plotFootprints(
  seFoot = seFoot,
  ArchRProj = proj_treg, 
  normMethod = "Subtract",
  plotName = "Footprints-Subtract-Bias_Treg",
  addDOC = FALSE,
  smoothWindow = 5
)

6.2.2 Normalization by Dividing by the Tn5 Bias

plotFootprints(
  seFoot = seFoot,
  ArchRProj = proj, 
  normMethod = "Divide",
  plotName = "Footprints-Divide-Bias",
  addDOC = FALSE,
  smoothWindow = 5
)

plotFootprints(
  seFoot = seFoot,
  ArchRProj = proj_treg, 
  normMethod = "Divide",
  plotName = "Footprints-Divide-Bias_Treg",
  addDOC = FALSE,
  smoothWindow = 5
)

6.2.3 Footprinting without Normalization for the Tn5 Bias

plotFootprints(
  seFoot = seFoot,
  ArchRProj = proj, 
  normMethod = "None",
  plotName = "Footprints-No-Normalization",
  addDOC = FALSE,
  smoothWindow = 5
)

plotFootprints(
  seFoot = seFoot,
  ArchRProj = proj_treg, 
  normMethod = "None",
  plotName = "Footprints-No-Normalization_Treg",
  addDOC = FALSE,
  smoothWindow = 5
)

7 Co-Accessibility Analysis

# Co-accessibility is a correlation in accessibility between two peaks 
# across many single cells. Calculate co-accessibility
proj = addCoAccessibility(
    ArchRProj = proj,
    reducedDims = "IterativeLSI"
)

# Retrieve co-accessibility information via the getCoAccessibility() function
# Increase resolution to plow fewer co-accessibility interactions
# if returnLoops = FALSE, then the function will return a dataframe instead of 
# a loop track
cA = getCoAccessibility(
    ArchRProj = proj,
    corCutOff = 0.5,
    resolution = 1000, 
    returnLoops = TRUE 
)

# Plot browser tracks of co-accessibility for our marker genes
markerGenes = c("Cd4", "Il2ra", "Foxp3", "Batf", "Ccr8", "Ikzf2", "Rorc", "Tbx21", "Ifng", "Gata3", "Il2", "Sell")

p_cA = plotBrowserTrack(
    ArchRProj = proj, 
    groupBy = "Clusters", 
    geneSymbol = markerGenes, 
    upstream = 50000,
    downstream = 50000,
    loops = getCoAccessibility(proj)
)

## GRanges object with 12 ranges and 2 metadata columns:
##        seqnames              ranges strand |     gene_id      symbol
##           <Rle>           <IRanges>  <Rle> | <character> <character>
##    [1]     chr6 124864693-124888209      - |       12504         Cd4
##    [2]     chr2   11642792-11693194      + |       16184       Il2ra
##    [3]     chrX     7579676-7595243      + |       20371       Foxp3
##    [4]    chr12   85686720-85709087      + |       53314        Batf
##    [5]     chr9 120092133-120094906      + |       12776        Ccr8
##    ...      ...                 ...    ... .         ...         ...
##    [8]    chr11   97098007-97115331      - |       57765       Tbx21
##    [9]    chr10 118441046-118445894      + |       15978        Ifng
##   [10]     chr2     9857078-9878600      - |       14462       Gata3
##   [11]     chr3   37120713-37125954      - |       16183         Il2
##   [12]     chr1 164062076-164080785      + |       20343        Sell
##   -------
##   seqinfo: 21 sequences from mm10 genome

# To plot the browser track select a specific marker gene using the $ accessor
grid::grid.newpage()
grid::grid.draw(p_cA$Batf)

8 Trajectory Analysis

# Create user-defined trajectory backbone from early via late progenitors to
# tissue Treg cells in the respective non-lymphoid tissue
Treg_trajectory_VAT = c("C19", "C18", "C16", "C8")
Treg_trajectory_skin = c("C19", "C18", "C16", "C9")
Treg_trajectory_colon = c("C19", "C18", "C16", "C6")

# Create the trajectory
# This creates a new column in cellColData that stores the pseudo-time value for 
# each cell in the trajectory
proj = addTrajectory(
    ArchRProj = proj, 
    name = "Treg_trajectory_VAT", 
    groupBy = "Clusters",
    trajectory = Treg_trajectory_VAT, 
    embedding = "UMAP", 
    force = TRUE
)

proj = addTrajectory(
    ArchRProj = proj, 
    name = "Treg_trajectory_skin", 
    groupBy = "Clusters",
    trajectory = Treg_trajectory_skin, 
    embedding = "UMAP", 
    force = TRUE
)

proj = addTrajectory(
    ArchRProj = proj, 
    name = "Treg_trajectory_colon", 
    groupBy = "Clusters",
    trajectory = Treg_trajectory_colon, 
    embedding = "UMAP", 
    force = TRUE
)

# Exclude cells with NA values because these are not part of the trajectory
proj$Treg_trajectory_VAT[!is.na(proj$Treg_trajectory_VAT)]
proj$Treg_trajectory_skin[!is.na(proj$Treg_trajectory_skin)]
proj$Treg_trajectory_colon[!is.na(proj$Treg_trajectory_colon)]

# Save ArchRProject
saveArchRProject(proj, 
  outputDirectory= "ArchRProject", 
  overwrite = TRUE,  
  load = FALSE
  )

# Plot trajectory
# This overlays the pseudo-time values on our UMAP embedding and displays an 
# arrow approximating the trajectory path from the spline-fit; cells that are 
# not part of the trajectory are coloured in gray.
Treg_trajectory_VAT_p = plotTrajectory(proj, 
    trajectory = "Treg_trajectory_VAT", 
    colorBy = "cellColData", 
    name = "Treg_trajectory_VAT",
    plotAs = "points"
)

Treg_trajectory_skin_p = plotTrajectory(proj, 
    trajectory = "Treg_trajectory_skin", 
    colorBy = "cellColData", 
    name = "Treg_trajectory_skin",
    plotAs = "points"
)

Treg_trajectory_colon_p = plotTrajectory(proj, 
    trajectory = "Treg_trajectory_colon", 
    colorBy = "cellColData", 
    name = "Treg_trajectory_colon",
    plotAs = "points"
)

ggAlignPlots(Treg_trajectory_VAT_p[[1]], Treg_trajectory_skin_p[[1]], Treg_trajectory_colon_p[[1]], type = "h")

8.1 Pseudo-Time UMAPs

# Overlay other features on the trajectory within our UMAP embedding
# (display specific features only within the cells that are relevant to our 
# trajectory)

# Overlay genescores on Treg trajectory
# Batf
Treg_traj_VAT_p_Batf = plotTrajectory(proj, 
  trajectory = "Treg_trajectory_VAT", 
  colorBy = "GeneScoreMatrix", 
  name = "Batf", 
  continuousSet = "horizonExtra",
  plotAs = "points"
)

Treg_traj_skin_p_Batf = plotTrajectory(proj, 
  trajectory = "Treg_trajectory_skin", 
  colorBy = "GeneScoreMatrix", 
  name = "Batf", 
  continuousSet = "horizonExtra",
  plotAs = "points"
)

Treg_traj_colon_p_Batf = plotTrajectory(proj, 
  trajectory = "Treg_trajectory_colon", 
  colorBy = "GeneScoreMatrix", 
  name = "Batf", 
  continuousSet = "horizonExtra",
  plotAs = "points"
)

# Overlay motif scores on Treg trajectory
# Batf
Treg_traj_VAT_p_Batf_motif = plotTrajectory(proj, 
  trajectory = "Treg_trajectory_VAT", 
  colorBy = "MotifMatrix", 
  name = "z:Batf_790", 
  continuousSet = "solarExtra",
  plotAs = "points"
)

Treg_traj_skin_p_Batf_motif = plotTrajectory(proj, 
  trajectory = "Treg_trajectory_skin", 
  colorBy = "MotifMatrix", 
  name = "z:Batf_790", 
  continuousSet = "solarExtra",
  plotAs = "points"
)

Treg_traj_colon_p_Batf_motif = plotTrajectory(proj, 
  trajectory = "Treg_trajectory_colon", 
  colorBy = "MotifMatrix", 
  name = "z:Batf_790", 
  continuousSet = "solarExtra",
  plotAs = "points"
)

# Plot
ggAlignPlots(Treg_trajectory_VAT_p[[1]], Treg_traj_VAT_p_Batf[[2]], type = "h", sizes = c(1,2))

ggAlignPlots(Treg_trajectory_skin_p[[1]], Treg_traj_skin_p_Batf[[2]], type = "h", sizes = c(1,2))

ggAlignPlots(Treg_trajectory_colon_p[[1]], Treg_traj_colon_p_Batf[[2]], type = "h", sizes = c(1,2))

ggAlignPlots(Treg_trajectory_VAT_p[[1]], Treg_traj_VAT_p_Batf_motif[[2]], type = "h", sizes = c(1,2))

ggAlignPlots(Treg_trajectory_skin_p[[1]], Treg_traj_skin_p_Batf_motif[[2]], type = "h", sizes = c(1,2))

ggAlignPlots(Treg_trajectory_colon_p[[1]], Treg_traj_colon_p_Batf_motif[[2]], type = "h", sizes = c(1,2))

Using single-cell chromatin accessibility sequencing to characterize CD4+ T cells from murine tissues - downstream analysis

Kathrin Luise Braband, Annekathrin Ludt, Sara Salome Helbich, Malte Simon, Niklas Beumer, Benedikt Brors, Federico Marini and Michael Delacher

2023-05-08

1 Final filtered Dataset

1.1 Visualisation in UMAP Embedding

1.2 Overlay Gene Scores on UMAP

1.3 Define Clusters

2 Pseudo-bulk Replicates

3 Peak-Calling

3.1 Tissue Signatures

3.1.1 Import Tissue Signatures

3.1.2 Calculate Signatures Scores

3.2 Identifying Marker Peaks

3.2.1 Identifying Marker Peaks grouped by Clusters

4 Motif Enrichment

4.1 Add Motif Annotations

4.2 Motif Enrichment in Marker Peaks

4.2.1 Motif Enrichment in Marker Peaks grouped by Clusters

5 ChromVAR Deviations Enrichment

5.1 Add Motif Deviations

5.2 Plot the Distribution of Motifs in the Clusters

5.3 Overlay Motif z-score and corresponding gene score on UMAP

6 Motif Footprinting

6.1 Motif Footprinting in Treg Cells

6.2 Normalization of Footprints

6.2.1 Normalization by Subtraction of the Tn5 Bias

6.2.2 Normalization by Dividing by the Tn5 Bias

6.2.3 Footprinting without Normalization for the Tn5 Bias

7 Co-Accessibility Analysis

8 Trajectory Analysis

8.1 Pseudo-Time UMAPs