# Set seeds and initial parameters for ArchR
set.seed(1)

# Number of cores to use (adapt to your machine, but has to be > 1 otherwise plotTSSenrichment does not work properly)
addArchRThreads(threads = 8)
## Setting default number of Parallel threads to 8.
# Reference genome (adapt to your data, use the same reference that was used for the alignment)
addArchRGenome("mm10")
## Setting default genome to Mm10.
# Load ArchRProject
proj = loadArchRProject(
  path = "ArchRProject", 
  force = TRUE, 
  showLogo = FALSE
  )
# Get an overview of the project
proj
## 
##            ___      .______        ______  __    __  .______      
##           /   \     |   _  \      /      ||  |  |  | |   _  \     
##          /  ^  \    |  |_)  |    |  ,----'|  |__|  | |  |_)  |    
##         /  /_\  \   |      /     |  |     |   __   | |      /     
##        /  _____  \  |  |\  \\___ |  `----.|  |  |  | |  |\  \\___.
##       /__/     \__\ | _| `._____| \______||__|  |__| | _| `._____|
## 
## class: ArchRProject 
## outputDirectory: /omics/groups/OE0436/internal/Regensburg/Mainz/03_archr/scripts/methods/paper/ArchRProject 
## samples(5): scATAC_4 scATAC_1 scATAC_9 scATAC_8 scATAC_5
## sampleColData names(1): ArrowFiles
## cellColData names(34): Sample TSSEnrichment ... Treg_trajectory_skin
##   Treg_trajectory_colon
## numberOfCells(1): 14435
## medianTSS(1): 20.872
## medianFrags(1): 29176

1 Final filtered Dataset

1.1 Visualisation in UMAP Embedding

# Color by clusters
p_clusters = plotEmbedding(
  ArchRProj = proj, 
  colorBy = "cellColData", 
  name = "Clusters", 
  embedding = "UMAP", 
  size = 0.5
  )

# Color by samples
p_samples = plotEmbedding(
  ArchRProj = proj, 
  colorBy = "cellColData", 
  name = "Sample", 
  embedding = "UMAP", 
  size = 0.5
  )

plot_grid(p_clusters, p_samples, align = "h", ncol = 2)

# Color by doublet enrichment score
p_doubscore = plotEmbedding(
  ArchRProj = proj, 
  colorBy = "cellColData", 
  name = "DoubletScore", 
  embedding = "UMAP", 
  plotAs = "points", 
  size = 0.5
)

p_doubenr=plotEmbedding(
  ArchRProj = proj, 
  colorBy = "cellColData", 
  name = "DoubletEnrichment", 
  embedding = "UMAP", 
  plotAs = "points", 
  size = 0.5
)

# Color by number of fragments per cell
p_nfrags = plotEmbedding(
  ArchRProj = proj, 
  colorBy = "cellColData", 
  name = "nFrags", 
  embedding = "UMAP", 
  plotAs = "points", 
  size = 0.5
)

# Plot
plot_grid(p_doubscore, p_doubenr, p_nfrags, align = "h", ncol = 3, scale = 1.1)

1.2 Overlay Gene Scores on UMAP

# Overlay gene scores on UMAP embedding, use MAGIC smoothing
# Get marker features
markersGS = getMarkerFeatures(
  ArchRProj = proj, 
  useMatrix = "GeneScoreMatrix", 
  groupBy = "Clusters",
  bias = c("TSSEnrichment", "log10(nFrags)"),
  testMethod = "wilcoxon"
)

# Define which genes to plot
markerGenes = c("Cd4", "Il2ra", "Foxp3", "Batf", "Ccr8", "Ikzf2", "Rorc", "Tbx21", "Ifng", "Gata3", "Il2", "Sell")

# Add impute weights
proj = addImputeWeights(proj)

# Plot
magic_genes = plotEmbedding(
  ArchRProj = proj, 
  colorBy = "GeneScoreMatrix", 
  name = markerGenes, 
  embedding = "UMAP",
  plotAs = "points",
  imputeWeights = getImputeWeights(proj),
  size = 0.5
)

# Plot all genes in cowplot
magic_genes_cow = lapply(magic_genes, function(x){
  x + guides(color = FALSE, fill = FALSE) + 
    theme_ArchR(baseSize = 6.5) +
    theme(plot.margin = unit(c(0, 0, 0, 0), "cm")) +
    theme(
      axis.text.x=element_blank(), 
      axis.ticks.x=element_blank(), 
      axis.text.y=element_blank(), 
      axis.ticks.y=element_blank()
    )
})
do.call(cowplot::plot_grid, c(list(ncol = 4), magic_genes_cow))

# Save as PDF
plotPDF(magic_genes_cow,  
  name = "UMAP_genescores.pdf", 
  ArchRProj = proj, 
  addDOC = FALSE, 
  width = 5, 
  height = 5
)
p_Cd4 = plotGroups(
  ArchRProj = proj, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Cd4",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Cd4

p_Il2ra = plotGroups(
  ArchRProj = proj, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Il2ra",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Il2ra

p_Foxp3 = plotGroups(
  ArchRProj = proj, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Foxp3",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Foxp3

p_Batf = plotGroups(
  ArchRProj = proj, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Batf",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Batf

p_Ccr8 = plotGroups(
  ArchRProj = proj, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Ccr8",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Ccr8

p_Ikzf2 = plotGroups(
  ArchRProj = proj, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Ikzf2",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Ikzf2

p_Rorc = plotGroups(
  ArchRProj = proj, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Rorc",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Rorc

p_Tbx21 = plotGroups(
  ArchRProj = proj, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Tbx21",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Tbx21

p_Ifng = plotGroups(
  ArchRProj = proj, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Ifng",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Ifng

p_Gata3 = plotGroups(
  ArchRProj = proj, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Gata3",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Gata3

# Look at tissue Tregs and their precursor stages in the spleen
markerGenes_treg = c("Foxp3", "Batf", "Nfil3", "Klrg1", "Gata3", "Il1rl1", "Id3", "Id2", "Ikzf2")

# Plot
magic_genes_treg = plotEmbedding(
  ArchRProj = proj, 
  colorBy = "GeneScoreMatrix", 
  name = markerGenes_treg, 
  embedding = "UMAP",
  plotAs = "points", 
  imputeWeights = getImputeWeights(proj),
  size = 0.5
)

# Plot all genes in cowplot
magic_genes_cow_treg = lapply(magic_genes_treg, function(x){
  x + guides(color = FALSE, fill = FALSE) + 
    theme_ArchR(baseSize = 6.5) +
    theme(plot.margin = unit(c(0, 0, 0, 0), "cm")) +
    theme(
      axis.text.x=element_blank(), 
      axis.ticks.x=element_blank(), 
      axis.text.y=element_blank(), 
      axis.ticks.y=element_blank()
    )
})
do.call(cowplot::plot_grid, c(list(ncol = 3), magic_genes_cow_treg))

# To further characterize Treg cells in our dataset, we look at various Treg
# markers
p_Foxp3 = plotGroups(
  ArchRProj = proj, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Foxp3",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Foxp3

p_Batf = plotGroups(
  ArchRProj = proj, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Batf",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Batf

p_Nfil3 = plotGroups(
  ArchRProj = proj, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Nfil3",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Nfil3

p_Klrg1 = plotGroups(
  ArchRProj = proj, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Klrg1",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Klrg1

p_Gata3 = plotGroups(
  ArchRProj = proj, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Gata3",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Gata3

p_Il1rl1 = plotGroups(
  ArchRProj = proj, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Il1rl1",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Il1rl1

p_Id3 = plotGroups(
  ArchRProj = proj, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Id3",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Id3

p_Id2 = plotGroups(
  ArchRProj = proj, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Id2",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Id2

p_Ikzf2 = plotGroups(
  ArchRProj = proj, 
  groupBy = "Clusters", 
  colorBy = "GeneScoreMatrix", 
  name = "Ikzf2",
  plotAs = "violin",
  alpha = 0.4,
  addBoxPlot = TRUE
)
p_Ikzf2

1.3 Define Clusters

# Define clusters for comparisons/ project subsetting (to be performed later)
tisTreg_cluster = c("C16", "C8", "C9", "C6") #C16 spleen, C8 fat, C9 skin, C6 colon
tTreg_cluster = c("C19", "C18") #C19 early precursor, C18 late precursor  
pTreg_cluster = "C7"

# Also scATAC_8 is from spleen, however this sample includes only CD25+ cells 
# whereas all other samples are CD4+
spleen = "scATAC_1" 
skin = "scATAC_9"
colon = "scATAC_4"
fat = "scATAC_5"
tissue = c(skin, colon, fat)
# Add cluster names to AchRProject
clusters = as.data.frame(proj@cellColData$Clusters)
colnames(clusters) = "V1"

clusters = clusters %>% mutate(across("V1", str_replace, "^C16$", "C16 tisTreg spleen")) 
clusters = clusters %>% mutate(across("V1", str_replace, "^C8$", "C8 tisTreg fat"))
clusters = clusters %>% mutate(across("V1", str_replace, "^C9$", "C9 tisTreg skin"))
clusters = clusters %>% mutate(across("V1", str_replace, "^C6$", "C6 tisTreg colon"))
clusters = clusters %>% mutate(across("V1", str_replace, "^C7$", "C7 pTreg colon")) 
clusters = clusters %>% mutate(across("V1", str_replace, "^C19$", "C19 early precursor spleen"))
clusters = clusters %>% mutate(across("V1", str_replace, "^C18$", "C18 late precursor spleen"))
clusters = clusters %>% mutate(across("V1", str_replace, "^C1$", "C1 naive CD4+ colon"))
clusters = clusters %>% mutate(across("V1", str_replace, "^C3$", "C3 naive CD4+ spleen/fat"))
clusters = clusters %>% mutate(across("V1", str_replace, "^C2$", "C2 naive CD4+ spleen"))
clusters = clusters %>% mutate(across("V1", str_replace, "^C4$", "C4 naive CD4+ spleen"))
clusters = clusters %>% mutate(across("V1", str_replace, "^C5$", "C5 naive CD4+ spleen"))
clusters = clusters %>% mutate(across("V1", str_replace, "^C14$", "C14 Th1 spleen/fat"))
clusters = clusters %>% mutate(across("V1", str_replace, "^C15$", "C15 Th1 colon"))
clusters = clusters %>% mutate(across("V1", str_replace, "^C11$", "C11 Th1 skin"))
clusters = clusters %>% mutate(across("V1", str_replace, "^C12$", "C12 Th17 skin"))
clusters = clusters %>% mutate(across("V1", str_replace, "^C10$", "C10 Th2 colon"))

clusters_vector = as.vector(clusters$V1)

proj@cellColData$Clusters2 = clusters_vector

2 Pseudo-bulk Replicates

# We have a look at the number of cells in the individual clusters
cM_doubfilter = confusionMatrix(paste0(proj$Clusters2), paste0(proj$Sample))
cM_doubfilter
## 21 x 5 sparse Matrix of class "dgCMatrix"
##                            scATAC_1 scATAC_4 scATAC_5 scATAC_8 scATAC_9
## C4 naive CD4+ spleen            482        .        .        .        .
## C5 naive CD4+ spleen            282        3        8        4        1
## C2 naive CD4+ spleen           2707        .       53        .        .
## C18 late precursor spleen         8        .        .      291        .
## C16 tisTreg spleen               32        4       19      475        8
## C14 Th1 spleen/fat              174       40      548        2       13
## C19 early precursor spleen      170        9       45     1191        3
## C8 tisTreg fat                   10        4      118       30        6
## C11 Th1 skin                      1        1        6        .      892
## C3 naive CD4+ spleen/fat        244       43      228        .       10
## C12 Th17 skin                     1        7       15        .      472
## C13                               .      461        6        .        1
## C6 tisTreg colon                  .      777       22        .        8
## C1 naive CD4+ colon               .     1276       20        .        .
## C7 pTreg colon                    .      513        6        .        .
## C15 Th1 colon                     .     1377        3        .        2
## C20                               .       23        7        .       30
## C21                               .      188       13        .       22
## C10 Th2 colon                     .      104       30        .        2
## C17                               .       30        5        .        1
## C9 tisTreg skin                   .        4        1        3      840
# The key parameter here is groupBy, which defines the groups for which 
# pseudo-bulk replicates should be made
proj = addGroupCoverages(
  ArchRProj = proj,
  groupBy = "Clusters2",
  minCells = 40, 
  maxCells = 500, 
  minReplicates = 2,
  maxReplicates = 5, 
  sampleRatio = 0.8,
  force = TRUE
  )

3 Peak-Calling

# You can use the following function to search the path to Macs2
# However, sometimes this might not work and you have to manually add the path 
# like it is shown in the second line. 
pathToMacs2 = findMacs2()
# If you manually add the path, you have to change this line!
pathToMacs2 = "../../../../Macs2/bin/macs2"

proj = addReproduciblePeakSet(
  ArchRProj = proj, 
  groupBy = "Clusters2", 
  pathToMacs2 = pathToMacs2
)
# Add peak matrix to ArchRProject
proj = addPeakMatrix(proj)

# Confirm that peak matrix has been added
getAvailableMatrices(proj)

3.1 Tissue Signatures

3.1.1 Import Tissue Signatures

# Check for enrichment of Treg signatures in Clusters
# signatures from PMID:33789089

signature_names <- c("early_progenitor_tisTreg","late_progenitor_tisTreg","tisTreg_core","tisTreg_skin","tisTreg_VAT","tisTreg_colon")
granges_list <- list()
n <- 1

for (x in signature_names){
  filename = paste("data/signatures/",x,".csv", sep = "")
  sig_csv = read.csv(filename, sep = "-", header = FALSE)
  sig_csv = data.frame(seqnames=sig_csv[,1], start=as.character(sig_csv[,2]), end=as.character(sig_csv[,3]))
  granges_list[[n]] = makeGRangesFromDataFrame(sig_csv, seqnames.field="seqnames", start.field="start", end.field="end")
  n <- n+1
}

early_progenitor_tisTreg_GR = granges_list[[1]]
late_progenitor_tisTreg_GR = granges_list[[2]]
tisTreg_core_GR = granges_list[[3]]
tisTreg_skin_GR = granges_list[[4]]
tisTreg_VAT_GR = granges_list[[5]]
tisTreg_colon_GR = granges_list[[6]]

3.1.2 Calculate Signatures Scores

# Signature z-scores can be calculated via the 'addDeviationsMatrix' function 
# of the chromVAR package. This wrapper function accepts the ArchRProject, the 
# GRanges objects with my signatures as a list, and the name for the signature 
# set as a string. The output is the ArchRProject with the z-scores and 
# deviation-scores added as additional columns to cellColData (one column for 
# each signature).

archr_add_peak_signatures = function(project, signature_list, signature_name){
  #signature_list: list of granges
  #signature_name: name string for the set of signatures
    add_df_to_cellcoldata = function(pro, pheno_df, force=FALSE){
      stopifnot(identical(rownames(pro@cellColData), rownames(pheno_df)))
      cnames = colnames(pheno_df)
      for(i in 1:ncol(pheno_df)){
        pro = addCellColData(ArchRProj = pro, data=pheno_df[, i], name = cnames[i],
                         cells = rownames(pro@cellColData), force = force)
      }
      return(pro)
    }
 if(length(signature_list)<2){
    stop('Currently, only works if at least two signatures are provided')
  }
  for(i in seq_along(signature_list)){
    names(signature_list[[i]]) = NULL
  }
  project <- addPeakAnnotations(ArchRProj = project, 
                             regions = signature_list,
                             name = signature_name,
                             force=T)

  method_use = "chromVAR" #does only work with fixed width peaks
  if(any(sapply(signature_list, function(x) length(unique(width(x)))) > 1)){
     method_use = 'ArchR'
  }
  
  project <- addBgdPeaks(project, force = T, method=method_use)
  
  project <- addDeviationsMatrix(
    ArchRProj = project, 
    peakAnnotation = signature_name,
    binarize=TRUE,
    bgdPeaks = getBgdPeaks(project, method = method_use),
    force = TRUE
  )
  
  dr_df = as.data.frame(project@cellColData)
  sig_se = getMatrixFromProject(project, paste0(signature_name, 'Matrix'))
  z_score_mat = t(assays(sig_se)[['z']])
  z_score_mat = z_score_mat[match(rownames(dr_df),rownames(z_score_mat)), ]
  colnames(z_score_mat) = paste0('z_', colnames(z_score_mat))
  stopifnot(identical(rownames(z_score_mat), rownames(dr_df)))
  dev_score_mat = t(assays(sig_se)[['deviations']])
  dev_score_mat = dev_score_mat[match(rownames(dr_df),rownames(dev_score_mat)), ]
  colnames(dev_score_mat) = paste0('dev_', colnames(dev_score_mat))
  stopifnot(identical(rownames(dev_score_mat), rownames(dr_df)))
  
  project = add_df_to_cellcoldata(project, z_score_mat, force=T)
  project = add_df_to_cellcoldata(project, dev_score_mat, force=T)
  return(project)
}

signature_list = list(early_progenitor_tisTreg_sig = early_progenitor_tisTreg_GR, 
  late_progenitor_tisTreg_sig = late_progenitor_tisTreg_GR, 
  tisTreg_core_sig = tisTreg_core_GR, 
  tisTreg_skin_sig = tisTreg_skin_GR, 
  tisTreg_VAT_sig = tisTreg_VAT_GR, 
  tisTreg_colon_sig = tisTreg_colon_GR 
)

proj <- archr_add_peak_signatures(proj, signature_list, "signatures")
# Overlay signatures on UMAP
p_early_progenitor_tisTreg_sig = plotEmbedding(
  ArchRProj = proj, 
  colorBy = "cellColData", 
  name = "z_early_progenitor_tisTreg_sig", 
  embedding = "UMAP",
  plotAs = "points",
  size = 0.5
  )

p_late_progenitor_tisTreg_sig = plotEmbedding(
  ArchRProj = proj, 
  colorBy = "cellColData", 
  name = "z_late_progenitor_tisTreg_sig", 
  embedding = "UMAP",
  plotAs = "points",
  size = 0.5
  )

p_tisTreg_core_sig = plotEmbedding(
  ArchRProj = proj, 
  colorBy = "cellColData", 
  name = "z_tisTreg_core_sig", 
  embedding = "UMAP",
  plotAs = "points",
  size = 0.5
  )

p_tisTreg_skin_sig = plotEmbedding(
  ArchRProj = proj, 
  colorBy = "cellColData", 
  name = "z_tisTreg_skin_sig", 
  embedding = "UMAP",
  plotAs = "points",
  size = 0.5
  )

p_tisTreg_VAT_sig = plotEmbedding(
  ArchRProj = proj, 
  colorBy = "cellColData", 
  name = "z_tisTreg_VAT_sig", 
  embedding = "UMAP",
  plotAs = "points",
  size = 0.5
  )

p_tisTreg_colon_sig = plotEmbedding(
  ArchRProj = proj, 
  colorBy = "cellColData", 
  name = "z_tisTreg_colon_sig", 
  embedding = "UMAP",
  plotAs = "points",
  size = 0.5
  )

# Plot
ggAlignPlots(p_early_progenitor_tisTreg_sig, p_late_progenitor_tisTreg_sig, p_tisTreg_core_sig, type = "h")

ggAlignPlots(p_tisTreg_skin_sig, p_tisTreg_VAT_sig, p_tisTreg_colon_sig, type = "h")

3.2 Identifying Marker Peaks

3.2.1 Identifying Marker Peaks grouped by Clusters

# Identify marker peaks (differential peaks) between clusters (cell types):
markersPeaks = getMarkerFeatures(
  ArchRProj = proj, 
  useMatrix = "PeakMatrix", 
  groupBy = "Clusters2",
  bias = c("TSSEnrichment", "log10(nFrags)"),
  testMethod = "wilcoxon"
)

# Retrieve particular slices of this SummarizedExperiment that we are interested
# in: The default behavior of this function is to return a list of DataFrame 
# objects, one for each cell group
markerList = getMarkers(markersPeaks, cutOff = "FDR <= 0.01 & Log2FC >= 1")

# Instead of a list of DataFrame objects, we can use getMarkers() to return a 
# GRangesList object by setting returnGR = TRUE
markerList2 = getMarkers(markersPeaks, cutOff = "FDR <= 0.01 & Log2FC >= 1", returnGR = TRUE)

# Plot marker peaks as heatmap
heatmapPeaks = plotMarkerHeatmap(
  seMarker = markersPeaks, 
  cutOff = "FDR <= 0.001 & Log2FC >= 3",
  transpose = FALSE,
)
# Plot this heatmap using draw()
heatmap_peaks = ComplexHeatmap::draw(heatmapPeaks, 
  heatmap_legend_side = "right"
)

4 Motif Enrichment

4.1 Add Motif Annotations

# Look for motifs that are enriched in peaks that are up or down in various cell
# types
# We must first add these motif annotations to our ArchRProject; this 
# effectively creates a binary matrix where the presence of a motif in each peak
# is indicated numerically
proj = addMotifAnnotations(ArchRProj = proj, motifSet = "cisbp", name = "Motif")

4.2 Motif Enrichment in Marker Peaks

4.2.1 Motif Enrichment in Marker Peaks grouped by Clusters

# We perform motif enrichment on our marker peaks 
# In this dataset we used more strict thresholds for the FDR and Log2FC 
# because of the size of the data; otherwise plots could not be generated.
enrichMotifs = peakAnnoEnrichment(
  seMarker = markersPeaks, 
  ArchRProj = proj,
  peakAnnotation = "Motif",
  cutOff = "FDR <= 0.001 & Log2FC >= 2"
)

# Plot these motif enrichments across all cell groups
heatmapEM = plotEnrichHeatmap(enrichMotifs, 
  n = 10, 
  transpose = TRUE
)

# Visualize
row_order = c("C1 naive CD4+ colon","C3 naive CD4+ spleen/fat","C2 naive CD4+ spleen","C4 naive CD4+ spleen","C5 naive CD4+ spleen","C19 early precursor spleen","C18 late precursor spleen","C16 tisTreg spleen","C8 tisTreg fat","C9 tisTreg skin","C6 tisTreg colon","C7 pTreg colon","C14 Th1 spleen/fat","C15 Th1 colon","C11 Th1 skin","C12 Th17 skin","C10 Th2 colon","C13","C17","C20","C21")

heatmapEM2 = ComplexHeatmap::draw(heatmapEM, 
  heatmap_legend_side = "bot", 
  annotation_legend_side = "bot",
  row_order = row_order
)

5 ChromVAR Deviations Enrichment

ChromVAR is designed for predicting enrichment of TF activity on a per-cell basis from sparse chromatin accessibility data

5.1 Add Motif Deviations

# Add a set of background peaks; sample peaks based on similarity in GC-content 
# and nFrags across all samples using the Mahalanobis distance
proj = addBgdPeaks(proj, force = T)

# Compute per-cell deviations across all of our motif annotations
proj = addDeviationsMatrix(
  ArchRProj = proj,
  peakAnnotation = "Motif",
  force = TRUE
)
# Access deviations
# If we want this function to return a ggplot object, we set plot = TRUE 
# otherwise, this function would return the DataFrame object.
plotVarDev = getVarDeviations(proj, name = "MotifMatrix", plot = TRUE)
## DataFrame with 6 rows and 6 columns
##      seqnames     idx        name combinedVars combinedMeans      rank
##         <Rle> <array>     <array>    <numeric>     <numeric> <integer>
## f104        z     104     Fos_104     117.3228  -0.008371192         1
## f843        z     843 Smarcc1_843     117.0174  -0.000772193         2
## f108        z     108   Bach1_108      89.5822   0.003612684         3
## f119        z     119   Bach2_119      71.1386   0.039212150         4
## f98         z      98     Fosb_98      69.8579  -0.008232795         5
## f135        z     135    Jund_135      62.7090   0.083707388         6
# plot variable deviations:
plotVarDev

5.2 Plot the Distribution of Motifs in the Clusters

# If we want to extract a subset of motifs for downstream analysis,
# we can do this using the getFeatures() function.
motifs = c("Batf")
markerMotifs = getFeatures(proj, select = paste(motifs, collapse="|"), useMatrix = "MotifMatrix")
markerMotifs
## [1] "z:Batf_790"           "z:Batf3_789"          "deviations:Batf_790" 
## [4] "deviations:Batf3_789"
# Exclude Batf3
markerMotifs = grep("z:", markerMotifs, value = TRUE)
markerMotifs = markerMotifs[markerMotifs %ni% "z:Batf3_789"]
markerMotifs
## [1] "z:Batf_790"
# We supply the impute weights that we calculated previously during our gene 
# score analyses (impute weights allow us to smooth the signal across nearby 
# cells, helpful in the context of our sparse scATAC-seq data).

# First, re-run addImputeWeights (subset cells from the original imputation)
proj = addImputeWeights(proj)

# Then, plot distribution of chromVAR deviation scores
p_zscore = plotGroups(ArchRProj = proj, 
  groupBy = "Clusters", 
  colorBy = "MotifMatrix", 
  name = markerMotifs,
  imputeWeights = getImputeWeights(proj)
)

p_zscore

5.3 Overlay Motif z-score and corresponding gene score on UMAP

# Instead of looking at the distributions of these z-scores, we can overlay the 
# z-scores on our UMAP embedding as we’ve done previously for gene scores.
# Overlay z-scores
p_zscore_UMAP_Batf = plotEmbedding(
  ArchRProj = proj, 
  colorBy = "MotifMatrix", 
  name = "z:Batf_790", 
  embedding = "UMAP",
  plotAs = "points",
  imputeWeights = getImputeWeights(proj),
  size = 0.5
)

p_gene_exp_UMAP_Batf = plotEmbedding(
  ArchRProj = proj, 
  colorBy = "GeneScoreMatrix", 
  name = "Batf", 
  embedding = "UMAP",
  plotAs = "points",
  imputeWeights = getImputeWeights(proj),
  size = 0.5
)

plot_grid(p_zscore_UMAP_Batf, p_gene_exp_UMAP_Batf, align = "h", ncol = 2)

6 Motif Footprinting

# Obtain the positions of the relevant motifs
motifPositions = getPositions(proj, name = "Motif") 
motifPositions
## GRangesList object of length 884:
## $Tcfap2a_1
## GRanges object with 13813 ranges and 1 metadata column:
##           seqnames              ranges strand |     score
##              <Rle>           <IRanges>  <Rle> | <numeric>
##       [1]     chr1     4808001-4808010      - |   7.48774
##       [2]     chr1     4857513-4857522      + |   7.02247
##       [3]     chr1     6214851-6214860      + |   6.96612
##       [4]     chr1     7395839-7395848      + |   7.11519
##       [5]     chr1     7395840-7395849      - |   7.56411
##       ...      ...                 ...    ... .       ...
##   [13809]     chrX 167203920-167203929      + |   7.26500
##   [13810]     chrX 167209093-167209102      - |   7.27392
##   [13811]     chrX 167212044-167212053      + |   7.77924
##   [13812]     chrX 168673410-168673419      - |   7.46782
##   [13813]     chrX 169319935-169319944      + |   7.70292
##   -------
##   seqinfo: 20 sequences from an unspecified genome; no seqlengths
## 
## ...
## <883 more elements>
# This creates a GRangesList object where each TF motif is represented by a 
# separate GRanges object. We can subset this GRangesList to a few TF motifs 
# that we are interested in.
motifs_fp = c("Tbx21", "Gata3", "Rorc", "Rora", "Foxp3", "Batf", "Ikzf2")
markerMotifs_fp = unlist(lapply(motifs_fp, function(x) grep(x, names(motifPositions), value = TRUE)))
markerMotifs_fp
## [1] "Tbx21_762" "Gata3_384" "Rorc_678"  "Rora_682"  "Foxp3_311" "Batf3_789"
## [7] "Batf_790"
# Exclude Batf3
markerMotifs_fp = markerMotifs_fp[markerMotifs_fp %ni% "Batf3_789"]
markerMotifs_fp
## [1] "Tbx21_762" "Gata3_384" "Rorc_678"  "Rora_682"  "Foxp3_311" "Batf_790"
# To accurately profile TF footprints, a large number of reads is required. 
# Therefore we will use the pseudobulk data stored as group coverages calculated above.

# Compute footprints for the subset of marker motifs defined above
seFoot = getFootprints(
  ArchRProj = proj, 
  positions = motifPositions[markerMotifs_fp], 
  groupBy = "Clusters2"
)

6.1 Motif Footprinting in Treg Cells

# Subset proj (with all matrices added, see "methods.Rmd") to only Treg cells
proj_treg = proj[(proj@cellColData@listData[["Clusters"]] == c(tisTreg_cluster,tTreg_cluster)), ] 
# Load ArchRProject
proj_treg = loadArchRProject(
  path = "ArchRProject_Treg", 
  force = TRUE, 
  showLogo = FALSE
  )
# Add UMAP embedding
proj_treg = addUMAP(
  ArchRProj = proj_treg, 
  reducedDims = "IterativeLSI", 
  name = "UMAP", 
  nNeighbors = 30, 
  minDist = 0.5, 
  metric = "cosine", 
  force = TRUE
)
# The key parameter here is groupBy, which defines the groups for which 
# pseudo-bulk replicates should be made.
proj_treg = addGroupCoverages(
  ArchRProj = proj_treg,
  groupBy = "Clusters2",
  minCells = 40, 
  maxCells = 500, #standard
  minReplicates = 2, #standard
  maxReplicates = 5, #standard
  sampleRatio = 0.8,
  force = TRUE
  )
# Compute footprints for the subset of marker motifs defined above
seFoot = getFootprints(
  ArchRProj = proj_treg, 
  positions = motifPositions[markerMotifs_fp],
  groupBy = "Clusters2"
)

6.2 Normalization of Footprints

6.2.1 Normalization by Subtraction of the Tn5 Bias

plotFootprints(
  seFoot = seFoot,
  ArchRProj = proj, 
  normMethod = "Subtract",
  plotName = "Footprints-Subtract-Bias",
  addDOC = FALSE,
  smoothWindow = 5
)

# Plots are saved in the output directory of the ArchRProject by default
# (ggplot object would be extremely large).
plotFootprints(
  seFoot = seFoot,
  ArchRProj = proj_treg, 
  normMethod = "Subtract",
  plotName = "Footprints-Subtract-Bias_Treg",
  addDOC = FALSE,
  smoothWindow = 5
)

6.2.2 Normalization by Dividing by the Tn5 Bias

plotFootprints(
  seFoot = seFoot,
  ArchRProj = proj, 
  normMethod = "Divide",
  plotName = "Footprints-Divide-Bias",
  addDOC = FALSE,
  smoothWindow = 5
)
plotFootprints(
  seFoot = seFoot,
  ArchRProj = proj_treg, 
  normMethod = "Divide",
  plotName = "Footprints-Divide-Bias_Treg",
  addDOC = FALSE,
  smoothWindow = 5
)

6.2.3 Footprinting without Normalization for the Tn5 Bias

plotFootprints(
  seFoot = seFoot,
  ArchRProj = proj, 
  normMethod = "None",
  plotName = "Footprints-No-Normalization",
  addDOC = FALSE,
  smoothWindow = 5
)
plotFootprints(
  seFoot = seFoot,
  ArchRProj = proj_treg, 
  normMethod = "None",
  plotName = "Footprints-No-Normalization_Treg",
  addDOC = FALSE,
  smoothWindow = 5
)

7 Co-Accessibility Analysis

# Co-accessibility is a correlation in accessibility between two peaks 
# across many single cells. Calculate co-accessibility
proj = addCoAccessibility(
    ArchRProj = proj,
    reducedDims = "IterativeLSI"
)

# Retrieve co-accessibility information via the getCoAccessibility() function
# Increase resolution to plow fewer co-accessibility interactions
# if returnLoops = FALSE, then the function will return a dataframe instead of 
# a loop track
cA = getCoAccessibility(
    ArchRProj = proj,
    corCutOff = 0.5,
    resolution = 1000, 
    returnLoops = TRUE 
)

# Plot browser tracks of co-accessibility for our marker genes
markerGenes = c("Cd4", "Il2ra", "Foxp3", "Batf", "Ccr8", "Ikzf2", "Rorc", "Tbx21", "Ifng", "Gata3", "Il2", "Sell")

p_cA = plotBrowserTrack(
    ArchRProj = proj, 
    groupBy = "Clusters", 
    geneSymbol = markerGenes, 
    upstream = 50000,
    downstream = 50000,
    loops = getCoAccessibility(proj)
)
## GRanges object with 12 ranges and 2 metadata columns:
##        seqnames              ranges strand |     gene_id      symbol
##           <Rle>           <IRanges>  <Rle> | <character> <character>
##    [1]     chr6 124864693-124888209      - |       12504         Cd4
##    [2]     chr2   11642792-11693194      + |       16184       Il2ra
##    [3]     chrX     7579676-7595243      + |       20371       Foxp3
##    [4]    chr12   85686720-85709087      + |       53314        Batf
##    [5]     chr9 120092133-120094906      + |       12776        Ccr8
##    ...      ...                 ...    ... .         ...         ...
##    [8]    chr11   97098007-97115331      - |       57765       Tbx21
##    [9]    chr10 118441046-118445894      + |       15978        Ifng
##   [10]     chr2     9857078-9878600      - |       14462       Gata3
##   [11]     chr3   37120713-37125954      - |       16183         Il2
##   [12]     chr1 164062076-164080785      + |       20343        Sell
##   -------
##   seqinfo: 21 sequences from mm10 genome
# To plot the browser track select a specific marker gene using the $ accessor
grid::grid.newpage()
grid::grid.draw(p_cA$Batf)

8 Trajectory Analysis

# Create user-defined trajectory backbone from early via late progenitors to
# tissue Treg cells in the respective non-lymphoid tissue
Treg_trajectory_VAT = c("C19", "C18", "C16", "C8")
Treg_trajectory_skin = c("C19", "C18", "C16", "C9")
Treg_trajectory_colon = c("C19", "C18", "C16", "C6")

# Create the trajectory
# This creates a new column in cellColData that stores the pseudo-time value for 
# each cell in the trajectory
proj = addTrajectory(
    ArchRProj = proj, 
    name = "Treg_trajectory_VAT", 
    groupBy = "Clusters",
    trajectory = Treg_trajectory_VAT, 
    embedding = "UMAP", 
    force = TRUE
)

proj = addTrajectory(
    ArchRProj = proj, 
    name = "Treg_trajectory_skin", 
    groupBy = "Clusters",
    trajectory = Treg_trajectory_skin, 
    embedding = "UMAP", 
    force = TRUE
)

proj = addTrajectory(
    ArchRProj = proj, 
    name = "Treg_trajectory_colon", 
    groupBy = "Clusters",
    trajectory = Treg_trajectory_colon, 
    embedding = "UMAP", 
    force = TRUE
)

# Exclude cells with NA values because these are not part of the trajectory
proj$Treg_trajectory_VAT[!is.na(proj$Treg_trajectory_VAT)]
proj$Treg_trajectory_skin[!is.na(proj$Treg_trajectory_skin)]
proj$Treg_trajectory_colon[!is.na(proj$Treg_trajectory_colon)]

# Save ArchRProject
saveArchRProject(proj, 
  outputDirectory= "ArchRProject", 
  overwrite = TRUE,  
  load = FALSE
  )
# Plot trajectory
# This overlays the pseudo-time values on our UMAP embedding and displays an 
# arrow approximating the trajectory path from the spline-fit; cells that are 
# not part of the trajectory are coloured in gray.
Treg_trajectory_VAT_p = plotTrajectory(proj, 
    trajectory = "Treg_trajectory_VAT", 
    colorBy = "cellColData", 
    name = "Treg_trajectory_VAT",
    plotAs = "points"
)

Treg_trajectory_skin_p = plotTrajectory(proj, 
    trajectory = "Treg_trajectory_skin", 
    colorBy = "cellColData", 
    name = "Treg_trajectory_skin",
    plotAs = "points"
)

Treg_trajectory_colon_p = plotTrajectory(proj, 
    trajectory = "Treg_trajectory_colon", 
    colorBy = "cellColData", 
    name = "Treg_trajectory_colon",
    plotAs = "points"
)

ggAlignPlots(Treg_trajectory_VAT_p[[1]], Treg_trajectory_skin_p[[1]], Treg_trajectory_colon_p[[1]], type = "h")

8.1 Pseudo-Time UMAPs

# Overlay other features on the trajectory within our UMAP embedding
# (display specific features only within the cells that are relevant to our 
# trajectory)

# Overlay genescores on Treg trajectory
# Batf
Treg_traj_VAT_p_Batf = plotTrajectory(proj, 
  trajectory = "Treg_trajectory_VAT", 
  colorBy = "GeneScoreMatrix", 
  name = "Batf", 
  continuousSet = "horizonExtra",
  plotAs = "points"
)

Treg_traj_skin_p_Batf = plotTrajectory(proj, 
  trajectory = "Treg_trajectory_skin", 
  colorBy = "GeneScoreMatrix", 
  name = "Batf", 
  continuousSet = "horizonExtra",
  plotAs = "points"
)

Treg_traj_colon_p_Batf = plotTrajectory(proj, 
  trajectory = "Treg_trajectory_colon", 
  colorBy = "GeneScoreMatrix", 
  name = "Batf", 
  continuousSet = "horizonExtra",
  plotAs = "points"
)

# Overlay motif scores on Treg trajectory
# Batf
Treg_traj_VAT_p_Batf_motif = plotTrajectory(proj, 
  trajectory = "Treg_trajectory_VAT", 
  colorBy = "MotifMatrix", 
  name = "z:Batf_790", 
  continuousSet = "solarExtra",
  plotAs = "points"
)

Treg_traj_skin_p_Batf_motif = plotTrajectory(proj, 
  trajectory = "Treg_trajectory_skin", 
  colorBy = "MotifMatrix", 
  name = "z:Batf_790", 
  continuousSet = "solarExtra",
  plotAs = "points"
)

Treg_traj_colon_p_Batf_motif = plotTrajectory(proj, 
  trajectory = "Treg_trajectory_colon", 
  colorBy = "MotifMatrix", 
  name = "z:Batf_790", 
  continuousSet = "solarExtra",
  plotAs = "points"
)

# Plot
ggAlignPlots(Treg_trajectory_VAT_p[[1]], Treg_traj_VAT_p_Batf[[2]], type = "h", sizes = c(1,2))

ggAlignPlots(Treg_trajectory_skin_p[[1]], Treg_traj_skin_p_Batf[[2]], type = "h", sizes = c(1,2))

ggAlignPlots(Treg_trajectory_colon_p[[1]], Treg_traj_colon_p_Batf[[2]], type = "h", sizes = c(1,2))

ggAlignPlots(Treg_trajectory_VAT_p[[1]], Treg_traj_VAT_p_Batf_motif[[2]], type = "h", sizes = c(1,2))

ggAlignPlots(Treg_trajectory_skin_p[[1]], Treg_traj_skin_p_Batf_motif[[2]], type = "h", sizes = c(1,2))

ggAlignPlots(Treg_trajectory_colon_p[[1]], Treg_traj_colon_p_Batf_motif[[2]], type = "h", sizes = c(1,2))