
---
title: "Microglia Signatures Analysis"

---

# Introduction

This document details the analysis comparing differentially expressed genes (DEGs) from a bulk RNA-seq experiment in microglia against a public database of microglia gene signatures. The analysis uses Fisher's Exact Test for overlap significance and Gene Set Enrichment Analysis (GSEA) for rank-based enrichment.

# 1. Setup and Installation

This chunk installs and loads all the necessary R packages for the analysis.

```{r setup, message=FALSE, warning=FALSE}
# install.packages(c("tidyverse", "readxl", "VennDiagram", "data.table", "pheatmap"))
# if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager")
# BiocManager::install("fgsea")

library(tidyverse)
library(data.table)
library(readxl)
library(fgsea)
library(pheatmap)
2. Load and Prepare Data
Load the DEG table and the public microglia signatures. Then, filter for up- and down-regulated genes and format the public signatures into a named list.
code
{r load-data}
# Load your DEG table (replace with relative path)
deg <- read_csv("data/Table 1_DEG Bulk RNAseq Microglia.csv")

# Filter up and down regulated genes (adjust thresholds if needed)
up_genes <- deg %>%
  filter(log2FoldChange > 1, padj < 0.05) %>%
  pull(Gene)

down_genes <- deg %>%
  filter(log2FoldChange < -1, padj < 0.05) %>%
  pull(Gene)

# Load public signature database (replace with relative path)
humi <- read_csv("data/Table Microglia signatures HuMiCa.csv")

# Create a named list of gene sets (Population -> genes)
gene_sets <- humi %>%
  group_by(Population) %>%
  summarise(genes = list(unique(Gene))) %>%
  deframe()
3. Enrichment Analysis using Fisher's Exact Test
This section calculates the statistical significance of the overlap between the DEGs and each public signature using Fisher's Exact Test, including the Odds Ratio.
code
{r fisher-test}
# Define total number of genes for background
total_genes <- 20000  # An estimate for the human transcriptome

# Function for Fisher's test that returns p-value and odds ratio
run_fisher <- function(my_genes, public_genes, total = total_genes) {
  a <- length(intersect(my_genes, public_genes))
  b <- length(my_genes) - a
  c <- length(public_genes) - a
  d <- total - (a + b + c)
  ft <- fisher.test(matrix(c(a, b, c, d), nrow = 2), alternative = "greater")
  list(p = ft$p.value, odds = ft$estimate[])
}

# Run for up-regulated genes
fisher_up <- map_dfr(names(gene_sets), function(pop) {
  result <- run_fisher(up_genes, gene_sets[[pop]])
  tibble(Population = pop, PValue = result$p, OddsRatio = result$odds, Direction = "Up")
})

# Run for down-regulated genes
fisher_down <- map_dfr(names(gene_sets), function(pop) {
  result <- run_fisher(down_genes, gene_sets[[pop]])
  tibble(Population = pop, PValue = result$p, OddsRatio = result$odds, Direction = "Down")
})

# Combine results, adjust p-values, and add metadata
fisher_all <- bind_rows(fisher_up, fisher_down) %>%
  mutate(FDR = p.adjust(PValue, method = "fdr")) %>%
  rowwise() %>%
  mutate(nGenes = length(intersect(
    if (Direction == "Up") up_genes else down_genes,
    gene_sets[[Population]]
  ))) %>%
  ungroup()

# Add Study info from HuMiCa metadata
study_mapping <- humi %>%
  distinct(Population, Study)
fisher_all <- fisher_all %>%
  left_join(study_mapping, by = "Population") %>%
  arrange(FDR)

head(fisher_all)
4. Visualize Fisher's Test Results
Create a lollipop plot to visualize the significantly enriched signatures, faceted by study.
code
{r visualize-fisher}
# Lollipop Plot faceted by Study
p <- fisher_all %>%
  filter(FDR < 0.05) %>%
  ggplot(aes(x = reorder(Population, -log10(FDR)),
             y = -log10(FDR),
             color = OddsRatio,
             size = nGenes)) +
  geom_segment(aes(xend = Population, y = 0, yend = -log10(FDR)), color = "gray70") +
  geom_point(alpha = 0.8) +
  coord_flip() +
  facet_grid(Study ~ ., scales = "free_y", space = "free_y") +
  scale_color_gradientn(colors = c("skyblue", "orange", "darkred"),
                        trans = "log",
                        name = "Odds Ratio") +
  scale_size_continuous(name = "Overlap (# genes)") +
  labs(title = "Signatures Enriched in DEGs",
       x = "Signature", y = "-log10(FDR)") +
  theme_minimal(base_size = 13) +
  theme(strip.text.y = element_text(angle = 0, hjust = 0),
        panel.spacing.y = unit(1, "lines"))

print(p)

# Export the plot
ggsave("output/fisher_signatures_plot.png", plot = p, width = 8, height = 10, dpi = 600)
5. Visualize Overlapping Genes
These visualizations show which specific genes overlap between your DEG list and the public signatures.
5.1 Heatmap of Top 50 Overlapping Genes
code
{r heatmap-overlap}
# Get top 50 most frequent overlapping genes
deg_genes <- deg %>% filter(padj < 0.05) %>% pull(Gene)
overlap_df <- humi %>% filter(Gene %in% deg_genes)

top_genes <- overlap_df %>%
  count(Gene, sort = TRUE) %>%
  slice_head(n = 50) %>%
  pull(Gene)

# Create a binary presence/absence matrix
binary_mat <- overlap_df %>%
  filter(Gene %in% top_genes) %>%
  mutate(Pop_ID = paste(Study, Population, sep = " | ")) %>%
  distinct(Gene, Pop_ID) %>%
  mutate(present = 1) %>%
  pivot_wider(names_from = Pop_ID, values_from = present, values_fill = 0) %>%
  column_to_rownames("Gene")

# Plot the heatmap
heatmap_plot <- pheatmap(binary_mat,
         cluster_rows = TRUE,
         cluster_cols = TRUE,
         color = c("white", "black"),
         legend = FALSE,
         main = "Presence of Top 50 Overlapping Genes")

# Save the heatmap
ggsave("output/fisher_heatmap50_plot.png", plot = heatmap_plot, width = 12, height = 10, dpi = 600)
5.2 Heatmap with Smaller Fonts
A variation of the heatmap with adjusted font sizes for compactness.
code
{r heatmap-small-font}
heatmap_sfs <-pheatmap(binary_mat,
         cluster_rows = TRUE,
         cluster_cols = TRUE,
         color = c("white", "black"),
         legend = FALSE,
         main = "Top 50 Overlapping Genes",
         fontsize = 9,
         fontsize_row = 7,
         fontsize_col = 7)

# Export as PNG
ggsave("output/fisher_heatmap_sfs_plot.png", plot = heatmap_sfs, width = 10, height = 12, dpi = 600)
5.3 Dot Plot of Overlapping Genes
An alternative visualization to the heatmap, showing gene presence with dots.
code
{r dotplot-overlap}
overlap_df %>%
  filter(Gene %in% top_genes) %>%
  mutate(Pop_ID = paste(Study, Population, sep = " | ")) %>%
  ggplot(aes(x = Pop_ID, y = Gene)) +
  geom_point(size = 3) +
  coord_flip() +
  labs(title = "Presence of Top 50 Overlapping Genes",
       x = "Signature (Study | Population)", y = "Gene") +
  theme_minimal(base_size = 12) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))```

### 5.4 Heatmap of Upregulated Genes Only

This analysis is repeated, focusing only on the overlap with your **upregulated** genes.

```{r heatmap-upregulated-only}
# Filter the public signature table with upregulated genes only
overlap_df_up <- humi %>%
  filter(Gene %in% up_genes)

top_up_genes <- overlap_df_up %>%
  count(Gene, sort = TRUE) %>%
  slice_head(n = 100) %>%
  pull(Gene)

# Create binary presence/absence matrix for upregulated genes
binary_mat_up <- overlap_df_up %>%
  filter(Gene %in% top_up_genes) %>%
  mutate(Pop_ID = paste(Study, Population, sep = " | ")) %>%
  distinct(Gene, Pop_ID) %>%
  mutate(present = 1) %>%
  pivot_wider(names_from = Pop_ID, values_from = present, values_fill = 0) %>%
  column_to_rownames("Gene")

# Plot binary heatmap
heatmapUP <- pheatmap(binary_mat_up,
         cluster_rows = TRUE,
         cluster_cols = TRUE,
         color = c("white", "black"),
         legend = FALSE,
         main = "Presence of Top 100 Upregulated Overlapping Genes")

# **BUG FIX**: The original code saved the old `heatmap` object.
# The line below correctly saves the new `heatmapUP` object.
ggsave("output/fisher_heatmapUP_plot.png", plot = heatmapUP, width = 12, height = 14, dpi = 600)
6. Gene Set Enrichment Analysis (GSEA)
This section performs GSEA using a ranked list of all genes from the DEG analysis.
code
{r gsea-analysis}
# Load public signatures with Fold Change data (replace with relative path)
humi_fc <- read_csv("data/Table Microglia signatures HuMiCa_FC.csv")

# Create a named list of gene sets for GSEA
gene_sets_gsea <- humi_fc %>%
  group_by(cluster) %>%
  summarise(genes = list(unique(Gene))) %>%
  deframe()

# Create a full ranked list of genes for GSEA
ranked_genes <- deg %>%
  filter(!is.na(log2FoldChange)) %>%
  arrange(desc(log2FoldChange)) %>%
  distinct(Gene, .keep_all = TRUE) %>%
  select(Gene, log2FoldChange) %>%
  deframe()

# Run GSEA
gsea_res <- fgsea(
  pathways = gene_sets_gsea,
  stats = ranked_genes,
  nperm = 10000
) %>%
  as_tibble() %>%
  arrange(padj)

head(gsea_res)
7. Visualize GSEA Results
Create a dot plot to visualize the top GSEA results, showing the Normalized Enrichment Score (NES) and significance.
code
{r visualize-gsea}
# GSEA dot plot for top 25 signatures
gsea_plot <- gsea_res %>%
  filter(padj < 0.05) %>%
  slice_max(order_by = abs(NES), n = 25) %>%
  ggplot(aes(x = NES, y = reorder(pathway, NES),
             size = -log10(padj),
             color = NES)) +
  geom_point(alpha = 0.8) +
  scale_color_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0, name = "NES") +
  scale_size_continuous(name = "-log10(FDR)") +
  labs(title = "GSEA of Ranked Genes Against Microglia Signatures",
       x = "Normalized Enrichment Score (NES)", y = "Signature") +
  theme_minimal(base_size = 13)

print(gsea_plot)

# Save the plot
ggsave("output/gsea_results_plot.png", plot = gsea_plot, width = 8, height = 8, dpi = 600)