title: “TF Perturbation Analysis - Array vs Screen Comparison” author: “Xiangrui Kong” date: “十一月 27, 2025” output: html_document: toc: true toc_depth: 3 fig_width: 8 fig_height: 6 pdf_document: toc: true toc_depth: 3 fig_width: 8 fig_height: 6 —
screen array similarity Analysis 1. DEG similarity
# Set working directory
setwd("E:/ion/linux/perturb seq/TF_astrocyte/Version3_after_Sci_review/code for science/2.array_similarity")
# Clean environment
rm(list = ls())
# Define TF of interest
tf <- "tf31"
# Load array data (scRNA-seq DEGs)
dir <- getwd()
array <- read.csv(paste0(dir, "/array.DEGs.mouse_", tf, ".csv"), row.names = 1)
array$gene <- rownames(array)
# Load screen data (qPCR validation)
screen <- read.table(paste0(dir, "/screen.DEGs.mouse_", tf, ".txt"), header = TRUE)
screen$gene <- rownames(screen)
# Merge array and screen data
all <- merge(array, screen, by = "gene")
# Rename columns for clarity
colnames(all)[c(3, 8)] <- c("array_lfc", "screen_lfc")
# Calculate correlation
correlation <- cor(all$array_lfc, all$screen_lfc)
print(paste("Overall correlation:", round(correlation, 4)))
## [1] "Overall correlation: 0.469"
# Filter for significant DEGs from screen data (p < 0.05)
plot_data <- all[all$p_val.y < 0.05 & !is.na(all$gene), ]
plot_data <- plot_data[order(plot_data$screen_lfc), ]
# Calculate correlation for significant genes
cor_sig <- cor(plot_data$array_lfc, plot_data$screen_lfc)
# Create heatmap
library(pheatmap)
pheatmap(as.matrix(plot_data[, c("array_lfc", "screen_lfc")]),
border_color = NA,
color = c(colorRampPalette(c("navy", "floralwhite"))(500),
colorRampPalette(c("floralwhite", "#E2485F"))(500)),
breaks = c(seq(-1, 0, length.out = 500),
seq(0.01, 1, length.out = 500)),
show_rownames = FALSE,
cluster_cols = FALSE,
cluster_rows = TRUE,
main = paste0("TF ", tf, " - Pearson's r: ", signif(cor_sig, 4)),
cellwidth = 20)
2. pathway similarity
rm(list = ls())
# Load required libraries
library(dplyr)
library(ggplot2)
library(ggpubr) # For statistical annotations
library(ggalt) # For dumbbell plots
library(reshape2) # For data reshaping
2.1 Load Array Data (scRNA-seq Pathway Analysis)
# Define directory for array data
dir <- getwd()
i="tf31"
array_up <- read.table(
paste0(dir, "/array.mouse_", i, ".up_go_gprofiler.txt"),
sep = '\t',
header = TRUE
)
array_up$perturb <- i
array_up$sample <- "array"
array_up$value <- -log10(array_up$p_value) # Transform p-values
# Load down-regulated pathways
array_down <- read.table(
paste0(dir, "/array.mouse_", i, ".down_go_gprofiler.txt"),
sep = '\t',
header = TRUE
)
array_down$perturb <- i
array_down$sample <- "array"
array_down$value <- log10(array_down$p_value) # Signed transformation
screen_up<- read.table(
paste0(dir, "/screen.mouse_", i, ".up_go_gprofiler2.txt"),
sep = '\t',
header = TRUE
)
screen_up$perturb <- i
screen_up$sample <- "screen"
screen_up$value <- -log10(screen_up$p_value) # Transform p-values
# Load down-regulated pathways
screen_down <- read.table(
paste0(dir, "/screen.mouse_", i, ".down_go_gprofiler2.txt"),
sep = '\t',
header = TRUE
)
screen_down$perturb <- i
screen_down$sample <- "screen"
screen_down$value <- log10(screen_down$p_value) # Signed transformation
2.2. Define Visualization Functions
#' Create Scatter Plot for Pathway Comparison
#'
#' @param data Data frame containing pathway data
#' @param x_var Variable for x-axis (default: "array")
#' @param y_var Variable for y-axis (default: "screen")
#' @param color_var Variable for point coloring (default: "cat")
#' @param title Plot title
#' @param x_lab X-axis label
#' @param y_lab Y-axis label
#' @return ggplot object
create_scatter_plot <- function(data,
x_var = "array",
y_var = "screen",
color_var = "cat",
title = "",
x_lab = "signed log10-PValue of GO terms enriched in array",
y_lab = "signed log10-PValue of GO terms enriched in screen") {
ggplot(data, aes(x = .data[[x_var]], y = .data[[y_var]], color = .data[[color_var]])) +
geom_point(alpha = 0.5, size = 3) +
scale_color_manual(
values = c(
"overlaping pathway" = "navy",
"unique pathway in screen" = "gray",
"unique pathway in array" = "gray"
)
) +
geom_smooth(
method = "lm",
se = TRUE,
color = "blue"
) +
stat_cor(
method = "pearson",
label.x = 1,
label.y = -5,
color = "blue"
) +
geom_hline(yintercept = 0, color = "black") +
geom_vline(xintercept = 0, color = "black") +
labs(
title = title,
x = x_lab,
y = y_lab
) +
theme_bw()
}
#' Create Dumbbell Plot for Pathway Comparison
#'
#' @param data Data frame containing pathway data
#' @param x_var Variable for left points (default: "screen")
#' @param xend_var Variable for right points (default: "array")
#' @param y_var Variable for y-axis (default: "term_name")
#' @param colour_x Color for left points
#' @param colour_xend Color for right points
#' @param size_x Size of left points
#' @param size_xend Size of right points
#' @param line_size Size of connecting lines
#' @param line_color Color of connecting lines
#' @param x_lab X-axis label
#' @param title Plot title
#' @param legend_title Legend title
#' @param legend_labels Legend labels
#' @param reverse_y Whether to reverse y-axis
#' @return ggplot object
create_dumbbell_plot <- function(data,
x_var = "screen",
xend_var = "array",
y_var = "term_name",
colour_x = "darkred",
colour_xend = "#E2485F",
size_x = 4,
size_xend = 3,
line_size = 0.5,
line_color = "gray",
x_lab = "log10-transformed PValue of GO terms",
title = "",
legend_title = "Sample",
legend_labels = c("Screen", "Array"),
reverse_y = TRUE) {
p <- ggplot(data, aes(x = .data[[x_var]],
xend = .data[[xend_var]],
y = .data[[y_var]])) +
geom_dumbbell(colour_x = colour_x,
colour_xend = colour_xend,
size_x = size_x,
size_xend = size_xend,
size = line_size,
color = line_color) +
# Add invisible points for legend
geom_point(
aes(x = -Inf, y = Inf, color = legend_labels[1]),
size = 0, alpha = 0) +
geom_point(
aes(x = -Inf, y = Inf, color = legend_labels[2]),
size = 0, alpha = 0) +
scale_color_manual(
name = legend_title,
values = setNames(c(colour_x, colour_xend), legend_labels)) +
theme_bw() +
xlab(x_lab) +
labs(title = title) +
guides(color = guide_legend(override.aes = list(size = 3, alpha = 1))) +
theme(aspect.ratio = 4)
# Reverse y-axis if requested
if (reverse_y) {
p <- p + ylim(rev(data[[y_var]]))
}
return(p)
}
2.3 GO Pathway Scatter Plots
# Generate scatter plots for each TF
array_GO <- rbind(array_up, array_down)
screen_GO <- rbind(screen_up, screen_down)
# Prepare data for comparison
ALL <- rbind(
array_GO[, c("source", "term_name", "sample", "value")],
screen_GO[, c("source", "term_name", "sample", "value")]
)
# Filter for GO terms only
ALL <- ALL[grepl("GO", ALL$source), ]
ALL$term <- paste0(ALL$source, ";", ALL$term_name)
# Reshape to wide format
res1_ <- reshape2::dcast(ALL, term ~ sample, value.var = "value", fun.aggregate = sum)
res1_ <- as.data.frame(res1_)
rownames(res1_) <- res1_$term
res1_[is.na(res1_)] <- 0
# Categorize pathways
res1_[,"cat"] <- "overlaping pathway"
res1_[res1_$array == 0, "cat"] <- "unique pathway in screen"
res1_[res1_$screen == 0, "cat"] <- "unique pathway in array"
# Plot 1: All significant pathways
p1 <- create_scatter_plot(res1_, title = paste("TF", i, "- All significant GO pathways from array and screen"))
print(p1)
# Plot 2: Overlapping pathways only
res1_overlap <- subset(res1_, cat == "overlaping pathway")
p2 <- create_scatter_plot(res1_overlap, title = paste("TF", i, "- Overlapped significant GO pathways from array and screen"))
print(p2)
# Print summary statistics
cat("TF:", i, "\n")
## TF: tf31
cat("Total pathways:", nrow(res1_), "\n")
## Total pathways: 1034
cat("Overlapping pathways:", sum(res1_$cat == "overlaping pathway"), "\n")
## Overlapping pathways: 449
cat("Array unique pathways:", sum(res1_$cat == "unique pathway in array"), "\n")
## Array unique pathways: 413
cat("Screen unique pathways:", sum(res1_$cat == "unique pathway in screen"), "\n")
## Screen unique pathways: 172
cat("---\n")
## ---
2.4 GO Pathway Dumbbell Plots
# Generate dumbbell plots for each TF
# Up-regulated pathways
sc_up <- screen_up[screen_up$p_value < 0.05, ]
ar_up <- array_up
# Transform p-values
sc_up$value1 <- -log10(sc_up$p_value)
ar_up$value1 <- -log10(ar_up$p_value)
# Prepare terms
sc_up$term <- paste0(sc_up$source, ":", sc_up$term_name)
sc_up <- sc_up[order(sc_up$p_value), ]
top_terms <- sc_up$term[1:30] # Top 30 terms
ar_up$term <- paste0(ar_up$source, ":", ar_up$term_name)
# Merge data
dat1 <- ar_up[ar_up$term %in% top_terms, ]
dat2 <- sc_up[sc_up$term %in% top_terms, ]
toplot_up <- full_join(dat1[, c("term", "value1")],
dat2[, c("term", "value1")],
by = "term")
colnames(toplot_up) <- c("term_name", "array", "screen")
toplot_up[is.na(toplot_up)] <- 0
toplot_up <- toplot_up[order(toplot_up$screen, decreasing = TRUE), ]
toplot_up$term_name <- factor(toplot_up$term_name, levels = toplot_up$term_name)
# Create up-regulated dumbbell plot
p1 <- create_dumbbell_plot(
data = toplot_up,
colour_x = "darkred",
colour_xend = "#E2485F",
title = paste("TF", i, "- Top 30 Up-regulated GO terms in Screen")
)
print(p1)
# Down-regulated pathways
sc_down <- screen_down[screen_down$p_value < 0.05, ]
ar_down <- array_down
# Transform p-values
sc_down$value1 <- log10(sc_down$p_value)
ar_down$value1 <- log10(ar_down$p_value)
# Prepare terms
sc_down$term <- paste0(sc_down$source, ":", sc_down$term_name)
sc_down <- sc_down[order(sc_down$p_value), ]
top_terms <- sc_down$term[1:30] # Top 30 terms
ar_down$term <- paste0(ar_down$source, ":", ar_down$term_name)
# Merge data
dat1 <- ar_down[ar_down$term %in% top_terms, ]
dat2 <- sc_down[sc_down$term %in% top_terms, ]
toplot_down <- full_join(dat1[, c("term", "value1")],
dat2[, c("term", "value1")],
by = "term")
colnames(toplot_down) <- c("term_name", "array", "screen")
toplot_down[is.na(toplot_down)] <- 0
toplot_down <- toplot_down[order(toplot_down$screen, decreasing = TRUE), ]
toplot_down$term_name <- factor(toplot_down$term_name, levels = toplot_down$term_name)
# Create down-regulated dumbbell plot
p2 <- create_dumbbell_plot(
data = toplot_down,
colour_x = "navy",
colour_xend = "#4169E1",
title = paste("TF", i, "- Top 30 Down-regulated GO terms in Screen")
)
print(p2)
sessionInfo()
## R version 4.5.1 (2025-06-13 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26100)
##
## Matrix products: default
## LAPACK version 3.12.1
##
## locale:
## [1] LC_COLLATE=Chinese (Simplified)_China.utf8
## [2] LC_CTYPE=Chinese (Simplified)_China.utf8
## [3] LC_MONETARY=Chinese (Simplified)_China.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=Chinese (Simplified)_China.utf8
##
## time zone: Asia/Shanghai
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] reshape2_1.4.4 ggalt_0.4.0 ggpubr_0.6.1 ggplot2_3.5.2
## [5] dplyr_1.1.4 pheatmap_1.0.12
##
## loaded via a namespace (and not attached):
## [1] ash_1.0-15 sass_0.4.8 generics_0.1.4 tidyr_1.3.0
## [5] rstatix_0.7.2 proj4_1.0-14 KernSmooth_2.23-22 lattice_0.22-5
## [9] stringi_1.8.3 extrafontdb_1.0 digest_0.6.34 magrittr_2.0.3
## [13] evaluate_0.23 grid_4.5.1 RColorBrewer_1.1-3 fastmap_1.1.1
## [17] maps_3.4.2 Matrix_1.6-4 plyr_1.8.9 jsonlite_1.8.8
## [21] backports_1.4.1 Formula_1.2-5 mgcv_1.9-1 purrr_1.0.2
## [25] scales_1.4.0 jquerylib_0.1.4 abind_1.4-5 cli_3.6.5
## [29] rlang_1.1.6 splines_4.5.1 withr_3.0.2 cachem_1.0.8
## [33] tools_4.5.1 ggsignif_0.6.4 broom_1.0.5 vctrs_0.6.5
## [37] R6_2.6.1 lifecycle_1.0.4 stringr_1.5.1 car_3.1-3
## [41] MASS_7.3-60 pkgconfig_2.0.3 pillar_1.11.0 bslib_0.6.1
## [45] gtable_0.3.6 Rcpp_1.0.12 glue_1.8.0 xfun_0.41
## [49] tibble_3.3.0 tidyselect_1.2.1 highr_0.10 rstudioapi_0.15.0
## [53] knitr_1.45 farver_2.1.2 extrafont_0.19 nlme_3.1-164
## [57] htmltools_0.5.7 labeling_0.4.3 rmarkdown_2.25 carData_3.0-5
## [61] Rttf2pt1_1.3.12 compiler_4.5.1