title: “TF Perturbation Analysis - Array vs Screen Comparison” author: “Xiangrui Kong” date: “十一月 27, 2025” output: html_document: toc: true toc_depth: 3 fig_width: 8 fig_height: 6 pdf_document: toc: true toc_depth: 3 fig_width: 8 fig_height: 6 —

screen array similarity Analysis 1. DEG similarity

# Set working directory
setwd("E:/ion/linux/perturb seq/TF_astrocyte/Version3_after_Sci_review/code for science/2.array_similarity")

# Clean environment
rm(list = ls())

# Define TF of interest
tf <- "tf31"

# Load array data (scRNA-seq DEGs)
dir <- getwd()
array <- read.csv(paste0(dir, "/array.DEGs.mouse_", tf, ".csv"), row.names = 1)
array$gene <- rownames(array)

# Load screen data (qPCR validation)
screen <- read.table(paste0(dir, "/screen.DEGs.mouse_", tf, ".txt"), header = TRUE)
screen$gene <- rownames(screen)

# Merge array and screen data
all <- merge(array, screen, by = "gene")

# Rename columns for clarity
colnames(all)[c(3, 8)] <- c("array_lfc", "screen_lfc")

# Calculate correlation
correlation <- cor(all$array_lfc, all$screen_lfc)
print(paste("Overall correlation:", round(correlation, 4)))
## [1] "Overall correlation: 0.469"
# Filter for significant DEGs from screen data (p < 0.05)
plot_data <- all[all$p_val.y < 0.05 & !is.na(all$gene), ]
plot_data <- plot_data[order(plot_data$screen_lfc), ]

# Calculate correlation for significant genes
cor_sig <- cor(plot_data$array_lfc, plot_data$screen_lfc)

# Create heatmap
library(pheatmap)
pheatmap(as.matrix(plot_data[, c("array_lfc", "screen_lfc")]),
         border_color = NA,
         color = c(colorRampPalette(c("navy", "floralwhite"))(500),
                   colorRampPalette(c("floralwhite", "#E2485F"))(500)),
         breaks = c(seq(-1, 0, length.out = 500),
                    seq(0.01, 1, length.out = 500)),
         show_rownames = FALSE,
         cluster_cols = FALSE,
         cluster_rows = TRUE,
         main = paste0("TF ", tf, " - Pearson's r: ", signif(cor_sig, 4)),
         cellwidth = 20)

2. pathway similarity

rm(list = ls())

# Load required libraries
library(dplyr)
library(ggplot2)
library(ggpubr)      # For statistical annotations
library(ggalt)       # For dumbbell plots
library(reshape2)    # For data reshaping

2.1 Load Array Data (scRNA-seq Pathway Analysis)

# Define directory for array data
dir <- getwd()
i="tf31"

  array_up <- read.table(
    paste0(dir, "/array.mouse_", i, ".up_go_gprofiler.txt"),
    sep = '\t', 
    header = TRUE
  )
  array_up$perturb <- i
  array_up$sample <- "array"
  array_up$value <- -log10(array_up$p_value)  # Transform p-values
  
  # Load down-regulated pathways
  array_down <- read.table(
    paste0(dir, "/array.mouse_", i, ".down_go_gprofiler.txt"),
    sep = '\t', 
    header = TRUE
  )
  array_down$perturb <- i
  array_down$sample <- "array"
  array_down$value <- log10(array_down$p_value)  # Signed transformation



  screen_up<- read.table(
    paste0(dir, "/screen.mouse_", i, ".up_go_gprofiler2.txt"),
    sep = '\t', 
    header = TRUE
  )
  screen_up$perturb <- i
  screen_up$sample <- "screen"
  screen_up$value <- -log10(screen_up$p_value)  # Transform p-values
  
  # Load down-regulated pathways
  screen_down <- read.table(
    paste0(dir, "/screen.mouse_", i, ".down_go_gprofiler2.txt"),
    sep = '\t', 
    header = TRUE
  )
  screen_down$perturb <- i
  screen_down$sample <- "screen"
  screen_down$value <- log10(screen_down$p_value)  # Signed transformation

2.2. Define Visualization Functions

#' Create Scatter Plot for Pathway Comparison
#' 
#' @param data Data frame containing pathway data
#' @param x_var Variable for x-axis (default: "array")
#' @param y_var Variable for y-axis (default: "screen")
#' @param color_var Variable for point coloring (default: "cat")
#' @param title Plot title
#' @param x_lab X-axis label
#' @param y_lab Y-axis label
#' @return ggplot object

create_scatter_plot <- function(data, 
                                x_var = "array", 
                                y_var = "screen", 
                                color_var = "cat",
                                title = "",
                                x_lab = "signed log10-PValue of GO terms enriched in array",
                                y_lab = "signed log10-PValue of GO terms enriched in screen") {
  
  ggplot(data, aes(x = .data[[x_var]], y = .data[[y_var]], color = .data[[color_var]])) +
    geom_point(alpha = 0.5, size = 3) +
    scale_color_manual(
      values = c(
        "overlaping pathway" = "navy",
        "unique pathway in screen" = "gray",
        "unique pathway in array" = "gray"
      )
    ) +
    geom_smooth(
      method = "lm", 
      se = TRUE, 
      color = "blue"
    ) +
    stat_cor(
      method = "pearson",
      label.x = 1,
      label.y = -5,
      color = "blue"
    ) +
    geom_hline(yintercept = 0, color = "black") +
    geom_vline(xintercept = 0, color = "black") +
    labs(
      title = title,
      x = x_lab,
      y = y_lab
    ) +
    theme_bw()
}

#' Create Dumbbell Plot for Pathway Comparison
#' 
#' @param data Data frame containing pathway data
#' @param x_var Variable for left points (default: "screen")
#' @param xend_var Variable for right points (default: "array")
#' @param y_var Variable for y-axis (default: "term_name")
#' @param colour_x Color for left points
#' @param colour_xend Color for right points
#' @param size_x Size of left points
#' @param size_xend Size of right points
#' @param line_size Size of connecting lines
#' @param line_color Color of connecting lines
#' @param x_lab X-axis label
#' @param title Plot title
#' @param legend_title Legend title
#' @param legend_labels Legend labels
#' @param reverse_y Whether to reverse y-axis
#' @return ggplot object

create_dumbbell_plot <- function(data, 
                                 x_var = "screen", 
                                 xend_var = "array", 
                                 y_var = "term_name",
                                 colour_x = "darkred",
                                 colour_xend = "#E2485F",
                                 size_x = 4,
                                 size_xend = 3,
                                 line_size = 0.5,
                                 line_color = "gray",
                                 x_lab = "log10-transformed PValue of GO terms",
                                 title = "",
                                 legend_title = "Sample",
                                 legend_labels = c("Screen", "Array"),
                                 reverse_y = TRUE) {
  
  p <- ggplot(data, aes(x = .data[[x_var]], 
                        xend = .data[[xend_var]], 
                        y = .data[[y_var]])) +
    geom_dumbbell(colour_x = colour_x,
                  colour_xend = colour_xend,
                  size_x = size_x,
                  size_xend = size_xend,
                  size = line_size,
                  color = line_color) +
    # Add invisible points for legend
    geom_point(
      aes(x = -Inf, y = Inf, color = legend_labels[1]), 
      size = 0, alpha = 0) +
    geom_point(
      aes(x = -Inf, y = Inf, color = legend_labels[2]),  
      size = 0, alpha = 0) +
    scale_color_manual(
      name = legend_title,
      values = setNames(c(colour_x, colour_xend), legend_labels)) +
    theme_bw() + 
    xlab(x_lab) +
    labs(title = title) +
    guides(color = guide_legend(override.aes = list(size = 3, alpha = 1))) +
    theme(aspect.ratio = 4)
  
  # Reverse y-axis if requested
  if (reverse_y) {
    p <- p + ylim(rev(data[[y_var]]))
  }
  
  return(p)
}

2.3 GO Pathway Scatter Plots

# Generate scatter plots for each TF

  array_GO <- rbind(array_up, array_down)
  screen_GO <- rbind(screen_up, screen_down)
  
  # Prepare data for comparison
  ALL <- rbind(
    array_GO[, c("source", "term_name", "sample", "value")],
    screen_GO[, c("source", "term_name", "sample", "value")]
  )
  
  # Filter for GO terms only
  ALL <- ALL[grepl("GO", ALL$source), ]
  ALL$term <- paste0(ALL$source, ";", ALL$term_name)
  
  # Reshape to wide format
  res1_ <- reshape2::dcast(ALL, term ~ sample, value.var = "value", fun.aggregate = sum)
  res1_ <- as.data.frame(res1_)
  rownames(res1_) <- res1_$term
  res1_[is.na(res1_)] <- 0
  
  # Categorize pathways
  res1_[,"cat"] <- "overlaping pathway"
  res1_[res1_$array == 0, "cat"] <- "unique pathway in screen"
  res1_[res1_$screen == 0, "cat"] <- "unique pathway in array"
  

  
  # Plot 1: All significant pathways
  p1 <- create_scatter_plot(res1_, title = paste("TF", i, "- All significant GO pathways from array and screen"))
  print(p1)

  # Plot 2: Overlapping pathways only
  res1_overlap <- subset(res1_, cat == "overlaping pathway")
  p2 <- create_scatter_plot(res1_overlap, title = paste("TF", i, "- Overlapped significant GO pathways from array and screen"))
  print(p2)

  # Print summary statistics
  cat("TF:", i, "\n")
## TF: tf31
  cat("Total pathways:", nrow(res1_), "\n")
## Total pathways: 1034
  cat("Overlapping pathways:", sum(res1_$cat == "overlaping pathway"), "\n")
## Overlapping pathways: 449
  cat("Array unique pathways:", sum(res1_$cat == "unique pathway in array"), "\n")
## Array unique pathways: 413
  cat("Screen unique pathways:", sum(res1_$cat == "unique pathway in screen"), "\n")
## Screen unique pathways: 172
  cat("---\n")
## ---

2.4 GO Pathway Dumbbell Plots

# Generate dumbbell plots for each TF

  
  # Up-regulated pathways
  sc_up <- screen_up[screen_up$p_value < 0.05, ]
  ar_up <- array_up
  
  # Transform p-values
  sc_up$value1 <- -log10(sc_up$p_value)
  ar_up$value1 <- -log10(ar_up$p_value)
  
  # Prepare terms
  sc_up$term <- paste0(sc_up$source, ":", sc_up$term_name)
  sc_up <- sc_up[order(sc_up$p_value), ]
  top_terms <- sc_up$term[1:30]  # Top 30 terms
  
  ar_up$term <- paste0(ar_up$source, ":", ar_up$term_name)
  
  # Merge data
  dat1 <- ar_up[ar_up$term %in% top_terms, ] 
  dat2 <- sc_up[sc_up$term %in% top_terms, ] 
  toplot_up <- full_join(dat1[, c("term", "value1")], 
                         dat2[, c("term", "value1")], 
                         by = "term")
  colnames(toplot_up) <- c("term_name", "array", "screen")
  toplot_up[is.na(toplot_up)] <- 0
  toplot_up <- toplot_up[order(toplot_up$screen, decreasing = TRUE), ]
  toplot_up$term_name <- factor(toplot_up$term_name, levels = toplot_up$term_name)
  
  # Create up-regulated dumbbell plot
  p1 <- create_dumbbell_plot(
    data = toplot_up,
    colour_x = "darkred",
    colour_xend = "#E2485F",
    title = paste("TF", i, "- Top 30 Up-regulated GO terms in Screen")
  )
  print(p1)

  # Down-regulated pathways
  sc_down <- screen_down[screen_down$p_value < 0.05, ]
  ar_down <- array_down
  
  # Transform p-values
  sc_down$value1 <- log10(sc_down$p_value)
  ar_down$value1 <- log10(ar_down$p_value)
  
  # Prepare terms
  sc_down$term <- paste0(sc_down$source, ":", sc_down$term_name)
  sc_down <- sc_down[order(sc_down$p_value), ]
  top_terms <- sc_down$term[1:30]  # Top 30 terms
  
  ar_down$term <- paste0(ar_down$source, ":", ar_down$term_name)
  
  # Merge data
  dat1 <- ar_down[ar_down$term %in% top_terms, ] 
  dat2 <- sc_down[sc_down$term %in% top_terms, ] 
  toplot_down <- full_join(dat1[, c("term", "value1")], 
                           dat2[, c("term", "value1")], 
                           by = "term")
  colnames(toplot_down) <- c("term_name", "array", "screen")
  toplot_down[is.na(toplot_down)] <- 0
  toplot_down <- toplot_down[order(toplot_down$screen, decreasing = TRUE), ]
  toplot_down$term_name <- factor(toplot_down$term_name, levels = toplot_down$term_name)
  
  # Create down-regulated dumbbell plot
  p2 <- create_dumbbell_plot(
    data = toplot_down,
    colour_x = "navy",
    colour_xend = "#4169E1",
    title = paste("TF", i, "- Top 30 Down-regulated GO terms in Screen")
  )
  print(p2)

  1. Session Information
sessionInfo()
## R version 4.5.1 (2025-06-13 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26100)
## 
## Matrix products: default
##   LAPACK version 3.12.1
## 
## locale:
## [1] LC_COLLATE=Chinese (Simplified)_China.utf8 
## [2] LC_CTYPE=Chinese (Simplified)_China.utf8   
## [3] LC_MONETARY=Chinese (Simplified)_China.utf8
## [4] LC_NUMERIC=C                               
## [5] LC_TIME=Chinese (Simplified)_China.utf8    
## 
## time zone: Asia/Shanghai
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] reshape2_1.4.4  ggalt_0.4.0     ggpubr_0.6.1    ggplot2_3.5.2  
## [5] dplyr_1.1.4     pheatmap_1.0.12
## 
## loaded via a namespace (and not attached):
##  [1] ash_1.0-15         sass_0.4.8         generics_0.1.4     tidyr_1.3.0       
##  [5] rstatix_0.7.2      proj4_1.0-14       KernSmooth_2.23-22 lattice_0.22-5    
##  [9] stringi_1.8.3      extrafontdb_1.0    digest_0.6.34      magrittr_2.0.3    
## [13] evaluate_0.23      grid_4.5.1         RColorBrewer_1.1-3 fastmap_1.1.1     
## [17] maps_3.4.2         Matrix_1.6-4       plyr_1.8.9         jsonlite_1.8.8    
## [21] backports_1.4.1    Formula_1.2-5      mgcv_1.9-1         purrr_1.0.2       
## [25] scales_1.4.0       jquerylib_0.1.4    abind_1.4-5        cli_3.6.5         
## [29] rlang_1.1.6        splines_4.5.1      withr_3.0.2        cachem_1.0.8      
## [33] tools_4.5.1        ggsignif_0.6.4     broom_1.0.5        vctrs_0.6.5       
## [37] R6_2.6.1           lifecycle_1.0.4    stringr_1.5.1      car_3.1-3         
## [41] MASS_7.3-60        pkgconfig_2.0.3    pillar_1.11.0      bslib_0.6.1       
## [45] gtable_0.3.6       Rcpp_1.0.12        glue_1.8.0         xfun_0.41         
## [49] tibble_3.3.0       tidyselect_1.2.1   highr_0.10         rstudioapi_0.15.0 
## [53] knitr_1.45         farver_2.1.2       extrafont_0.19     nlme_3.1-164      
## [57] htmltools_0.5.7    labeling_0.4.3     rmarkdown_2.25     carData_3.0-5     
## [61] Rttf2pt1_1.3.12    compiler_4.5.1