Gene set enrichment based on differential abundance for proteomics on cultured neurons

Code is expected to be run in the following conda environment:
mamba env create -f for_publication/gsea.yaml
mamba activate gsea_for_publication

Rscript -e ‘library(rmarkdown); rmarkdown::render(“for_publication/test_gene_set_enrichment.Rmd”, “html_document”)’

NOTE: This script takes a while to run

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(optparse)
library(org.Hs.eg.db)
## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, saveRDS, setdiff,
##     table, tapply, union, unique, unsplit, which.max, which.min
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: IRanges
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## The following object is masked from 'package:utils':
## 
##     findMatches
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## 
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice
## 
## Attaching package: 'AnnotationDbi'
## The following object is masked from 'package:dplyr':
## 
##     select
## 
library(clusterProfiler)
## 
## clusterProfiler v4.14.0 Learn more at https://yulab-smu.top/contribution-knowledge-mining/
## 
## Please cite:
## 
## T Wu, E Hu, S Xu, M Chen, P Guo, Z Dai, T Feng, L Zhou, W Tang, L Zhan,
## X Fu, S Liu, X Bo, and G Yu. clusterProfiler 4.0: A universal
## enrichment tool for interpreting omics data. The Innovation. 2021,
## 2(3):100141
## 
## Attaching package: 'clusterProfiler'
## The following object is masked from 'package:AnnotationDbi':
## 
##     select
## The following object is masked from 'package:IRanges':
## 
##     slice
## The following object is masked from 'package:S4Vectors':
## 
##     rename
## The following object is masked from 'package:stats':
## 
##     filter
sessionInfo()
## R version 4.4.3 (2025-02-28)
## Platform: x86_64-conda-linux-gnu
## Running under: Ubuntu 22.04.4 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /home/sarah.nadeau/miniforge3/envs/gsea_for_publication/lib/libopenblasp-r0.3.30.so;  LAPACK version 3.12.0
## 
## locale:
##  [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
##  [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
##  [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
## [10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   
## 
## time zone: Europe/Rome
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] clusterProfiler_4.14.0 org.Hs.eg.db_3.20.0    AnnotationDbi_1.68.0  
##  [4] IRanges_2.40.0         S4Vectors_0.44.0       Biobase_2.66.0        
##  [7] BiocGenerics_0.52.0    optparse_1.7.5         dplyr_1.1.4           
## [10] rmarkdown_2.29        
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.2.1        farver_2.1.2            blob_1.2.4             
##  [4] R.utils_2.13.0          Biostrings_2.74.0       lazyeval_0.2.2         
##  [7] fastmap_1.2.0           digest_0.6.37           lifecycle_1.0.4        
## [10] KEGGREST_1.46.0         tidytree_0.4.6          RSQLite_2.4.2          
## [13] magrittr_2.0.3          compiler_4.4.3          rlang_1.1.6            
## [16] sass_0.4.10             tools_4.4.3             igraph_2.1.4           
## [19] ggtangle_0.0.7          data.table_1.17.8       knitr_1.50             
## [22] bit_4.6.0               gson_0.1.0              plyr_1.8.9             
## [25] RColorBrewer_1.1-3      aplot_0.2.8             BiocParallel_1.40.0    
## [28] purrr_1.1.0             R.oo_1.27.1             grid_4.4.3             
## [31] GOSemSim_2.32.0         enrichplot_1.26.1       colorspace_2.1-1       
## [34] GO.db_3.20.0            ggplot2_3.5.2           scales_1.4.0           
## [37] cli_3.6.5               crayon_1.5.3            treeio_1.30.0          
## [40] generics_0.1.4          ggtree_3.14.0           httr_1.4.7             
## [43] reshape2_1.4.4          getopt_1.20.4           ape_5.8-1              
## [46] DBI_1.2.3               qvalue_2.38.0           cachem_1.1.0           
## [49] DOSE_4.0.0              stringr_1.5.1           zlibbioc_1.52.0        
## [52] splines_4.4.3           parallel_4.4.3          ggplotify_0.1.2        
## [55] XVector_0.46.0          yulab.utils_0.2.0       vctrs_0.6.5            
## [58] Matrix_1.7-3            jsonlite_2.0.0          patchwork_1.3.1        
## [61] gridGraphics_0.5-1      ggrepel_0.9.6           bit64_4.6.0-1          
## [64] tidyr_1.3.1             jquerylib_0.1.4         glue_1.8.0             
## [67] codetools_0.2-20        cowplot_1.2.0           stringi_1.8.7          
## [70] gtable_0.3.6            GenomeInfoDb_1.42.0     UCSC.utils_1.2.0       
## [73] tibble_3.3.0            pillar_1.11.0           htmltools_0.5.8.1      
## [76] fgsea_1.32.2            GenomeInfoDbData_1.2.13 R6_2.6.1               
## [79] evaluate_1.0.4          lattice_0.22-7          R.methodsS3_1.8.2      
## [82] png_0.1-8               memoise_2.0.1           ggfun_0.2.0            
## [85] bslib_0.9.0             Rcpp_1.1.0              fastmatch_1.1-6        
## [88] nlme_3.1-168            xfun_0.52               fs_1.6.6               
## [91] pkgconfig_2.0.3
# Create directory for results
OUTDIR <- paste0("for_publication/results")

if (!dir.exists(OUTDIR)) {
    dir.create(OUTDIR)
}

Define constants

contrasts <- c(
    "GFP_nanobody-Control_nanobody",
    "GFP_nanobody-Untreated",
    "Control_nanobody-Untreated"
)

Load data

data <- read.csv("for_publication/results/differential_abundance.csv")

Make ordered list and do gene set enrichment test for each contrast

for (i in 1:length(contrasts)) {
    contrast_short <- gsub("treatment| ", "", contrasts[i])

    data_tmp <- data %>%
        mutate(contrast_short = gsub("treatment| ", "", contrast)) %>%
        filter(contrast_short == contrasts[i]) %>%
        filter(!is.na(logFC)) %>%
        arrange(-logFC)

    print(paste(
        "Doing GSEA with", nrow(data_tmp),
        "unique proteins for contrast", contrasts[i]
    ))

    gene_list <- data_tmp$logFC
    names(gene_list) <- data_tmp$PG.ProteinGroups

    set.seed(14617)
    res <- gseGO(
        geneList = gene_list,  # ordered list of statistics, like logFC
        ont = "ALL",
        OrgDb = org.Hs.eg.db,
        keyType = "UNIPROT",
        minGSSize = 15,
        pvalueCutoff = 1,
        pAdjustMethod = "BH",
        scoreType = "std"
    )

    write.csv(
        as.data.frame(res) %>% mutate(
            contrast = contrasts[i],
            contrast_short = contrast_short,
            gene_list_order = "most_pos_to_most_neg_logFC"
        ),
        row.names = FALSE,
        file = paste0(OUTDIR, "/rawlogFC_gsea_", contrast_short, ".csv")
    )
}
## [1] "Doing GSEA with 8035 unique proteins for contrast GFP_nanobody-Control_nanobody"
## using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
## preparing geneSet collections...
## GSEA analysis...
## leading edge analysis...
## done...
## [1] "Doing GSEA with 8033 unique proteins for contrast GFP_nanobody-Untreated"
## using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
## preparing geneSet collections...
## GSEA analysis...
## Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
## minSize, : For some of the pathways the P-values were likely overestimated. For
## such pathways log2err is set to NA.
## Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
## minSize, : For some pathways, in reality P-values are less than 1e-10. You can
## set the `eps` argument to zero for better estimation.
## leading edge analysis...
## done...
## [1] "Doing GSEA with 8035 unique proteins for contrast Control_nanobody-Untreated"
## using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
## preparing geneSet collections...
## GSEA analysis...
## leading edge analysis...
## done...