Code is expected to be run in the following conda environment:
mamba env create -f for_publication/gsea.yaml
mamba activate gsea_for_publication
Rscript -e ‘library(rmarkdown); rmarkdown::render(“for_publication/test_gene_set_enrichment.Rmd”, “html_document”)’
NOTE: This script takes a while to run
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(optparse)
library(org.Hs.eg.db)
## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:dplyr':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, saveRDS, setdiff,
## table, tapply, union, unique, unsplit, which.max, which.min
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: IRanges
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:dplyr':
##
## first, rename
## The following object is masked from 'package:utils':
##
## findMatches
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
##
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, slice
##
## Attaching package: 'AnnotationDbi'
## The following object is masked from 'package:dplyr':
##
## select
##
library(clusterProfiler)
##
## clusterProfiler v4.14.0 Learn more at https://yulab-smu.top/contribution-knowledge-mining/
##
## Please cite:
##
## T Wu, E Hu, S Xu, M Chen, P Guo, Z Dai, T Feng, L Zhou, W Tang, L Zhan,
## X Fu, S Liu, X Bo, and G Yu. clusterProfiler 4.0: A universal
## enrichment tool for interpreting omics data. The Innovation. 2021,
## 2(3):100141
##
## Attaching package: 'clusterProfiler'
## The following object is masked from 'package:AnnotationDbi':
##
## select
## The following object is masked from 'package:IRanges':
##
## slice
## The following object is masked from 'package:S4Vectors':
##
## rename
## The following object is masked from 'package:stats':
##
## filter
sessionInfo()
## R version 4.4.3 (2025-02-28)
## Platform: x86_64-conda-linux-gnu
## Running under: Ubuntu 22.04.4 LTS
##
## Matrix products: default
## BLAS/LAPACK: /home/sarah.nadeau/miniforge3/envs/gsea_for_publication/lib/libopenblasp-r0.3.30.so; LAPACK version 3.12.0
##
## locale:
## [1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8
## [4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8
## [7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C
## [10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Europe/Rome
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] clusterProfiler_4.14.0 org.Hs.eg.db_3.20.0 AnnotationDbi_1.68.0
## [4] IRanges_2.40.0 S4Vectors_0.44.0 Biobase_2.66.0
## [7] BiocGenerics_0.52.0 optparse_1.7.5 dplyr_1.1.4
## [10] rmarkdown_2.29
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.2.1 farver_2.1.2 blob_1.2.4
## [4] R.utils_2.13.0 Biostrings_2.74.0 lazyeval_0.2.2
## [7] fastmap_1.2.0 digest_0.6.37 lifecycle_1.0.4
## [10] KEGGREST_1.46.0 tidytree_0.4.6 RSQLite_2.4.2
## [13] magrittr_2.0.3 compiler_4.4.3 rlang_1.1.6
## [16] sass_0.4.10 tools_4.4.3 igraph_2.1.4
## [19] ggtangle_0.0.7 data.table_1.17.8 knitr_1.50
## [22] bit_4.6.0 gson_0.1.0 plyr_1.8.9
## [25] RColorBrewer_1.1-3 aplot_0.2.8 BiocParallel_1.40.0
## [28] purrr_1.1.0 R.oo_1.27.1 grid_4.4.3
## [31] GOSemSim_2.32.0 enrichplot_1.26.1 colorspace_2.1-1
## [34] GO.db_3.20.0 ggplot2_3.5.2 scales_1.4.0
## [37] cli_3.6.5 crayon_1.5.3 treeio_1.30.0
## [40] generics_0.1.4 ggtree_3.14.0 httr_1.4.7
## [43] reshape2_1.4.4 getopt_1.20.4 ape_5.8-1
## [46] DBI_1.2.3 qvalue_2.38.0 cachem_1.1.0
## [49] DOSE_4.0.0 stringr_1.5.1 zlibbioc_1.52.0
## [52] splines_4.4.3 parallel_4.4.3 ggplotify_0.1.2
## [55] XVector_0.46.0 yulab.utils_0.2.0 vctrs_0.6.5
## [58] Matrix_1.7-3 jsonlite_2.0.0 patchwork_1.3.1
## [61] gridGraphics_0.5-1 ggrepel_0.9.6 bit64_4.6.0-1
## [64] tidyr_1.3.1 jquerylib_0.1.4 glue_1.8.0
## [67] codetools_0.2-20 cowplot_1.2.0 stringi_1.8.7
## [70] gtable_0.3.6 GenomeInfoDb_1.42.0 UCSC.utils_1.2.0
## [73] tibble_3.3.0 pillar_1.11.0 htmltools_0.5.8.1
## [76] fgsea_1.32.2 GenomeInfoDbData_1.2.13 R6_2.6.1
## [79] evaluate_1.0.4 lattice_0.22-7 R.methodsS3_1.8.2
## [82] png_0.1-8 memoise_2.0.1 ggfun_0.2.0
## [85] bslib_0.9.0 Rcpp_1.1.0 fastmatch_1.1-6
## [88] nlme_3.1-168 xfun_0.52 fs_1.6.6
## [91] pkgconfig_2.0.3
# Create directory for results
OUTDIR <- paste0("for_publication/results")
if (!dir.exists(OUTDIR)) {
dir.create(OUTDIR)
}
contrasts <- c(
"GFP_nanobody-Control_nanobody",
"GFP_nanobody-Untreated",
"Control_nanobody-Untreated"
)
data <- read.csv("for_publication/results/differential_abundance.csv")
for (i in 1:length(contrasts)) {
contrast_short <- gsub("treatment| ", "", contrasts[i])
data_tmp <- data %>%
mutate(contrast_short = gsub("treatment| ", "", contrast)) %>%
filter(contrast_short == contrasts[i]) %>%
filter(!is.na(logFC)) %>%
arrange(-logFC)
print(paste(
"Doing GSEA with", nrow(data_tmp),
"unique proteins for contrast", contrasts[i]
))
gene_list <- data_tmp$logFC
names(gene_list) <- data_tmp$PG.ProteinGroups
set.seed(14617)
res <- gseGO(
geneList = gene_list, # ordered list of statistics, like logFC
ont = "ALL",
OrgDb = org.Hs.eg.db,
keyType = "UNIPROT",
minGSSize = 15,
pvalueCutoff = 1,
pAdjustMethod = "BH",
scoreType = "std"
)
write.csv(
as.data.frame(res) %>% mutate(
contrast = contrasts[i],
contrast_short = contrast_short,
gene_list_order = "most_pos_to_most_neg_logFC"
),
row.names = FALSE,
file = paste0(OUTDIR, "/rawlogFC_gsea_", contrast_short, ".csv")
)
}
## [1] "Doing GSEA with 8035 unique proteins for contrast GFP_nanobody-Control_nanobody"
## using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
## preparing geneSet collections...
## GSEA analysis...
## leading edge analysis...
## done...
## [1] "Doing GSEA with 8033 unique proteins for contrast GFP_nanobody-Untreated"
## using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
## preparing geneSet collections...
## GSEA analysis...
## Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
## minSize, : For some of the pathways the P-values were likely overestimated. For
## such pathways log2err is set to NA.
## Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
## minSize, : For some pathways, in reality P-values are less than 1e-10. You can
## set the `eps` argument to zero for better estimation.
## leading edge analysis...
## done...
## [1] "Doing GSEA with 8035 unique proteins for contrast Control_nanobody-Untreated"
## using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
## preparing geneSet collections...
## GSEA analysis...
## leading edge analysis...
## done...