Code is expected to be run in the following conda environment: mamba env create -f for_publication/gsea.yaml mamba activate gsea_for_publication
Rscript -e ‘library(rmarkdown); rmarkdown::render(“for_publication/make_plots.Rmd”, “html_document”)’
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
library(stringr)
library(org.Hs.eg.db)
## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:dplyr':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, saveRDS, setdiff,
## table, tapply, union, unique, unsplit, which.max, which.min
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: IRanges
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:tidyr':
##
## expand
## The following objects are masked from 'package:dplyr':
##
## first, rename
## The following object is masked from 'package:utils':
##
## findMatches
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
##
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, slice
##
## Attaching package: 'AnnotationDbi'
## The following object is masked from 'package:dplyr':
##
## select
##
library(AnnotationDbi)
library(ggrepel)
library(clusterProfiler)
##
## clusterProfiler v4.14.0 Learn more at https://yulab-smu.top/contribution-knowledge-mining/
##
## Please cite:
##
## S Xu, E Hu, Y Cai, Z Xie, X Luo, L Zhan, W Tang, Q Wang, B Liu, R Wang,
## W Xie, T Wu, L Xie, G Yu. Using clusterProfiler to characterize
## multiomics data. Nature Protocols. 2024, 19(11):3292-3320
##
## Attaching package: 'clusterProfiler'
## The following object is masked from 'package:AnnotationDbi':
##
## select
## The following object is masked from 'package:IRanges':
##
## slice
## The following object is masked from 'package:S4Vectors':
##
## rename
## The following object is masked from 'package:stats':
##
## filter
library(ggpubr)
library(GO.db)
library(viridis)
## Loading required package: viridisLite
library(stringr)
library(ggridges)
sessionInfo()
## R version 4.4.3 (2025-02-28)
## Platform: x86_64-conda-linux-gnu
## Running under: Ubuntu 22.04.4 LTS
##
## Matrix products: default
## BLAS/LAPACK: /home/sarah.nadeau/miniforge3/envs/gsea_for_publication/lib/libopenblasp-r0.3.30.so; LAPACK version 3.12.0
##
## locale:
## [1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8
## [4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8
## [7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C
## [10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Europe/Rome
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] ggridges_0.5.6 viridis_0.6.5 viridisLite_0.4.2
## [4] GO.db_3.20.0 ggpubr_0.6.1 clusterProfiler_4.14.0
## [7] ggrepel_0.9.6 org.Hs.eg.db_3.20.0 AnnotationDbi_1.68.0
## [10] IRanges_2.40.0 S4Vectors_0.44.0 Biobase_2.66.0
## [13] BiocGenerics_0.52.0 stringr_1.5.1 ggplot2_3.5.2
## [16] tidyr_1.3.1 dplyr_1.1.4 rmarkdown_2.29
##
## loaded via a namespace (and not attached):
## [1] DBI_1.2.3 gson_0.1.0 gridExtra_2.3
## [4] rlang_1.1.6 magrittr_2.0.3 DOSE_4.0.0
## [7] compiler_4.4.3 RSQLite_2.4.2 png_0.1-8
## [10] vctrs_0.6.5 reshape2_1.4.4 pkgconfig_2.0.3
## [13] crayon_1.5.3 fastmap_1.2.0 backports_1.5.0
## [16] XVector_0.46.0 enrichplot_1.26.1 UCSC.utils_1.2.0
## [19] purrr_1.1.0 bit_4.6.0 xfun_0.52
## [22] zlibbioc_1.52.0 cachem_1.1.0 aplot_0.2.8
## [25] GenomeInfoDb_1.42.0 jsonlite_2.0.0 blob_1.2.4
## [28] BiocParallel_1.40.0 broom_1.0.9 parallel_4.4.3
## [31] R6_2.6.1 bslib_0.9.0 stringi_1.8.7
## [34] RColorBrewer_1.1-3 car_3.1-3 jquerylib_0.1.4
## [37] GOSemSim_2.32.0 Rcpp_1.1.0 knitr_1.50
## [40] ggtangle_0.0.7 R.utils_2.13.0 Matrix_1.7-3
## [43] splines_4.4.3 igraph_2.1.4 tidyselect_1.2.1
## [46] abind_1.4-5 qvalue_2.38.0 codetools_0.2-20
## [49] lattice_0.22-7 tibble_3.3.0 plyr_1.8.9
## [52] treeio_1.30.0 withr_3.0.2 KEGGREST_1.46.0
## [55] evaluate_1.0.4 gridGraphics_0.5-1 Biostrings_2.74.0
## [58] pillar_1.11.0 ggtree_3.14.0 carData_3.0-5
## [61] ggfun_0.2.0 generics_0.1.4 scales_1.4.0
## [64] tidytree_0.4.6 glue_1.8.0 lazyeval_0.2.2
## [67] tools_4.4.3 data.table_1.17.8 fgsea_1.32.2
## [70] ggsignif_0.6.4 fs_1.6.6 fastmatch_1.1-6
## [73] cowplot_1.2.0 grid_4.4.3 ape_5.8-1
## [76] colorspace_2.1-1 nlme_3.1-168 GenomeInfoDbData_1.2.13
## [79] patchwork_1.3.1 Formula_1.2-5 cli_3.6.5
## [82] gtable_0.3.6 R.methodsS3_1.8.2 rstatix_0.7.2
## [85] yulab.utils_0.2.0 sass_0.4.10 digest_0.6.37
## [88] ggplotify_0.1.2 farver_2.1.2 memoise_2.0.1
## [91] htmltools_0.5.8.1 R.oo_1.27.1 lifecycle_1.0.4
## [94] httr_1.4.7 bit64_4.6.0-1
# Create directory for results
OUTDIR <- paste0("for_publication/results")
if (!dir.exists(OUTDIR)) {
dir.create(OUTDIR)
}
gse_prot <- rbind(
read.csv("for_publication/results/rawlogFC_gsea_Control_nanobody-Untreated.csv"),
read.csv("for_publication/results/rawlogFC_gsea_GFP_nanobody-Control_nanobody.csv"),
read.csv("for_publication/results/rawlogFC_gsea_GFP_nanobody-Untreated.csv")
)
dge_prot <- read.csv("for_publication/results/differential_abundance.csv")
abundance_prot <- read.csv("for_publication/data/prot_abundance.csv")
comparison_order <- c(
"GFP nanobody vs.\nUntreated" = "GFP_nanobody-Untreated",
"GFP nanobody vs.\nControl nanobody" = "GFP_nanobody-Control_nanobody",
"Control nanobody vs.\nUntreated" = "Control_nanobody-Untreated"
)
# Get features annotated to GO term
get_prots_by_go_term <- function(go_ids) {
is_first <- TRUE
for (id in go_ids) {
prots <- AnnotationDbi::select(
org.Hs.eg.db,
keytype = "GOALL",
columns = c("GOALL", "UNIPROT", "SYMBOL"),
keys = id
)
if (is_first) {
is_first <- FALSE
res <- prots %>% mutate(ID = id)
} else {
tmp <- prots %>% mutate(ID = id)
res <- rbind(res, tmp)
}
}
# Remove duplicates introduced by EVIDENCEALL
res <- res %>% group_by(ID, GOALL, UNIPROT, SYMBOL) %>% summarize(n_evidence_lines = n())
return(res)
}
# Reduce GO terms to parent term
offspring_list_bp <- as.list(GOBPOFFSPRING)
offspring_list_cc <- as.list(GOCCOFFSPRING)
offspring_list_mf <- as.list(GOMFOFFSPRING)
get_offspring <- function(
go_id,
ontology
) {
if (ontology == "BP") {
return(offspring_list_bp[[go_id]])
} else if (ontology == "CC") {
return(offspring_list_cc[[go_id]])
} else if (ontology == "MF") {
return(offspring_list_mf[[go_id]])
}
}
# Parse percent of tags (proteins in DGE results and in GO term) in leading edge
parse_tags_in_leading_edge <- function(leading_edge) {
tmp <- gsub("^tags=", "", leading_edge)
tmp2 <- gsub("%.*$", "", tmp)
return(as.numeric(tmp2))
}
# Get title case for gene ontology terms
get_title_case <- function(description) {
tmp <- str_to_title(description)
tmp <- unlist(lapply(X = tmp, FUN = gsub, pattern = "Rna", replacement = "RNA"))
tmp <- unlist(lapply(X = tmp, FUN = gsub, pattern = "Dna", replacement = "DNA"))
tmp <- unlist(lapply(X = tmp, FUN = gsub, pattern = "Or ", replacement = "or "))
tmp <- unlist(lapply(X = tmp, FUN = gsub, pattern = "On ", replacement = "on "))
tmp <- unlist(lapply(X = tmp, FUN = gsub, pattern = " A ", replacement = " a "))
return(tmp)
}
Also check if these terms are enriched in the other comparisons
top_terms <- gse_prot %>%
filter(contrast_short == "GFP_nanobody-Untreated") %>%
filter(p.adjust < 0.05) %>%
# group_by(ONTOLOGY) %>%
slice_min(n = 100, order_by = pvalue, with_ties = FALSE) %>%
dplyr::select(ONTOLOGY, ID, Description, enrichmentScore, NES, pvalue, p.adjust, contrast_short)
# Remove child terms, keep parents
top_terms$is_child <- FALSE
for (j in seq_len(nrow(top_terms))) {
child_terms <- get_offspring(go_id = top_terms[[j, "ID"]], ontology = top_terms[[j, "ONTOLOGY"]])
child_idxs <- which(top_terms$ID %in% child_terms)
top_terms[child_idxs, "is_child"] <- TRUE
}
print(paste("Removing", sum(top_terms$is_child), "/", nrow(top_terms), "terms for being a child term"))
## [1] "Removing 57 / 100 terms for being a child term"
most_signif_terms <- top_terms %>%
filter(!is_child) %>%
# group_by(ONTOLOGY) %>%
slice_min(order_by = p.adjust, n = 20, with_ties = FALSE) %>%
ungroup()
term_order <- most_signif_terms %>%
mutate(ont_factor = factor(
ONTOLOGY,
levels = c("BP", "CC", "MF"),
labels = c("Biological process", "Cellular compartment", "Molecular function")
)) %>%
arrange(desc(ont_factor), NES) %>%
mutate(desc_label = unlist(lapply(X = Description, FUN = get_title_case)))
to_plot <- gse_prot %>%
filter(ID %in% most_signif_terms$ID) %>%
dplyr::select(ONTOLOGY, ID, Description, enrichmentScore, setSize, NES, pvalue, p.adjust, contrast_short, leading_edge) %>%
mutate(change_direction = case_when(enrichmentScore < 0 ~ "Depletion", TRUE ~ "Enrichment")) %>%
mutate(change_direction = factor(change_direction, levels = c("Enrichment", "Depletion"))) %>%
mutate(percent_tags_in_leading_edge = unlist(lapply(X = leading_edge, FUN = parse_tags_in_leading_edge)))
to_plot <- to_plot %>%
mutate(ont_factor = factor(
ONTOLOGY,
levels = c("BP", "CC", "MF"),
labels = c("Biological process", "Cellular compartment", "Molecular function")
)) %>%
mutate(p_val_key = case_when(
p.adjust <= 0.001 ~ "***",
p.adjust <= 0.01 ~ "**",
p.adjust <= 0.05 ~ "*",
TRUE ~ ""
)) %>%
mutate(comparison_factor = factor(
contrast_short,
levels = comparison_order,
labels = names(comparison_order)
)) %>%
mutate(desc_factor = factor(
Description,
levels = term_order$Description,
labels = term_order$desc_label
))
# Make heatmap
p <- ggplot(
data = to_plot,
aes(x = comparison_factor, y = desc_factor, fill = NES, alpha = -log10(p.adjust))
) +
geom_tile() +
geom_text(aes(label = p_val_key), show.legend = FALSE) +
scale_fill_gradient2(low = "blue", mid = "yellow", high = "red") +
theme_bw() +
labs(x = "Comparison", y = "GO term") +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major = element_blank()
) +
facet_grid(ONTOLOGY ~ ., scales = "free_y", space = "free_y")
ggsave(
plot = p,
paste0(OUTDIR, "/1_most_signif_go_terms_with_ontology.pdf"),
width = 6,
height = 5,
units = "in"
)
ggplot(
data = to_plot,
aes(x = comparison_factor, y = desc_factor, fill = NES, alpha = -log10(p.adjust))
) +
geom_tile() +
geom_text(aes(label = p_val_key), show.legend = FALSE) +
scale_fill_gradient2(low = "blue", mid = "yellow", high = "red") +
theme_bw() +
labs(x = "Comparison", y = "GO term") +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major = element_blank()
)
ggsave(
paste0(OUTDIR, "/1_most_signif_go_terms.pdf"),
width = 6,
height = 5,
units = "in"
)
# Get mapping from gene names to uniprot IDs
genes_to_uniprots <- dge_prot %>%
ungroup() %>%
select(Genes, PG.ProteinGroups) %>%
distinct(Genes, PG.ProteinGroups)
# Define custom pathways
mt_prots <- genes_to_uniprots %>%
filter(grepl("^MT-", Genes) | grepl("^MRPS", Genes) | grepl("^MRPL", Genes) | PG.ProteinGroups == "P00003:eGFP (MT)")
lysosome_markers <- c(
"ACP2" = "P11117",
"LAMP1" = "P11279",
"LAMP2" = "P13473",
"LAMP3" = "Q9UQV4",
"SCARB2" = "Q14108",
"CTSB" = "P07858",
"CTSD" = "P07339",
"CTSL" = "P07711",
"CD63" = "P08962",
"M6PR" = "P20645",
"GLMP" = "Q8WWB7",
"BORCS5" = "Q969J3",
"BORCS6" = "Q96GS4",
"SLC46A3" = "Q7Z3Q1",
"GPR155" = "Q7Z3F1",
"LAPTM5" = "Q13571",
"GBA1" = "P04062",
"DNASE2" = "O00115",
"MANBA" = "O00462",
"CTSA" = "P10619",
"GILT" = "P13284",
"LIPA" = "P38571",
"TPP1" = "O14773",
"GAA" = "P10253",
"CYB561A3" = "Q8NBI2",
"PLA2G15" = "Q8NCC3",
"RILP" = "Q96NA2",
"MAN2B1" = "O00754",
"PRCP" = "P42785",
"LAMTOR1" = "Q6IAA8"
)
lysosome_prots <- genes_to_uniprots %>% filter(PG.ProteinGroups %in% lysosome_markers)
er_markers <- c(
"SEC61A1" = "P61619",
"SEC61B" = "P60468",
"SEC61G" = "P60059",
"RPN1" = "P04843",
"RPN2" = "P04844",
"ATP2A2" = "P16615",
"HO-1" = "P09601",
"CANX" = "P27824",
"CALR" = "P27797",
"HSPA5" = "P11021",
"HSP90B1" = "P14625",
"GP96" = "P14625",
"PDI" = "P07237",
"PDIA1" = "P07237",
"KDELR1" = "P24390",
"GRP78" = "P11021",
"PDIA3" = "P30101",
"ERP29" = "P30040",
"PDIA4" = "P13667",
"TXNDC12" = "O95881",
"ERP27" = "Q96DN0",
"ERP44" = "Q9BS26",
"TXNDC5" = "Q8NBS9",
"DNAJC10" = "Q8IXB1",
"POGLUT2" = "Q6UW63",
"KDELR2" = "P33947",
"DNAJB11" = "Q9UBS4",
"ERP70" = "A0A090N8Y2",
"HERPUD1" = "Q15011",
"DNAJC16" = "Q9Y2G8"
)
er_prots <- genes_to_uniprots %>% filter(PG.ProteinGroups %in% er_markers)
nuclear_markers <- c(
"LMNB2" = "Q03252",
"LMNB1" = "P20700",
"LMNA" = "P02545",
"LAP1" = "Q5JTV8",
"LAP2A" = "P42166",
"LAP2B" = "P42167",
"NCL" = "P19338"
)
nuclear_prots <- rbind(
genes_to_uniprots %>% filter(PG.ProteinGroups %in% nuclear_markers),
genes_to_uniprots %>% filter(grepl("^H2A", Genes)),
genes_to_uniprots %>% filter(grepl("^H2B", Genes)),
genes_to_uniprots %>% filter(grepl("^HNRNP", Genes))
)
term_to_gene_df <- rbind(
mt_prots %>% mutate(TERM = "MT"),
lysosome_prots %>% mutate(TERM = "Lysosome"),
er_prots %>% mutate(TERM = "ER"),
nuclear_prots %>% mutate(TERM = "Nucleus")
)
# Perform gene set enrichment testing for each custom pathway, each comparison
is_first <- TRUE
for (contrast_i in c("GFP_nanobody-Untreated", "GFP_nanobody-Control_nanobody", "Control_nanobody-Untreated")) {
tmp <- dge_prot %>%
filter(contrast_short == contrast_i) %>%
filter(!is.na(logFC)) %>%
arrange(desc(logFC))
gene_list <- tmp$logFC
names(gene_list) <- tmp$PG.ProteinGroups
set.seed(14617)
gse_tmp <- GSEA(
geneList = gene_list,
minGSSize = 1,
maxGSSize = 500,
pvalueCutoff = 1,
pAdjustMethod = "BH",
TERM2GENE = term_to_gene_df %>% dplyr::select(TERM, GENE = PG.ProteinGroups)
)
if (is_first) {
gse_custom <- as.data.frame(gse_tmp) %>% mutate(contrast = contrast_i)
is_first <- FALSE
} else {
gse_custom <- rbind(
gse_custom,
as.data.frame(gse_tmp) %>% mutate(contrast = contrast_i)
)
}
}
## using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
## preparing geneSet collections...
## GSEA analysis...
## leading edge analysis...
## done...
## using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
## preparing geneSet collections...
## GSEA analysis...
## leading edge analysis...
## done...
## using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
## preparing geneSet collections...
## GSEA analysis...
## leading edge analysis...
## done...
write.csv(
gse_custom,
paste0(OUTDIR, "/2_mt_vs_lysosomes_er_p_vals.csv"),
row.names = FALSE
)
write.csv(
term_to_gene_df,
paste0(OUTDIR, "/2_mt_vs_lysosomes_er_protein_list.csv"),
row.names = FALSE
)
# Format data
to_plot_logfc_dist <- dge_prot %>%
mutate(
is_mt = PG.ProteinGroups %in% mt_prots$PG.ProteinGroups,
is_lysosome = PG.ProteinGroups %in% lysosome_prots$PG.ProteinGroups,
is_er = PG.ProteinGroups %in% er_prots$PG.ProteinGroups,
is_nuc = PG.ProteinGroups %in% nuclear_prots$PG.ProteinGroups
) %>% pivot_longer(
cols = c(is_mt, is_er, is_lysosome, is_nuc),
names_to = "prot_type", values_to = "is_prot_type"
) %>%
filter(is_prot_type) %>%
mutate(prot_type_label = case_when(
prot_type == "is_mt" ~ "Mitochondrial proteins",
prot_type == "is_lysosome" ~ "Lysosome proteins",
prot_type == "is_er" ~ "Endoplasmic\nReticulum proteins",
prot_type == "is_nuc" ~ "Nucleus proteins"
)) %>% mutate(prot_type_label = factor(
prot_type_label,
levels = c("Mitochondrial proteins", "Lysosome proteins", "Endoplasmic\nReticulum proteins", "Nucleus proteins")
)) %>%
mutate(contrast_factor = factor(
contrast_short,
levels = comparison_order[3:1],
labels = names(comparison_order)[3:1]
))
ggplot(
data = to_plot_logfc_dist,
aes(x = logFC, y = contrast_factor, fill = contrast_factor, height = after_stat(density))) +
geom_density_ridges(stat = "density", alpha = 0.5) +
geom_vline(xintercept = 0, linetype = "dashed", color = "black") +
facet_grid(. ~ prot_type_label) +
theme_ridges(center_axis_labels = TRUE) +
theme(
strip.background = element_blank(),
legend.position = "none"
) +
scale_fill_manual(values = c("grey30", "darkgreen", "lightgreen")) +
labs(x = expression(log[2]~fold~change), y = element_blank())
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_density()`).
ggsave(
paste0(OUTDIR, "/2_mt_vs_lysosomes_er_density.pdf"),
width = 9,
height = 3,
units = "in"
)
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_density()`).
core_enrichment <- gse_prot %>%
filter(Description == "mitochondrial gene expression", contrast == "GFP_nanobody-Control_nanobody") %>%
pull(core_enrichment)
core_enrichment <- strsplit(core_enrichment, split = "/")[[1]]
ggplot(
data = abundance_prot %>% filter(PG.ProteinGroups %in% core_enrichment),
aes(x = treatment, y = LogIntensity_norm)
) +
geom_point() +
facet_wrap(. ~ Genes, scales = "free_y") +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning: Removed 15 rows containing missing values or values outside the scale
## range (`geom_point()`).