Plots for proteomics on cultured neurons

Code is expected to be run in the following conda environment: mamba env create -f for_publication/gsea.yaml mamba activate gsea_for_publication

Rscript -e ‘library(rmarkdown); rmarkdown::render(“for_publication/make_plots.Rmd”, “html_document”)’

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
library(stringr)
library(org.Hs.eg.db)
## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, saveRDS, setdiff,
##     table, tapply, union, unique, unsplit, which.max, which.min
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: IRanges
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:tidyr':
## 
##     expand
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## The following object is masked from 'package:utils':
## 
##     findMatches
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## 
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice
## 
## Attaching package: 'AnnotationDbi'
## The following object is masked from 'package:dplyr':
## 
##     select
## 
library(AnnotationDbi)
library(ggrepel)
library(clusterProfiler)
## 
## clusterProfiler v4.14.0 Learn more at https://yulab-smu.top/contribution-knowledge-mining/
## 
## Please cite:
## 
## S Xu, E Hu, Y Cai, Z Xie, X Luo, L Zhan, W Tang, Q Wang, B Liu, R Wang,
## W Xie, T Wu, L Xie, G Yu. Using clusterProfiler to characterize
## multiomics data. Nature Protocols. 2024, 19(11):3292-3320
## 
## Attaching package: 'clusterProfiler'
## The following object is masked from 'package:AnnotationDbi':
## 
##     select
## The following object is masked from 'package:IRanges':
## 
##     slice
## The following object is masked from 'package:S4Vectors':
## 
##     rename
## The following object is masked from 'package:stats':
## 
##     filter
library(ggpubr)
library(GO.db)
library(viridis)
## Loading required package: viridisLite
library(stringr)
library(ggridges)

sessionInfo()
## R version 4.4.3 (2025-02-28)
## Platform: x86_64-conda-linux-gnu
## Running under: Ubuntu 22.04.4 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /home/sarah.nadeau/miniforge3/envs/gsea_for_publication/lib/libopenblasp-r0.3.30.so;  LAPACK version 3.12.0
## 
## locale:
##  [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
##  [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
##  [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
## [10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   
## 
## time zone: Europe/Rome
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] ggridges_0.5.6         viridis_0.6.5          viridisLite_0.4.2     
##  [4] GO.db_3.20.0           ggpubr_0.6.1           clusterProfiler_4.14.0
##  [7] ggrepel_0.9.6          org.Hs.eg.db_3.20.0    AnnotationDbi_1.68.0  
## [10] IRanges_2.40.0         S4Vectors_0.44.0       Biobase_2.66.0        
## [13] BiocGenerics_0.52.0    stringr_1.5.1          ggplot2_3.5.2         
## [16] tidyr_1.3.1            dplyr_1.1.4            rmarkdown_2.29        
## 
## loaded via a namespace (and not attached):
##  [1] DBI_1.2.3               gson_0.1.0              gridExtra_2.3          
##  [4] rlang_1.1.6             magrittr_2.0.3          DOSE_4.0.0             
##  [7] compiler_4.4.3          RSQLite_2.4.2           png_0.1-8              
## [10] vctrs_0.6.5             reshape2_1.4.4          pkgconfig_2.0.3        
## [13] crayon_1.5.3            fastmap_1.2.0           backports_1.5.0        
## [16] XVector_0.46.0          enrichplot_1.26.1       UCSC.utils_1.2.0       
## [19] purrr_1.1.0             bit_4.6.0               xfun_0.52              
## [22] zlibbioc_1.52.0         cachem_1.1.0            aplot_0.2.8            
## [25] GenomeInfoDb_1.42.0     jsonlite_2.0.0          blob_1.2.4             
## [28] BiocParallel_1.40.0     broom_1.0.9             parallel_4.4.3         
## [31] R6_2.6.1                bslib_0.9.0             stringi_1.8.7          
## [34] RColorBrewer_1.1-3      car_3.1-3               jquerylib_0.1.4        
## [37] GOSemSim_2.32.0         Rcpp_1.1.0              knitr_1.50             
## [40] ggtangle_0.0.7          R.utils_2.13.0          Matrix_1.7-3           
## [43] splines_4.4.3           igraph_2.1.4            tidyselect_1.2.1       
## [46] abind_1.4-5             qvalue_2.38.0           codetools_0.2-20       
## [49] lattice_0.22-7          tibble_3.3.0            plyr_1.8.9             
## [52] treeio_1.30.0           withr_3.0.2             KEGGREST_1.46.0        
## [55] evaluate_1.0.4          gridGraphics_0.5-1      Biostrings_2.74.0      
## [58] pillar_1.11.0           ggtree_3.14.0           carData_3.0-5          
## [61] ggfun_0.2.0             generics_0.1.4          scales_1.4.0           
## [64] tidytree_0.4.6          glue_1.8.0              lazyeval_0.2.2         
## [67] tools_4.4.3             data.table_1.17.8       fgsea_1.32.2           
## [70] ggsignif_0.6.4          fs_1.6.6                fastmatch_1.1-6        
## [73] cowplot_1.2.0           grid_4.4.3              ape_5.8-1              
## [76] colorspace_2.1-1        nlme_3.1-168            GenomeInfoDbData_1.2.13
## [79] patchwork_1.3.1         Formula_1.2-5           cli_3.6.5              
## [82] gtable_0.3.6            R.methodsS3_1.8.2       rstatix_0.7.2          
## [85] yulab.utils_0.2.0       sass_0.4.10             digest_0.6.37          
## [88] ggplotify_0.1.2         farver_2.1.2            memoise_2.0.1          
## [91] htmltools_0.5.8.1       R.oo_1.27.1             lifecycle_1.0.4        
## [94] httr_1.4.7              bit64_4.6.0-1
# Create directory for results
OUTDIR <- paste0("for_publication/results")

if (!dir.exists(OUTDIR)) {
    dir.create(OUTDIR)
}

Load data

gse_prot <- rbind(
    read.csv("for_publication/results/rawlogFC_gsea_Control_nanobody-Untreated.csv"),
    read.csv("for_publication/results/rawlogFC_gsea_GFP_nanobody-Control_nanobody.csv"),
    read.csv("for_publication/results/rawlogFC_gsea_GFP_nanobody-Untreated.csv")
)

dge_prot <- read.csv("for_publication/results/differential_abundance.csv")

abundance_prot <- read.csv("for_publication/data/prot_abundance.csv")

Define constants for plotting

comparison_order <- c(
    "GFP nanobody vs.\nUntreated" = "GFP_nanobody-Untreated",
    "GFP nanobody vs.\nControl nanobody" = "GFP_nanobody-Control_nanobody",
    "Control nanobody vs.\nUntreated" = "Control_nanobody-Untreated"
)

Define functions

# Get features annotated to GO term
get_prots_by_go_term <- function(go_ids) {
    is_first <- TRUE
    for (id in go_ids) {
        prots <- AnnotationDbi::select(
            org.Hs.eg.db,
            keytype = "GOALL",
            columns = c("GOALL", "UNIPROT", "SYMBOL"),
            keys = id
        )
        if (is_first) {
            is_first <- FALSE
            res <- prots %>% mutate(ID = id)
        } else {
            tmp <- prots %>% mutate(ID = id)
            res <- rbind(res, tmp)
        }
    }
    # Remove duplicates introduced by EVIDENCEALL
    res <- res %>% group_by(ID, GOALL, UNIPROT, SYMBOL) %>% summarize(n_evidence_lines = n())
    return(res)
}

# Reduce GO terms to parent term
offspring_list_bp <- as.list(GOBPOFFSPRING)
offspring_list_cc <- as.list(GOCCOFFSPRING)
offspring_list_mf <- as.list(GOMFOFFSPRING)

get_offspring <- function(
    go_id,
    ontology
) {
    if (ontology == "BP") {
        return(offspring_list_bp[[go_id]])
    } else if (ontology == "CC") {
        return(offspring_list_cc[[go_id]])
    } else if (ontology == "MF") {
        return(offspring_list_mf[[go_id]])
    } 
}

# Parse percent of tags (proteins in DGE results and in GO term) in leading edge
parse_tags_in_leading_edge <- function(leading_edge) {
    tmp <- gsub("^tags=", "", leading_edge)
    tmp2 <- gsub("%.*$", "", tmp)
    return(as.numeric(tmp2))
}

# Get title case for gene ontology terms
get_title_case <- function(description) {
    tmp <- str_to_title(description)
    tmp <- unlist(lapply(X = tmp, FUN = gsub, pattern = "Rna", replacement = "RNA"))
    tmp <- unlist(lapply(X = tmp, FUN = gsub, pattern = "Dna", replacement = "DNA"))
    tmp <- unlist(lapply(X = tmp, FUN = gsub, pattern = "Or ", replacement = "or "))
    tmp <- unlist(lapply(X = tmp, FUN = gsub, pattern = "On ", replacement = "on "))
    tmp <- unlist(lapply(X = tmp, FUN = gsub, pattern = " A ", replacement = " a "))
    return(tmp)
}

Make heatmap of most significantly changed GO terms in GFP nanobody vs Untreated

Also check if these terms are enriched in the other comparisons

top_terms <- gse_prot %>%
    filter(contrast_short == "GFP_nanobody-Untreated") %>%
    filter(p.adjust < 0.05) %>%
    # group_by(ONTOLOGY) %>%
    slice_min(n = 100, order_by = pvalue, with_ties = FALSE) %>%
    dplyr::select(ONTOLOGY, ID, Description, enrichmentScore, NES, pvalue, p.adjust, contrast_short)

# Remove child terms, keep parents
top_terms$is_child <- FALSE
for (j in seq_len(nrow(top_terms))) {
    child_terms <- get_offspring(go_id = top_terms[[j, "ID"]], ontology = top_terms[[j, "ONTOLOGY"]])
    child_idxs <- which(top_terms$ID %in% child_terms)
    top_terms[child_idxs, "is_child"] <- TRUE
}
print(paste("Removing", sum(top_terms$is_child), "/", nrow(top_terms), "terms for being a child term"))
## [1] "Removing 57 / 100 terms for being a child term"
most_signif_terms <- top_terms %>%
    filter(!is_child) %>%
    # group_by(ONTOLOGY) %>%
    slice_min(order_by = p.adjust, n = 20, with_ties = FALSE) %>%
    ungroup()

term_order <- most_signif_terms %>%
    mutate(ont_factor = factor(
        ONTOLOGY,
        levels = c("BP", "CC", "MF"),
        labels = c("Biological process", "Cellular compartment", "Molecular function")
    )) %>%
    arrange(desc(ont_factor), NES) %>%
    mutate(desc_label = unlist(lapply(X = Description, FUN = get_title_case)))

to_plot <- gse_prot %>%
    filter(ID %in% most_signif_terms$ID) %>%
    dplyr::select(ONTOLOGY, ID, Description, enrichmentScore, setSize, NES, pvalue, p.adjust, contrast_short, leading_edge) %>%
    mutate(change_direction = case_when(enrichmentScore < 0 ~ "Depletion", TRUE ~ "Enrichment")) %>%
    mutate(change_direction = factor(change_direction, levels = c("Enrichment", "Depletion"))) %>%
    mutate(percent_tags_in_leading_edge = unlist(lapply(X = leading_edge, FUN = parse_tags_in_leading_edge)))

to_plot <- to_plot %>%
    mutate(ont_factor = factor(
        ONTOLOGY,
        levels = c("BP", "CC", "MF"),
        labels = c("Biological process", "Cellular compartment", "Molecular function")
    )) %>%
    mutate(p_val_key = case_when(
        p.adjust <= 0.001 ~ "***",
        p.adjust <= 0.01 ~ "**",
        p.adjust <= 0.05 ~ "*",
        TRUE ~ ""
    )) %>%
    mutate(comparison_factor = factor(
        contrast_short,
        levels = comparison_order,
        labels = names(comparison_order)
    )) %>%
    mutate(desc_factor = factor(
        Description,
        levels = term_order$Description,
        labels = term_order$desc_label
    ))

# Make heatmap
p <- ggplot(
    data = to_plot,
    aes(x = comparison_factor, y = desc_factor, fill = NES, alpha = -log10(p.adjust))
) +
    geom_tile() +
    geom_text(aes(label = p_val_key), show.legend = FALSE) +
    scale_fill_gradient2(low = "blue", mid = "yellow", high = "red") +
    theme_bw() +
    labs(x = "Comparison", y = "GO term") +
    theme(
        axis.text.x = element_text(angle = 45, hjust = 1),
        panel.grid.major = element_blank()
    ) +
    facet_grid(ONTOLOGY ~ ., scales = "free_y", space = "free_y")
ggsave(
    plot = p,
    paste0(OUTDIR, "/1_most_signif_go_terms_with_ontology.pdf"),
    width = 6,
    height = 5,
    units = "in"
)

ggplot(
    data = to_plot,
    aes(x = comparison_factor, y = desc_factor, fill = NES, alpha = -log10(p.adjust))
) +
    geom_tile() +
    geom_text(aes(label = p_val_key), show.legend = FALSE) +
    scale_fill_gradient2(low = "blue", mid = "yellow", high = "red") +
    theme_bw() +
    labs(x = "Comparison", y = "GO term") +
    theme(
        axis.text.x = element_text(angle = 45, hjust = 1),
        panel.grid.major = element_blank()
    )

ggsave(
    paste0(OUTDIR, "/1_most_signif_go_terms.pdf"),
    width = 6,
    height = 5,
    units = "in"
)

Test gene set enrichment for custom pathways

# Get mapping from gene names to uniprot IDs
genes_to_uniprots <- dge_prot %>%
    ungroup() %>%
    select(Genes, PG.ProteinGroups) %>%
    distinct(Genes, PG.ProteinGroups)

# Define custom pathways
mt_prots <- genes_to_uniprots %>%
    filter(grepl("^MT-", Genes) | grepl("^MRPS", Genes) | grepl("^MRPL", Genes) | PG.ProteinGroups == "P00003:eGFP (MT)")

lysosome_markers <- c(
    "ACP2" = "P11117",
    "LAMP1" = "P11279",
    "LAMP2" = "P13473",
    "LAMP3" = "Q9UQV4",
    "SCARB2" = "Q14108",
    "CTSB" = "P07858",
    "CTSD" = "P07339",
    "CTSL" = "P07711",
    "CD63" = "P08962",
    "M6PR" = "P20645",
    "GLMP" = "Q8WWB7",
    "BORCS5" = "Q969J3",
    "BORCS6" = "Q96GS4",
    "SLC46A3" = "Q7Z3Q1",
    "GPR155" = "Q7Z3F1",
    "LAPTM5" = "Q13571",
    "GBA1" = "P04062",
    "DNASE2" = "O00115",
    "MANBA" = "O00462",
    "CTSA" = "P10619",
    "GILT" = "P13284",
    "LIPA" = "P38571",
    "TPP1" = "O14773",
    "GAA" = "P10253",
    "CYB561A3" = "Q8NBI2",
    "PLA2G15" = "Q8NCC3",
    "RILP" = "Q96NA2",
    "MAN2B1" = "O00754",
    "PRCP" = "P42785",
    "LAMTOR1" = "Q6IAA8"
) 
lysosome_prots <- genes_to_uniprots %>% filter(PG.ProteinGroups %in% lysosome_markers)

er_markers <- c(
    "SEC61A1" = "P61619",
    "SEC61B" = "P60468",
    "SEC61G" = "P60059",
    "RPN1" = "P04843",
    "RPN2" = "P04844",
    "ATP2A2" = "P16615",
    "HO-1" = "P09601",
    "CANX" = "P27824",
    "CALR" = "P27797",
    "HSPA5" = "P11021",
    "HSP90B1" = "P14625",
    "GP96" = "P14625",
    "PDI" = "P07237",
    "PDIA1" = "P07237",
    "KDELR1" = "P24390",
    "GRP78" = "P11021",
    "PDIA3" = "P30101",
    "ERP29" = "P30040",
    "PDIA4" = "P13667",
    "TXNDC12" = "O95881",
    "ERP27" = "Q96DN0",
    "ERP44" = "Q9BS26",
    "TXNDC5" = "Q8NBS9",
    "DNAJC10" = "Q8IXB1",
    "POGLUT2" = "Q6UW63",
    "KDELR2" = "P33947",
    "DNAJB11" = "Q9UBS4",
    "ERP70" = "A0A090N8Y2",
    "HERPUD1" = "Q15011",
    "DNAJC16" = "Q9Y2G8"
)
er_prots <- genes_to_uniprots %>% filter(PG.ProteinGroups %in% er_markers)

nuclear_markers <- c(
    "LMNB2" = "Q03252",
    "LMNB1" = "P20700",
    "LMNA" = "P02545",
    "LAP1" = "Q5JTV8",
    "LAP2A" = "P42166",
    "LAP2B" = "P42167",
    "NCL" = "P19338"
)
nuclear_prots <- rbind(
    genes_to_uniprots %>% filter(PG.ProteinGroups %in% nuclear_markers),
    genes_to_uniprots %>% filter(grepl("^H2A", Genes)),
    genes_to_uniprots %>% filter(grepl("^H2B", Genes)),
    genes_to_uniprots %>% filter(grepl("^HNRNP", Genes))
)

term_to_gene_df <- rbind(
    mt_prots %>% mutate(TERM = "MT"),
    lysosome_prots %>% mutate(TERM = "Lysosome"),
    er_prots %>% mutate(TERM = "ER"),
    nuclear_prots %>% mutate(TERM = "Nucleus")
)

# Perform gene set enrichment testing for each custom pathway, each comparison
is_first <- TRUE
for (contrast_i in c("GFP_nanobody-Untreated", "GFP_nanobody-Control_nanobody", "Control_nanobody-Untreated")) {
    tmp <- dge_prot %>%
        filter(contrast_short == contrast_i) %>%
        filter(!is.na(logFC)) %>%
        arrange(desc(logFC))

    gene_list <- tmp$logFC
    names(gene_list) <- tmp$PG.ProteinGroups

    set.seed(14617)
    gse_tmp <- GSEA(
        geneList = gene_list,
        minGSSize = 1,
        maxGSSize = 500,
        pvalueCutoff = 1,
        pAdjustMethod = "BH",
        TERM2GENE = term_to_gene_df %>% dplyr::select(TERM, GENE = PG.ProteinGroups)
    )
    if (is_first) {
        gse_custom <- as.data.frame(gse_tmp) %>% mutate(contrast = contrast_i)
        is_first <- FALSE
    } else {
        gse_custom <- rbind(
            gse_custom,
            as.data.frame(gse_tmp) %>% mutate(contrast = contrast_i)
        )
    }
}
## using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
## preparing geneSet collections...
## GSEA analysis...
## leading edge analysis...
## done...
## using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
## preparing geneSet collections...
## GSEA analysis...
## leading edge analysis...
## done...
## using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
## preparing geneSet collections...
## GSEA analysis...
## leading edge analysis...
## done...
write.csv(
    gse_custom,
    paste0(OUTDIR, "/2_mt_vs_lysosomes_er_p_vals.csv"),
    row.names = FALSE
)

write.csv(
    term_to_gene_df,
    paste0(OUTDIR, "/2_mt_vs_lysosomes_er_protein_list.csv"),
    row.names = FALSE
)

Plot histogram of logFC for custom terms for each comparison

# Format data
to_plot_logfc_dist <- dge_prot %>%
    mutate(
        is_mt = PG.ProteinGroups %in% mt_prots$PG.ProteinGroups,
        is_lysosome = PG.ProteinGroups %in% lysosome_prots$PG.ProteinGroups,
        is_er = PG.ProteinGroups %in% er_prots$PG.ProteinGroups,
        is_nuc = PG.ProteinGroups %in% nuclear_prots$PG.ProteinGroups
    ) %>% pivot_longer(
        cols = c(is_mt, is_er, is_lysosome, is_nuc),
        names_to = "prot_type", values_to = "is_prot_type"
    ) %>%
    filter(is_prot_type) %>%
    mutate(prot_type_label = case_when(
        prot_type == "is_mt" ~ "Mitochondrial proteins",
        prot_type == "is_lysosome" ~ "Lysosome proteins",
        prot_type == "is_er" ~ "Endoplasmic\nReticulum proteins",
        prot_type == "is_nuc" ~ "Nucleus proteins"
    )) %>% mutate(prot_type_label = factor(
        prot_type_label,
        levels = c("Mitochondrial proteins", "Lysosome proteins", "Endoplasmic\nReticulum proteins", "Nucleus proteins")
    )) %>%
    mutate(contrast_factor = factor(
        contrast_short,
        levels = comparison_order[3:1],
        labels = names(comparison_order)[3:1]
    ))

ggplot(
    data = to_plot_logfc_dist,
    aes(x = logFC, y = contrast_factor, fill = contrast_factor, height = after_stat(density))) + 
    geom_density_ridges(stat = "density", alpha = 0.5) +
    geom_vline(xintercept = 0, linetype = "dashed", color = "black") +
    facet_grid(. ~ prot_type_label) +
    theme_ridges(center_axis_labels = TRUE) +
    theme(
        strip.background = element_blank(),
        legend.position = "none"
    ) +
    scale_fill_manual(values = c("grey30", "darkgreen", "lightgreen")) +
    labs(x = expression(log[2]~fold~change), y = element_blank())
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_density()`).

ggsave(
    paste0(OUTDIR, "/2_mt_vs_lysosomes_er_density.pdf"),
    width = 9,
    height = 3,
    units = "in"
)
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_density()`).

Sanity check core proteins driving top GO term result

core_enrichment <- gse_prot %>%
    filter(Description == "mitochondrial gene expression", contrast == "GFP_nanobody-Control_nanobody") %>%
    pull(core_enrichment)
core_enrichment <- strsplit(core_enrichment, split = "/")[[1]]

ggplot(
    data = abundance_prot %>% filter(PG.ProteinGroups %in% core_enrichment),
    aes(x = treatment, y = LogIntensity_norm)
) +
    geom_point() +
    facet_wrap(. ~ Genes, scales = "free_y") +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning: Removed 15 rows containing missing values or values outside the scale
## range (`geom_point()`).