Differential abundance for proteomics on cultured neurons

library(edgeR)
## Loading required package: limma
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(tibble)
library(optparse)
library(ggrepel)
## Loading required package: ggplot2
sessionInfo()
## R version 4.4.2 (2024-10-31)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 22.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0 
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
## 
## locale:
##  [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
##  [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
##  [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
## [10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   
## 
## time zone: Europe/Rome
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] ggrepel_0.9.6  ggplot2_3.5.2  optparse_1.7.5 tibble_3.2.1   tidyr_1.3.1   
## [6] dplyr_1.1.4    edgeR_4.4.2    limma_3.62.2  
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.6       jsonlite_2.0.0     compiler_4.4.2     Rcpp_1.0.14       
##  [5] tidyselect_1.2.1   jquerylib_0.1.4    scales_1.4.0       fastmap_1.2.0     
##  [9] statmod_1.5.0      lattice_0.20-45    R6_2.6.1           generics_0.1.4    
## [13] knitr_1.50         RColorBrewer_1.1-3 bslib_0.9.0        pillar_1.10.2     
## [17] rlang_1.1.6        getopt_1.20.4      cachem_1.1.0       xfun_0.52         
## [21] sass_0.4.10        cli_3.6.5          withr_3.0.2        magrittr_2.0.3    
## [25] digest_0.6.37      grid_4.4.2         locfit_1.5-9.12    lifecycle_1.0.4   
## [29] vctrs_0.6.5        evaluate_1.0.3     glue_1.8.0         farver_2.1.2      
## [33] rmarkdown_2.29     purrr_1.0.4        tools_4.4.2        pkgconfig_2.0.3   
## [37] htmltools_0.5.8.1
# Create directory for results
OUTDIR <- paste0("for_publication/results")

if (!dir.exists(OUTDIR)) {
    dir.create(OUTDIR)
}

Define constants

contrasts <- c(
    "treatmentGFP_nanobody-treatmentControl_nanobody",
    "treatmentGFP_nanobody-treatmentUntreated",
    "treatmentControl_nanobody-treatmentUntreated"
)
min_mean_for_feature <- 0
min_n_per_treatment_for_features <- 0
min_n_total_for_features <- 5

Load data

data <- read.csv("for_publication/data/prot_abundance.csv")
metadata <- read.csv("for_publication/data/metadata.csv")

Format data

# Make data into matrix: samples (rows) x features (cols)
data_wide <- t(data %>%
    pivot_wider(
        id_cols = sample, names_from = feature, values_from = LogIntensity_norm
    ) %>% 
    column_to_rownames(var = "sample"))

# Put metadata in same order as count data
metadata <- metadata %>%
    filter(sample %in% colnames(data_wide)) %>%
    mutate(sample_order = factor(sample, levels = colnames(data_wide))) %>%
    arrange(sample_order)

print("These samples will be used for differential expression testing:")
## [1] "These samples will be used for differential expression testing:"
table(metadata$treatment)
## 
## Control_nanobody     GFP_nanobody        Untreated 
##                5                5                5
# Filter low-expression features
mean_per_feature <- apply(
    X = data_wide,
    MARGIN = 1,
    FUN = mean,
    na.rm = TRUE
)
mean_per_feature_pass <- mean_per_feature >= min_mean_for_feature
print(paste(
    "Filtering to", sum(mean_per_feature_pass), "/",
    nrow(data_wide), "features pass mean threshold of",
    min_mean_for_feature
))
## [1] "Filtering to 8057 / 8057 features pass mean threshold of 0"
data_wide <- data_wide[mean_per_feature_pass, ]

# Filter genes quantified in less than N samples in any condition
get_n_quantified <- function(x) {
    return(sum(x > 0, na.rm = TRUE))
}
n_quantified_fail <- unique(data %>%
    group_by(feature, treatment) %>%
    summarize(n_quantified = get_n_quantified(LogIntensity)) %>%
    filter(n_quantified < min_n_per_treatment_for_features) %>%
    pull(feature))
## `summarise()` has grouped output by 'feature'. You can override using the
## `.groups` argument.
n_quantified_pass <- !(rownames(data_wide) %in% n_quantified_fail)
print(paste(
    "Filtering to", sum(n_quantified_pass), "/",
    nrow(data_wide), "genes quantified in more than",
    min_n_per_treatment_for_features, "samples in every treatment"))
## [1] "Filtering to 8057 / 8057 genes quantified in more than 0 samples in every treatment"
data_wide <- data_wide[n_quantified_pass, ]

# Filter genes quantified in less than N samples total
get_n_quantified <- function(x) {
    return(sum(x > 0, na.rm = TRUE))
}
n_quantified_total_fail <- unique(data %>%
    group_by(feature) %>%
    summarize(n_quantified = get_n_quantified(LogIntensity)) %>%
    filter(n_quantified < min_n_total_for_features) %>%
    pull(feature))

n_quantified_total_pass <- !(rownames(data_wide) %in% n_quantified_total_fail)
print(paste(
    "Filtering to", sum(n_quantified_total_pass), "/",
    nrow(data_wide), "genes quantified in more than",
    min_n_total_for_features, "samples total"))
## [1] "Filtering to 8040 / 8057 genes quantified in more than 5 samples total"
data_wide <- data_wide[n_quantified_total_pass, ]

# Check sample order
if (!all(metadata$sample == colnames(data_wide))) {
    stop("Samples out of order!")
}

Fit model

# Fit model (no intercept)
design <- model.matrix(~0 + treatment, data = metadata)
fit <- lmFit(object = data_wide, design = design)
## Warning: Partial NA coefficients for 8 probe(s)
# Specify contrasts
contr <- makeContrasts(
    contrasts = contrasts,
    levels = colnames(coef(fit))
)

fit_contrasts <- contrasts.fit(fit, contr)

# Share information across proteins to smooth standard errors
# via empirical bayes method
fit_contrasts_ebayes <- eBayes(fit_contrasts, trend = TRUE)

# Make diagnostics plots for linear model assumptions
# Residuals variance-mean trend and limma-trend fit used by ebayes
plotSA(fit = fit_contrasts_ebayes, col = c("black", "red"))

Save results to dataframe

contrast_names <- colnames(fit_contrasts_ebayes$p.value)
std_errors <- sqrt(fit_contrasts_ebayes$s2.post) * fit_contrasts_ebayes$stdev.unscaled
is_first <- TRUE
for (i in seq_along(contrast_names)) {
    tmp <- topTable(fit_contrasts_ebayes, number = Inf, coef = i, sort = "none", adjust.method = "bonferroni")
    tmp$feature <- rownames(tmp)
    tmp <- as.data.frame(tmp) %>% mutate(
        contrast = contrast_names[i],
        SE = std_errors[, i],
        df = fit_contrasts_ebayes$df.total
    )
    if (is_first) {
        is_first <- FALSE
        res <- tmp
    } else {
        res <- rbind(res, tmp)
    }
}

res <- res %>% left_join(
    data %>%
        group_by(feature, PG.ProteinGroups, Genes) %>%
        slice_head(n = 1) %>%
        select(feature, PG.ProteinGroups, Genes),
    by = "feature"
) %>% mutate(
    contrast_short = gsub("treatment", "", contrast)
)

write.csv(
    res,
    paste0(OUTDIR, "/differential_abundance.csv"),
    row.names = FALSE
)

Make volcano plot of results

print(paste(sum(res$adj.P.Val < 0.05, na.rm = TRUE), "genes siginficant at alpha=0.05"))
## [1] "10 genes siginficant at alpha=0.05"
print(paste("Lowest p-value is", min(res$adj.P.Val, na.rm = TRUE)))
## [1] "Lowest p-value is 5.63433297344967e-07"
# Make volcano plot
get_pos_suffix <- function(feature) {
    if (grepl("_M", feature)) {
        return(paste0(" ", str_split(feature, "_")[[1]][2]))
    } else {
        return("")
    }
}

to_plot <- res %>%
    filter(!is.na(P.Value)) %>%
    mutate(pos_suffix = unlist(lapply(X = feature, FUN = get_pos_suffix))) %>%
    mutate(gene_label = paste0(Genes, pos_suffix))
ggplot(
    data = to_plot,
    aes(x = logFC, y = -log10(P.Value), shape = adj.P.Val < 0.05, color = adj.P.Val < 0.05)
) +
    geom_point() +
    geom_text_repel(
        data = to_plot %>%
            filter(adj.P.Val < 0.05),
        aes(label = gene_label),
        size = 2,
        max.overlaps = 100,
        min.segment.length = 0.1
    ) +
    geom_text_repel(
        data = to_plot %>%
            filter(adj.P.Val >= 0.05) %>%
            group_by(contrast_short) %>%
            slice_min(order_by = P.Value, n = 20),
        aes(label = gene_label),
        size = 2
    ) +
    theme_bw() +
    scale_shape_manual(values = c(1, 19)) +
    scale_color_manual(values = c("darkgrey", "black")) +
    facet_wrap(. ~ contrast_short, ncol = 1)
## Warning: ggrepel: 20 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 19 unlabeled data points (too many overlaps). Consider increasing max.overlaps
## ggrepel: 19 unlabeled data points (too many overlaps). Consider increasing max.overlaps