Differential abundance for proteomics on cultured neurons
library(edgeR)
## Loading required package: limma
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(tibble)
library(optparse)
library(ggrepel)
## Loading required package: ggplot2
sessionInfo()
## R version 4.4.2 (2024-10-31)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 22.04.4 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
##
## locale:
## [1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8
## [4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8
## [7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C
## [10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Europe/Rome
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ggrepel_0.9.6 ggplot2_3.5.2 optparse_1.7.5 tibble_3.2.1 tidyr_1.3.1
## [6] dplyr_1.1.4 edgeR_4.4.2 limma_3.62.2
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.6 jsonlite_2.0.0 compiler_4.4.2 Rcpp_1.0.14
## [5] tidyselect_1.2.1 jquerylib_0.1.4 scales_1.4.0 fastmap_1.2.0
## [9] statmod_1.5.0 lattice_0.20-45 R6_2.6.1 generics_0.1.4
## [13] knitr_1.50 RColorBrewer_1.1-3 bslib_0.9.0 pillar_1.10.2
## [17] rlang_1.1.6 getopt_1.20.4 cachem_1.1.0 xfun_0.52
## [21] sass_0.4.10 cli_3.6.5 withr_3.0.2 magrittr_2.0.3
## [25] digest_0.6.37 grid_4.4.2 locfit_1.5-9.12 lifecycle_1.0.4
## [29] vctrs_0.6.5 evaluate_1.0.3 glue_1.8.0 farver_2.1.2
## [33] rmarkdown_2.29 purrr_1.0.4 tools_4.4.2 pkgconfig_2.0.3
## [37] htmltools_0.5.8.1
# Create directory for results
OUTDIR <- paste0("for_publication/results")
if (!dir.exists(OUTDIR)) {
dir.create(OUTDIR)
}
Define constants
contrasts <- c(
"treatmentGFP_nanobody-treatmentControl_nanobody",
"treatmentGFP_nanobody-treatmentUntreated",
"treatmentControl_nanobody-treatmentUntreated"
)
min_mean_for_feature <- 0
min_n_per_treatment_for_features <- 0
min_n_total_for_features <- 5
Load data
data <- read.csv("for_publication/data/prot_abundance.csv")
metadata <- read.csv("for_publication/data/metadata.csv")
Format data
# Make data into matrix: samples (rows) x features (cols)
data_wide <- t(data %>%
pivot_wider(
id_cols = sample, names_from = feature, values_from = LogIntensity_norm
) %>%
column_to_rownames(var = "sample"))
# Put metadata in same order as count data
metadata <- metadata %>%
filter(sample %in% colnames(data_wide)) %>%
mutate(sample_order = factor(sample, levels = colnames(data_wide))) %>%
arrange(sample_order)
print("These samples will be used for differential expression testing:")
## [1] "These samples will be used for differential expression testing:"
table(metadata$treatment)
##
## Control_nanobody GFP_nanobody Untreated
## 5 5 5
# Filter low-expression features
mean_per_feature <- apply(
X = data_wide,
MARGIN = 1,
FUN = mean,
na.rm = TRUE
)
mean_per_feature_pass <- mean_per_feature >= min_mean_for_feature
print(paste(
"Filtering to", sum(mean_per_feature_pass), "/",
nrow(data_wide), "features pass mean threshold of",
min_mean_for_feature
))
## [1] "Filtering to 8057 / 8057 features pass mean threshold of 0"
data_wide <- data_wide[mean_per_feature_pass, ]
# Filter genes quantified in less than N samples in any condition
get_n_quantified <- function(x) {
return(sum(x > 0, na.rm = TRUE))
}
n_quantified_fail <- unique(data %>%
group_by(feature, treatment) %>%
summarize(n_quantified = get_n_quantified(LogIntensity)) %>%
filter(n_quantified < min_n_per_treatment_for_features) %>%
pull(feature))
## `summarise()` has grouped output by 'feature'. You can override using the
## `.groups` argument.
n_quantified_pass <- !(rownames(data_wide) %in% n_quantified_fail)
print(paste(
"Filtering to", sum(n_quantified_pass), "/",
nrow(data_wide), "genes quantified in more than",
min_n_per_treatment_for_features, "samples in every treatment"))
## [1] "Filtering to 8057 / 8057 genes quantified in more than 0 samples in every treatment"
data_wide <- data_wide[n_quantified_pass, ]
# Filter genes quantified in less than N samples total
get_n_quantified <- function(x) {
return(sum(x > 0, na.rm = TRUE))
}
n_quantified_total_fail <- unique(data %>%
group_by(feature) %>%
summarize(n_quantified = get_n_quantified(LogIntensity)) %>%
filter(n_quantified < min_n_total_for_features) %>%
pull(feature))
n_quantified_total_pass <- !(rownames(data_wide) %in% n_quantified_total_fail)
print(paste(
"Filtering to", sum(n_quantified_total_pass), "/",
nrow(data_wide), "genes quantified in more than",
min_n_total_for_features, "samples total"))
## [1] "Filtering to 8040 / 8057 genes quantified in more than 5 samples total"
data_wide <- data_wide[n_quantified_total_pass, ]
# Check sample order
if (!all(metadata$sample == colnames(data_wide))) {
stop("Samples out of order!")
}
Fit model
# Fit model (no intercept)
design <- model.matrix(~0 + treatment, data = metadata)
fit <- lmFit(object = data_wide, design = design)
## Warning: Partial NA coefficients for 8 probe(s)
# Specify contrasts
contr <- makeContrasts(
contrasts = contrasts,
levels = colnames(coef(fit))
)
fit_contrasts <- contrasts.fit(fit, contr)
# Share information across proteins to smooth standard errors
# via empirical bayes method
fit_contrasts_ebayes <- eBayes(fit_contrasts, trend = TRUE)
# Make diagnostics plots for linear model assumptions
# Residuals variance-mean trend and limma-trend fit used by ebayes
plotSA(fit = fit_contrasts_ebayes, col = c("black", "red"))

Save results to dataframe
contrast_names <- colnames(fit_contrasts_ebayes$p.value)
std_errors <- sqrt(fit_contrasts_ebayes$s2.post) * fit_contrasts_ebayes$stdev.unscaled
is_first <- TRUE
for (i in seq_along(contrast_names)) {
tmp <- topTable(fit_contrasts_ebayes, number = Inf, coef = i, sort = "none", adjust.method = "bonferroni")
tmp$feature <- rownames(tmp)
tmp <- as.data.frame(tmp) %>% mutate(
contrast = contrast_names[i],
SE = std_errors[, i],
df = fit_contrasts_ebayes$df.total
)
if (is_first) {
is_first <- FALSE
res <- tmp
} else {
res <- rbind(res, tmp)
}
}
res <- res %>% left_join(
data %>%
group_by(feature, PG.ProteinGroups, Genes) %>%
slice_head(n = 1) %>%
select(feature, PG.ProteinGroups, Genes),
by = "feature"
) %>% mutate(
contrast_short = gsub("treatment", "", contrast)
)
write.csv(
res,
paste0(OUTDIR, "/differential_abundance.csv"),
row.names = FALSE
)
Make volcano plot of results
print(paste(sum(res$adj.P.Val < 0.05, na.rm = TRUE), "genes siginficant at alpha=0.05"))
## [1] "10 genes siginficant at alpha=0.05"
print(paste("Lowest p-value is", min(res$adj.P.Val, na.rm = TRUE)))
## [1] "Lowest p-value is 5.63433297344967e-07"
# Make volcano plot
get_pos_suffix <- function(feature) {
if (grepl("_M", feature)) {
return(paste0(" ", str_split(feature, "_")[[1]][2]))
} else {
return("")
}
}
to_plot <- res %>%
filter(!is.na(P.Value)) %>%
mutate(pos_suffix = unlist(lapply(X = feature, FUN = get_pos_suffix))) %>%
mutate(gene_label = paste0(Genes, pos_suffix))
ggplot(
data = to_plot,
aes(x = logFC, y = -log10(P.Value), shape = adj.P.Val < 0.05, color = adj.P.Val < 0.05)
) +
geom_point() +
geom_text_repel(
data = to_plot %>%
filter(adj.P.Val < 0.05),
aes(label = gene_label),
size = 2,
max.overlaps = 100,
min.segment.length = 0.1
) +
geom_text_repel(
data = to_plot %>%
filter(adj.P.Val >= 0.05) %>%
group_by(contrast_short) %>%
slice_min(order_by = P.Value, n = 20),
aes(label = gene_label),
size = 2
) +
theme_bw() +
scale_shape_manual(values = c(1, 19)) +
scale_color_manual(values = c("darkgrey", "black")) +
facet_wrap(. ~ contrast_short, ncol = 1)
## Warning: ggrepel: 20 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 19 unlabeled data points (too many overlaps). Consider increasing max.overlaps
## ggrepel: 19 unlabeled data points (too many overlaps). Consider increasing max.overlaps
