Working directory set to the Document Directory in Knit
library(vegan)
library(microbiome)
library(phyloseq)
library(ggplot2)
library(RColorBrewer)
library(stringr)
library(ggpubr) # Plot organization, and rremove()
library(naturalsort)
library(ggVennDiagram) # Nice venn diagrams and extract their data
library(cowplot) # To stack plots, plot_grid() fucntion
library(dplyr)
library(tidyr)
sessionInfo()
## R version 4.4.1 (2024-06-14)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 22.04.5 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so; LAPACK version 3.10.0
##
## locale:
## [1] LC_CTYPE=es_ES.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8
## [4] LC_COLLATE=es_ES.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=es_ES.UTF-8
## [7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C
## [10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Europe/Zurich
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] tidyr_1.3.1 dplyr_1.1.4 cowplot_1.1.3
## [4] ggVennDiagram_1.5.2 naturalsort_0.1.3 ggpubr_0.6.0
## [7] stringr_1.5.1 RColorBrewer_1.1-3 microbiome_1.26.0
## [10] ggplot2_3.5.1 phyloseq_1.48.0 vegan_2.6-8
## [13] lattice_0.22-5 permute_0.9-7
##
## loaded via a namespace (and not attached):
## [1] ade4_1.7-22 tidyselect_1.2.1 Biostrings_2.72.1
## [4] fastmap_1.2.0 digest_0.6.37 lifecycle_1.0.4
## [7] cluster_2.1.6 survival_3.7-0 magrittr_2.0.3
## [10] compiler_4.4.1 rlang_1.1.4 sass_0.4.9
## [13] tools_4.4.1 igraph_2.1.1 utf8_1.2.4
## [16] yaml_2.3.10 data.table_1.16.2 knitr_1.49
## [19] ggsignif_0.6.4 plyr_1.8.9 abind_1.4-8
## [22] Rtsne_0.17 withr_3.0.2 purrr_1.0.2
## [25] BiocGenerics_0.50.0 grid_4.4.1 stats4_4.4.1
## [28] fansi_1.0.6 multtest_2.60.0 biomformat_1.32.0
## [31] colorspace_2.1-1 Rhdf5lib_1.26.0 scales_1.3.0
## [34] iterators_1.0.14 MASS_7.3-61 cli_3.6.3
## [37] rmarkdown_2.29 crayon_1.5.3 generics_0.1.3
## [40] rstudioapi_0.17.1 httr_1.4.7 reshape2_1.4.4
## [43] ape_5.8 cachem_1.1.0 rhdf5_2.48.0
## [46] zlibbioc_1.50.0 splines_4.4.1 parallel_4.4.1
## [49] XVector_0.44.0 vctrs_0.6.5 Matrix_1.7-1
## [52] carData_3.0-5 jsonlite_1.8.9 car_3.1-3
## [55] IRanges_2.38.1 S4Vectors_0.42.1 rstatix_0.7.2
## [58] Formula_1.2-5 foreach_1.5.2 jquerylib_0.1.4
## [61] glue_1.8.0 codetools_0.2-19 stringi_1.8.4
## [64] gtable_0.3.6 GenomeInfoDb_1.40.1 UCSC.utils_1.0.0
## [67] munsell_0.5.1 tibble_3.2.1 pillar_1.9.0
## [70] htmltools_0.5.8.1 rhdf5filters_1.16.0 GenomeInfoDbData_1.2.12
## [73] R6_2.5.1 evaluate_1.0.1 Biobase_2.64.0
## [76] backports_1.5.0 broom_1.0.7 bslib_0.8.0
## [79] Rcpp_1.0.13-1 nlme_3.1-165 mgcv_1.9-1
## [82] xfun_0.49 pkgconfig_2.0.3
Data loading
samples_order <- c("Donor","Pre_FMT","M1","M4","M6")
physeq <- readRDS("phyloseq_IBS_FMT.rds") # Phyloseq object
# Extracting metadata
metadata <- get_variable(physeq) %>%
mutate(Sample_name_short = factor(Sample_name_short, unname(samples_order)),
Sample_name_long = factor(Sample_name_long, c("FMT_donor","Pre-FMT","1M-post-FMT","4M-post-FMT","6M-post-FMT")),
QuantificationQubit_ng_ul = as.numeric(QuantificationQubit_ng_ul))
reads <- read.csv("Reads_IBS_FMT.csv") %>%
mutate(Sample_name_short = factor(Sample_name_short, unname(samples_order)),
Reads = factor(Reads, c("Reads processing","Maintained reads", "Tax filtered reads")))
DNA and Reads counts
###########################
####### LIBRARY DNA #######
###########################
libDNA <- ggplot(metadata, aes(x = Sample_name_long, y = QuantificationQubit_ng_ul, fill = seq_name)) +
geom_col() +
labs(x= "Sample name",
y = expression(paste("DNA quantification [ng/", mu, "l]")),
title = "A", #Low values of library DNA yield
subtitle = "Individuals") +
theme_bw() +
scale_fill_manual(values = c("#fc8d59","#91bfdb")) +
theme(text = element_text(size = 11),
plot.title = element_text(face = "bold"),
strip.text = element_text(face = "bold", colour = "black"),
axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
plot.subtitle = element_text(hjust = 0.5),
legend.position="right") + # none
guides(fill=guide_legend(title="Sequence run", byrow=TRUE)) + #nrow=1,
scale_x_discrete(drop = TRUE)
###########################
########## READS ##########
###########################
overall_reads_barplot <-
ggplot(reads %>%
filter(Reads!="Tax filtered reads"),
aes(x=Sample_name_long, y=Count, fill=Reads)) +
geom_col() +
theme_bw() +
scale_fill_manual(values = c("#a6cee3","#1f78b4","#8dd3c7")) +
labs(x= "Sample label",
y ="Reads",
title = "B") + #Reads counts overall
# subtitle = "Individuals") +
theme(text = element_text(size = 11),
plot.title = element_text(face = "bold", vjust = -0.5),
strip.text = element_text(face = "bold", colour = "black"),
axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 9),
plot.subtitle = element_text(hjust = 0.5),
legend.position="right") +
scale_x_discrete(drop = TRUE) + # Keep all groups, included the ones with values. Alternative : (drop = FALSE)
# FACET GRID
geom_hline(yintercept=150000, col="red", linetype="solid", linewidth=1, alpha=0.3) +
geom_hline(yintercept=100000, col="red", linetype="solid", linewidth=1, alpha=0.8)
###########################
####### COMBINATION #######
###########################
p_QC <- plot_grid(libDNA + rremove("x.text") + rremove("x.title"),
overall_reads_barplot,
align = 'v', ncol = 1, axis = 'lr')
# ggsave(filename = "./Results/QC_DNA_Reads.pdf", plot = p_QC, width = 6, height = 6, dpi = 500, ) # chagnged the _Groups _Runs
print(p_QC)
reads %>% filter(Reads!="Tax filtered reads") %>% select(Sample_name_short,Reads,Count) %>%
pivot_wider(names_from = Reads, values_from =Count ) %>%
mutate(maintainted_reads_perc = round( (`Maintained reads` / (`Maintained reads` + `Reads processing`)), 2)) %>%
write.csv("Results/Read_counts.csv", row.names = F)
# REMOVING SINGLETONS
ntaxa(physeq)
## [1] 1242
physeq_filt <- prune_taxa(taxa_sums(physeq) > 1, physeq)
ntaxa(physeq_filt)
## [1] 1238
ntaxa(physeq) - ntaxa(physeq_filt)
## [1] 4
physeq_filt <- prune_samples(as.character(physeq@sam_data$Sample_name_short),
physeq_filt)
# RAREFACTION
tab <- otu_table(physeq_filt)
class(tab) <- "matrix" # as.matrix() will do nothing
## Warning in class(tab) <- "matrix": Configuración de clase(x) a "matrix"
## establece atribuibles a NULL; resultado ya no será un objeto S4
## you get a warning here, but this is what we need to have
tab <- t(tab) # transpose observations to rows
pdf("Results/S1_Rerecurves.pdf", width = 6, height = 4)
rare <- rarecurve(tab, step=1000, lwd=2, ylab="ASVs", label=T)
# rare
abline(v=150000, col="red")
abline(v=250000, col="blue")
dev.off()
## png
## 2
# ps.rar2 <- rarefy_even_depth(physeq_filt, rngseed=1, sample.size=250000, replace=F)
Repeated code for visualization
rare <- rarecurve(tab, step=1000, lwd=2, ylab="ASVs", label=T)
# rare
abline(v=150000, col="red")
abline(v=250000, col="blue")
All samples reach the rarefaction plateau.
Calculating and visualizing alpha diversity at Species level
# TAXA AGGREGATION
taxa_r <- "Species"
taxa.glom <- tax_glom(physeq_filt, taxrank=taxa_r)
physeq.glom.agg <- aggregate_taxa(taxa.glom, taxa_r)
# CALCULATING RICHNESS METRICS
ricorico <- estimate_richness(physeq.glom.agg) %>%
mutate(Sample_name_short = rownames(.)) %>%
`rownames<-`( NULL ) %>%
select(Observed,Chao1, se.chao1, Shannon, Simpson, InvSimpson, Fisher, Sample_name_short) %>%
pivot_longer(cols = c(Observed,Chao1, se.chao1, Shannon, Simpson, InvSimpson, Fisher),
names_to = "metric", values_to="value") %>%
left_join(metadata, by="Sample_name_short")
ggplot(ricorico %>%
filter(metric %in% c("Observed", "Shannon")) %>%
mutate(individual_kind = case_when(project_name == "DONORS" ~ "FMT donor",
project_name == "TMF" ~ "Recipient")),
aes(x=Sample_name_long, y=value, fill=individual_kind)) +
geom_col() +
facet_wrap(vars(metric), scales = "free_y")+
theme_classic()+
labs(x="Sample name", y="Index value (species level)") +
scale_fill_manual(values = c("#fc8d59","#91bfdb")) + # #99d594 91bfdb
theme(axis.text.x = element_text(angle = 45, hjust = 1)) + #, vjust = 0.5))
guides(fill=guide_legend(title="Sample origin"))
# ggsave("Results/Fig2A_alpha_div_Species.pdf", width = 5, height = 3)
Extracting data intersection among all samples
taxa_r <- "Genus"
taxa.glom <- tax_glom(physeq_filt, taxrank=taxa_r)
physeq.glom.agg <- aggregate_taxa(taxa.glom, taxa_r)
genus_counts <- data.frame(physeq.glom.agg@otu_table)
genus_counts <- genus_counts %>% relocate(naturalsort(names( genus_counts)))
genus_names <- lapply( genus_counts, function(x) rownames( genus_counts)[x>0])
# object for calculating relative abundance
genus_counts_ra <- microbiome::transform(physeq.glom.agg, "compositional")
genus_counts_ra <- data.frame(genus_counts_ra@otu_table)
genus_counts_ra$taxa <- rownames(genus_counts)
genus_counts <- genus_counts %>% # Reordering the columns
select(Donor, Pre_FMT, M1, M4, M6)
# ---
genus_counts_ra_base <- genus_counts_ra %>%
reshape2::melt(id.vars="taxa") %>%
rename(Sample = variable) %>%
mutate(tax_sample = paste(taxa, Sample, sep="_")) %>%
filter(value >0)
# 1. TAXA PER INTERSECTIONS
##############################################################################
# Get the taxa names per sample (time point)
## since here we only care about presence absence I use the original genus_counts
## dataframe with the selected time points.
genus_counts_names <- lapply(genus_counts, function(x) rownames(genus_counts)[x>0])
ven.plot <- Venn(genus_counts_names) # Get the venn diagram for the intersections
data <- process_data(ven.plot) # Extract plot information
data.venn <- data$regionData # Extract intersections
# Extract individual samples from each intersection
intersections <- lapply(str_split(data.venn$name, "[/]"), function(z){ z[!is.na(z) & z != ""]})
names(intersections) <- data.venn$name
a <- c() # List for intersection names
b <- c() # List for each intersection component
for (name in names(intersections)) {
# print(name)
c <- intersections[name]
a <- append( a, rep(name,length(unname(c[[1]]))) )
b <- append(b, unname(c[[1]]))
}
# Incorporating these elements into a data frame
df1 <- data.frame(intersection = a,
samples_intersects = b)
# Adding taxa of each interaction
df2 <- merge(df1, data.venn[c("name", "item")], by.x="intersection", by.y="name",
all.x=T)
# Unnesting the list of taxa into distinct rows:
df2 <- df2 %>% unnest(item)
# Creating taxa_sample column in the intersections dataframe to then merge it with the actual intervals count
df2$tax_sample <- paste(df2$item, df2$samples_intersects, sep="_")
# 2) Extracting ASVs interval/counts of each intercept per group
####################################################################################
# Creating taxa_sample column to then merge with the other one
genus_counts_ra_res <- genus_counts_ra_base %>%
left_join(df2 %>% select(c(intersection,tax_sample)),by="tax_sample")
Absence-presence visualziation with a Venn Diagram
ggVennDiagram( genus_names) +
scale_fill_distiller(palette = "Spectral")
ggsave(paste0("Results/Fig2B_Venn_",taxa_r,".pdf"), height = 7, width = 7)
Proportion of each sample intersection for each sample
# Palette
colours_gradient <- c("white","#FFFFD9","#EDF8B1", "#C7E9B4", "#7FCDBB", "#41B6C4",
"#1D91C0", "#225EA8", "#253494", "#081D58")
###### Visualization more tidy
A <- c("Donor/Pre_FMT/M1/M4/M6","Pre_FMT/M1/M4/M6" )
B <- c("Pre_FMT/M4/M6","Pre_FMT/M1","Pre_FMT","Pre_FMT/M1/M4","Pre_FMT/M6","Pre_FMT/M4","Pre_FMT/M1/M6")
C <- c("Donor/Pre_FMT/M1","Donor/Pre_FMT/M4/M6","Donor/Pre_FMT/M4","Donor/Pre_FMT/M1/M4","Donor/Pre_FMT","Donor/Pre_FMT/M6","Donor/Pre_FMT/M1/M6")
D <- c("Donor","Donor/M1/M6","Donor/M1","Donor/M1/M4/M6","Donor/M4/M6","Donor/M1/M4","Donor/M4")
E <- c("M1", "M1/M6","M1/M4/M6","M1/M4","M4/M6","M4","M6")
genus_counts_ra_res_intersections <- genus_counts_ra_res %>%
mutate(value = value*100,
value = round(value, 3)) %>%
group_by(intersection, Sample) %>%
summarise(Sum = sum(value)) %>%
mutate(Sample = factor(Sample, c("Donor","Pre_FMT","M1","M4","M6"))) %>%
mutate(category = case_when(intersection %in% A ~ "Cores",
intersection %in% B ~ "Cores variation",
intersection %in% C ~ "Patient variation",
intersection %in% D ~ "Engraftment var.",
intersection %in% E ~ "Acquisition"),
category = factor(category, c("Cores", "Cores variation", "Patient variation", "Engraftment var.", "Acquisition")))
## `summarise()` has grouped output by 'intersection'. You can override using the
## `.groups` argument.
ggplot(genus_counts_ra_res_intersections,
aes(x = Sample, y = intersection, fill=Sum)) +
geom_tile(color = "white") +
scale_fill_gradientn(colours = colours_gradient) + #YGreens
geom_text(aes(Sample, intersection, label = ifelse(Sum>0, Sum, ".")), size = 2) +
labs(x="Sample", y="Intersection",
title="Relative abundance of the taxa in each intersection per sample") +
facet_grid(category ~ ., scales="free", space="free") +
theme_classic()
# ggsave("Results/Fig2C_Intersections_categories_genus_relative_abundance.pdf", width = 4.7, height=6)
Relative abundance of the common genera and their longitudinal changes
######## Taxa selection: at any time or sample >1%.
norare_taxa <- genus_counts_ra_res %>%
mutate(value = value*100) %>%
filter(value > 1) %>%
pull(taxa) %>%
unique()
# FOLD CHANGE LABELING
fc_taxa <- genus_counts_ra_res %>%
filter(taxa %in% norare_taxa) %>%
pivot_wider(id_cols = taxa, names_from = Sample, values_from = value, values_fill = 0) %>%
mutate(FC2 = log2(Pre_FMT / M6)) %>%
filter(!is.infinite(FC2)) %>%
filter(!is.na(FC2)) %>%
filter(abs(FC2) > 1) %>%
pull(taxa)
#######
## Taxa boosted, reduced, or stable after FMT. Non-core taxa remain in black
remodeled_fmt <- genus_counts_ra_res %>%
filter(intersection == "Donor/Pre_FMT/M1/M4/M6") %>%
filter(taxa %in% fc_taxa) %>%
select(taxa, Sample, value) %>%
pivot_wider(id_cols=taxa, names_from = Sample, values_from = value) %>%
mutate(Remodeled_fmt = case_when((Donor > Pre_FMT) & (M6 > Pre_FMT) ~ "Boosted_by_FMT",
(Pre_FMT > Donor) & (M6 > Pre_FMT) ~ "Increased_after_FMT",
(Pre_FMT > Donor) & (Pre_FMT > M6) ~ "Reduced_by_FMT",
TRUE ~ "Stable")) %>%
select(taxa, Remodeled_fmt)
## Ordering taxa by abundance
taxa_order <- genus_counts_ra_res %>%
dplyr::arrange(intersection, desc(value)) %>% pull(taxa) %>% unique()
## Adding colours
intersect_colours <- genus_counts_ra_res %>%
filter(taxa %in% norare_taxa) %>%
left_join(remodeled_fmt, by="taxa") %>%
mutate(colours = case_when(Remodeled_fmt == "Increased_after_FMT" ~ "#8da0cb",
Remodeled_fmt == "Boosted_by_FMT" ~ "#66c2a5",
Remodeled_fmt == "Reduced_by_FMT" ~ "#fc8d62",
TRUE ~ "black")) %>%
select(taxa, Remodeled_fmt, colours) %>%
distinct() %>%
mutate(taxa = factor(taxa, taxa_order)) %>%
arrange(taxa)
# PREPARING THE DATA FRAME FOR VISUALIZATION
GENUS_RA_intersect_vis <- genus_counts_ra_res %>%
filter(taxa %in% norare_taxa) %>%
mutate(value = value*100,
value = round(value, 2)) %>%
mutate(taxa = factor(taxa, taxa_order)) %>%
mutate(Sample = factor(Sample, c("Pre_FMT","Donor","M1","M4","M6")))
# PLOT
ggplot(GENUS_RA_intersect_vis,
aes(x = Sample, y = taxa, fill =value)) +
geom_tile(color = "white") +
scale_fill_gradientn(colours = brewer.pal(n = 9, name = "YlGnBu")) + # #
geom_text(aes(Sample, taxa, label = value), size = 2) +
labs(x="Sample", y="Genus", # Genus Phylum
title="Relative abundance of taxa per sample") +
theme_classic() +
theme(axis.text.y = element_text(color = intersect_colours$colours)) #, size = 16
## Warning: Vectorized input to `element_text()` is not officially supported.
## ℹ Results may be unexpected or may change in future versions of ggplot2.
# ggsave("Results/Fig3_Taxa_Genus_RA_Filtered_FoldChange.pdf", width = 5, height=7)
Relative abundance per sample of the Bacteroides gatekeeper species
taxa_rank <- "Species"
taxa.glom <- tax_glom(physeq_filt, taxrank=taxa_rank)
physeq.glom.agg <- aggregate_taxa(taxa.glom, taxa_rank)
physeq.glom.agg.ra <- microbiome::transform(physeq.glom.agg, "compositional")
species.ra <- data.frame(physeq.glom.agg.ra@otu_table)
bacteroides.gatekeepers.ra <- species.ra %>%
tibble::rownames_to_column(var = "Species") %>%
filter(Species %in% c("Bacteroides uniformis","Bacteroides vulgatus")) %>%
pivot_longer(cols = 2:(dim(.)[2]), names_to = "Sample") %>%
mutate(Sample = factor(Sample, c("Pre_FMT", "Donor", "M1", "M4", "M6")))
ggplot(data = bacteroides.gatekeepers.ra,
aes(x=Sample,
y=value*100,
fill=Species)) +
geom_bar(position="stack", stat="identity") +
theme_classic() +
scale_y_continuous(expand = c(0, 0)) + #limits = c(0,5),
labs(y = "Relative abundance (%)", x="Sample") +
scale_fill_manual(values=c("#9E499C","#F7CF46"))
# ggsave("Results/Fig3_add_Bacteroides_gatekeepers_relative_abundance.pdf", width = 4, height = 3)