Library and data loading

Working directory set to the Document Directory in Knit

library(vegan)
library(microbiome)
library(phyloseq)
library(ggplot2)
library(RColorBrewer)
library(stringr)
library(ggpubr) # Plot organization, and rremove()
library(naturalsort)
library(ggVennDiagram) # Nice venn diagrams and extract their data
library(cowplot) # To stack plots, plot_grid() fucntion
library(dplyr)
library(tidyr)

sessionInfo()
## R version 4.4.1 (2024-06-14)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 22.04.5 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so;  LAPACK version 3.10.0
## 
## locale:
##  [1] LC_CTYPE=es_ES.UTF-8    LC_NUMERIC=C            LC_TIME=C.UTF-8        
##  [4] LC_COLLATE=es_ES.UTF-8  LC_MONETARY=C.UTF-8     LC_MESSAGES=es_ES.UTF-8
##  [7] LC_PAPER=C.UTF-8        LC_NAME=C               LC_ADDRESS=C           
## [10] LC_TELEPHONE=C          LC_MEASUREMENT=C.UTF-8  LC_IDENTIFICATION=C    
## 
## time zone: Europe/Zurich
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] tidyr_1.3.1         dplyr_1.1.4         cowplot_1.1.3      
##  [4] ggVennDiagram_1.5.2 naturalsort_0.1.3   ggpubr_0.6.0       
##  [7] stringr_1.5.1       RColorBrewer_1.1-3  microbiome_1.26.0  
## [10] ggplot2_3.5.1       phyloseq_1.48.0     vegan_2.6-8        
## [13] lattice_0.22-5      permute_0.9-7      
## 
## loaded via a namespace (and not attached):
##  [1] ade4_1.7-22             tidyselect_1.2.1        Biostrings_2.72.1      
##  [4] fastmap_1.2.0           digest_0.6.37           lifecycle_1.0.4        
##  [7] cluster_2.1.6           survival_3.7-0          magrittr_2.0.3         
## [10] compiler_4.4.1          rlang_1.1.4             sass_0.4.9             
## [13] tools_4.4.1             igraph_2.1.1            utf8_1.2.4             
## [16] yaml_2.3.10             data.table_1.16.2       knitr_1.49             
## [19] ggsignif_0.6.4          plyr_1.8.9              abind_1.4-8            
## [22] Rtsne_0.17              withr_3.0.2             purrr_1.0.2            
## [25] BiocGenerics_0.50.0     grid_4.4.1              stats4_4.4.1           
## [28] fansi_1.0.6             multtest_2.60.0         biomformat_1.32.0      
## [31] colorspace_2.1-1        Rhdf5lib_1.26.0         scales_1.3.0           
## [34] iterators_1.0.14        MASS_7.3-61             cli_3.6.3              
## [37] rmarkdown_2.29          crayon_1.5.3            generics_0.1.3         
## [40] rstudioapi_0.17.1       httr_1.4.7              reshape2_1.4.4         
## [43] ape_5.8                 cachem_1.1.0            rhdf5_2.48.0           
## [46] zlibbioc_1.50.0         splines_4.4.1           parallel_4.4.1         
## [49] XVector_0.44.0          vctrs_0.6.5             Matrix_1.7-1           
## [52] carData_3.0-5           jsonlite_1.8.9          car_3.1-3              
## [55] IRanges_2.38.1          S4Vectors_0.42.1        rstatix_0.7.2          
## [58] Formula_1.2-5           foreach_1.5.2           jquerylib_0.1.4        
## [61] glue_1.8.0              codetools_0.2-19        stringi_1.8.4          
## [64] gtable_0.3.6            GenomeInfoDb_1.40.1     UCSC.utils_1.0.0       
## [67] munsell_0.5.1           tibble_3.2.1            pillar_1.9.0           
## [70] htmltools_0.5.8.1       rhdf5filters_1.16.0     GenomeInfoDbData_1.2.12
## [73] R6_2.5.1                evaluate_1.0.1          Biobase_2.64.0         
## [76] backports_1.5.0         broom_1.0.7             bslib_0.8.0            
## [79] Rcpp_1.0.13-1           nlme_3.1-165            mgcv_1.9-1             
## [82] xfun_0.49               pkgconfig_2.0.3

Data loading

samples_order <- c("Donor","Pre_FMT","M1","M4","M6")

physeq <- readRDS("phyloseq_IBS_FMT.rds") # Phyloseq object

# Extracting metadata 
metadata <- get_variable(physeq) %>%
  mutate(Sample_name_short = factor(Sample_name_short, unname(samples_order)),
         Sample_name_long = factor(Sample_name_long, c("FMT_donor","Pre-FMT","1M-post-FMT","4M-post-FMT","6M-post-FMT")),
         QuantificationQubit_ng_ul = as.numeric(QuantificationQubit_ng_ul))

reads <- read.csv("Reads_IBS_FMT.csv") %>%
  mutate(Sample_name_short = factor(Sample_name_short, unname(samples_order)),
         Reads = factor(Reads, c("Reads processing","Maintained reads", "Tax filtered reads")))

QC

DNA and Reads counts

###########################
####### LIBRARY DNA #######
###########################
libDNA <- ggplot(metadata, aes(x = Sample_name_long, y = QuantificationQubit_ng_ul, fill = seq_name)) +
  geom_col() +
  labs(x= "Sample name",  
       y = expression(paste("DNA quantification [ng/", mu, "l]")),
       title = "A", #Low values of library DNA yield
       subtitle = "Individuals") +
  theme_bw() +
  scale_fill_manual(values = c("#fc8d59","#91bfdb")) +
  theme(text = element_text(size = 11),
        plot.title = element_text(face = "bold"),
        strip.text = element_text(face = "bold", colour = "black"),
        axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5),
        legend.position="right") + # none
    guides(fill=guide_legend(title="Sequence run", byrow=TRUE)) + #nrow=1,
  scale_x_discrete(drop = TRUE)  

###########################
########## READS ##########
###########################

overall_reads_barplot <- 
  ggplot(reads %>% 
           filter(Reads!="Tax filtered reads"),
         aes(x=Sample_name_long, y=Count, fill=Reads)) +
    geom_col() +
    theme_bw() +
    scale_fill_manual(values = c("#a6cee3","#1f78b4","#8dd3c7")) +
    labs(x= "Sample label",  
       y ="Reads",
       title = "B") + #Reads counts overall
       # subtitle = "Individuals") +
  theme(text = element_text(size = 11),
        plot.title = element_text(face = "bold", vjust = -0.5),
        strip.text = element_text(face = "bold", colour = "black"),
        axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 9),
        plot.subtitle = element_text(hjust = 0.5),
        legend.position="right") +
    scale_x_discrete(drop = TRUE) + # Keep all groups, included the ones with values. Alternative : (drop = FALSE)
  # FACET GRID  
    geom_hline(yintercept=150000, col="red", linetype="solid", linewidth=1, alpha=0.3) +
  geom_hline(yintercept=100000, col="red", linetype="solid", linewidth=1, alpha=0.8) 

###########################
####### COMBINATION #######
###########################
p_QC <- plot_grid(libDNA + rremove("x.text") + rremove("x.title"), 
                  overall_reads_barplot, 
                  align = 'v', ncol = 1, axis = 'lr')



# ggsave(filename = "./Results/QC_DNA_Reads.pdf", plot = p_QC, width = 6, height = 6, dpi = 500, ) # chagnged the _Groups _Runs
print(p_QC)

reads %>% filter(Reads!="Tax filtered reads") %>% select(Sample_name_short,Reads,Count) %>%
  pivot_wider(names_from = Reads, values_from =Count ) %>%
  mutate(maintainted_reads_perc = round( (`Maintained reads` / (`Maintained reads` + `Reads processing`)), 2))  %>%
  write.csv("Results/Read_counts.csv", row.names = F)

Rarefaction curves

# REMOVING SINGLETONS
ntaxa(physeq)
## [1] 1242
physeq_filt <- prune_taxa(taxa_sums(physeq) > 1, physeq)
ntaxa(physeq_filt)
## [1] 1238
ntaxa(physeq) - ntaxa(physeq_filt) 
## [1] 4
physeq_filt <- prune_samples(as.character(physeq@sam_data$Sample_name_short),
                             physeq_filt)

# RAREFACTION
tab <- otu_table(physeq_filt)
class(tab) <- "matrix" # as.matrix() will do nothing
## Warning in class(tab) <- "matrix": Configuración de clase(x) a "matrix"
## establece atribuibles a NULL; resultado ya no será un objeto S4
## you get a warning here, but this is what we need to have
tab <- t(tab) # transpose observations to rows

pdf("Results/S1_Rerecurves.pdf", width = 6, height = 4)
rare <- rarecurve(tab, step=1000, lwd=2, ylab="ASVs",  label=T)
# rare
abline(v=150000, col="red")
abline(v=250000, col="blue")
dev.off()
## png 
##   2
# ps.rar2 <- rarefy_even_depth(physeq_filt, rngseed=1, sample.size=250000, replace=F)

Repeated code for visualization

rare <- rarecurve(tab, step=1000, lwd=2, ylab="ASVs",  label=T)
# rare
abline(v=150000, col="red")
abline(v=250000, col="blue")

All samples reach the rarefaction plateau.

ALPHA DIVERSITY (Fig2.A)

Calculating and visualizing alpha diversity at Species level

# TAXA AGGREGATION
taxa_r <- "Species"
taxa.glom <- tax_glom(physeq_filt, taxrank=taxa_r)
physeq.glom.agg <- aggregate_taxa(taxa.glom, taxa_r)

# CALCULATING RICHNESS METRICS
ricorico <- estimate_richness(physeq.glom.agg) %>%
  mutate(Sample_name_short = rownames(.)) %>%
  `rownames<-`( NULL ) %>%
  select(Observed,Chao1, se.chao1, Shannon, Simpson, InvSimpson, Fisher, Sample_name_short) %>%
  pivot_longer(cols = c(Observed,Chao1, se.chao1, Shannon, Simpson, InvSimpson, Fisher),
               names_to = "metric", values_to="value")  %>%
  left_join(metadata, by="Sample_name_short") 


ggplot(ricorico %>% 
         filter(metric %in% c("Observed", "Shannon"))  %>% 
         mutate(individual_kind = case_when(project_name == "DONORS" ~ "FMT donor",
                                            project_name == "TMF" ~ "Recipient")), 
             aes(x=Sample_name_long, y=value, fill=individual_kind)) +
  geom_col() +
  facet_wrap(vars(metric), scales = "free_y")+
  theme_classic()+
  labs(x="Sample name", y="Index value (species level)") +
  scale_fill_manual(values = c("#fc8d59","#91bfdb")) + # #99d594 91bfdb
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) + #, vjust = 0.5))
  guides(fill=guide_legend(title="Sample origin"))

# ggsave("Results/Fig2A_alpha_div_Species.pdf", width = 5, height = 3)

Taxa intersections

Data preparation

Extracting data intersection among all samples

taxa_r <- "Genus"
taxa.glom <- tax_glom(physeq_filt, taxrank=taxa_r)
physeq.glom.agg <- aggregate_taxa(taxa.glom, taxa_r)

genus_counts <- data.frame(physeq.glom.agg@otu_table)
genus_counts <-  genus_counts %>% relocate(naturalsort(names( genus_counts)))
genus_names <- lapply( genus_counts, function(x) rownames( genus_counts)[x>0])

# object for calculating relative abundance
genus_counts_ra <- microbiome::transform(physeq.glom.agg, "compositional")
genus_counts_ra <- data.frame(genus_counts_ra@otu_table) 
genus_counts_ra$taxa <- rownames(genus_counts)
genus_counts <- genus_counts %>% # Reordering the columns
  select(Donor, Pre_FMT, M1, M4, M6) 
# ---
genus_counts_ra_base <- genus_counts_ra %>%
  reshape2::melt(id.vars="taxa")  %>%
  rename(Sample = variable) %>% 
  mutate(tax_sample = paste(taxa, Sample, sep="_")) %>%
  filter(value >0) 

# 1. TAXA PER INTERSECTIONS
############################################################################## 
# Get the taxa names per sample (time point)
## since here we only care about presence absence I use the original genus_counts 
## dataframe with the selected time points.
genus_counts_names <- lapply(genus_counts, function(x) rownames(genus_counts)[x>0])

ven.plot <- Venn(genus_counts_names) # Get the venn diagram for the intersections
data <- process_data(ven.plot) # Extract plot information 
data.venn <- data$regionData # Extract intersections
# Extract individual samples from each intersection
intersections <- lapply(str_split(data.venn$name, "[/]"), function(z){ z[!is.na(z) & z != ""]})
names(intersections) <- data.venn$name
a <- c() # List for intersection names
b <- c() # List for each intersection component
for (name in names(intersections)) {
  # print(name)
  c <- intersections[name]
  a <- append( a, rep(name,length(unname(c[[1]]))) )
  b <- append(b, unname(c[[1]]))
}

# Incorporating these elements into a data frame
df1 <- data.frame(intersection = a,
                  samples_intersects = b)
# Adding taxa of each interaction
df2 <- merge(df1, data.venn[c("name", "item")], by.x="intersection", by.y="name",
      all.x=T) 
# Unnesting the list of taxa into distinct rows:
df2 <- df2 %>% unnest(item)

# Creating taxa_sample column in the intersections dataframe to then merge it with the actual intervals count
df2$tax_sample <- paste(df2$item, df2$samples_intersects, sep="_")


# 2) Extracting ASVs interval/counts of each intercept per group
####################################################################################
# Creating taxa_sample column to then merge with the other one
genus_counts_ra_res <- genus_counts_ra_base %>%
  left_join(df2 %>% select(c(intersection,tax_sample)),by="tax_sample")

Venn diagram (Fig2.B)

Absence-presence visualziation with a Venn Diagram

ggVennDiagram( genus_names) + 
  scale_fill_distiller(palette = "Spectral") 

ggsave(paste0("Results/Fig2B_Venn_",taxa_r,".pdf"), height = 7, width = 7)

Proportion per sample intersection (Fig2.C)

Proportion of each sample intersection for each sample

# Palette
colours_gradient <- c("white","#FFFFD9","#EDF8B1", "#C7E9B4", "#7FCDBB", "#41B6C4",
                      "#1D91C0", "#225EA8", "#253494", "#081D58")


###### Visualization more tidy
A <- c("Donor/Pre_FMT/M1/M4/M6","Pre_FMT/M1/M4/M6" )
B <- c("Pre_FMT/M4/M6","Pre_FMT/M1","Pre_FMT","Pre_FMT/M1/M4","Pre_FMT/M6","Pre_FMT/M4","Pre_FMT/M1/M6")
C <- c("Donor/Pre_FMT/M1","Donor/Pre_FMT/M4/M6","Donor/Pre_FMT/M4","Donor/Pre_FMT/M1/M4","Donor/Pre_FMT","Donor/Pre_FMT/M6","Donor/Pre_FMT/M1/M6")
D <- c("Donor","Donor/M1/M6","Donor/M1","Donor/M1/M4/M6","Donor/M4/M6","Donor/M1/M4","Donor/M4")
E <- c("M1", "M1/M6","M1/M4/M6","M1/M4","M4/M6","M4","M6")

genus_counts_ra_res_intersections <- genus_counts_ra_res %>%
         mutate(value = value*100,
                value = round(value, 3)) %>%
         group_by(intersection, Sample) %>%
         summarise(Sum = sum(value)) %>%
         mutate(Sample = factor(Sample, c("Donor","Pre_FMT","M1","M4","M6"))) %>%
  mutate(category = case_when(intersection %in% A ~ "Cores",
                              intersection %in% B ~ "Cores variation",
                              intersection %in% C ~ "Patient variation",
                              intersection %in% D ~ "Engraftment var.",
                              intersection %in% E ~ "Acquisition"),
         category = factor(category, c("Cores", "Cores variation", "Patient variation", "Engraftment var.", "Acquisition")))
## `summarise()` has grouped output by 'intersection'. You can override using the
## `.groups` argument.
ggplot(genus_counts_ra_res_intersections, 
       aes(x = Sample, y = intersection, fill=Sum)) +
  geom_tile(color = "white") +
  scale_fill_gradientn(colours = colours_gradient) + #YGreens
  geom_text(aes(Sample, intersection, label = ifelse(Sum>0, Sum, ".")), size = 2) +
  labs(x="Sample", y="Intersection",
       title="Relative abundance of the taxa in each intersection per sample") +
  facet_grid(category ~ ., scales="free", space="free") +
  theme_classic() 

# ggsave("Results/Fig2C_Intersections_categories_genus_relative_abundance.pdf", width = 4.7, height=6)    

Per sample: Genus relative abundance and log-fold change (Fig3)

Relative abundance of the common genera and their longitudinal changes

######## Taxa selection: at any time or sample >1%.
norare_taxa <- genus_counts_ra_res %>% 
  mutate(value = value*100)  %>% 
  filter(value > 1)  %>% 
  pull(taxa) %>% 
  unique()

# FOLD CHANGE LABELING
fc_taxa <- genus_counts_ra_res %>%
  filter(taxa %in% norare_taxa) %>% 
  pivot_wider(id_cols = taxa, names_from = Sample, values_from = value, values_fill = 0) %>%
  mutate(FC2 = log2(Pre_FMT / M6)) %>%
  filter(!is.infinite(FC2)) %>%
  filter(!is.na(FC2)) %>%
  filter(abs(FC2) > 1) %>%
  pull(taxa)

#######
## Taxa boosted, reduced, or stable after FMT. Non-core taxa remain in black
remodeled_fmt <- genus_counts_ra_res %>%
  filter(intersection == "Donor/Pre_FMT/M1/M4/M6") %>%
  filter(taxa %in% fc_taxa) %>%
  select(taxa, Sample, value) %>%
  pivot_wider(id_cols=taxa, names_from = Sample, values_from = value) %>%
  mutate(Remodeled_fmt = case_when((Donor > Pre_FMT) & (M6 > Pre_FMT) ~ "Boosted_by_FMT",
                                   (Pre_FMT > Donor) & (M6 > Pre_FMT) ~ "Increased_after_FMT",
                                   (Pre_FMT > Donor) & (Pre_FMT > M6) ~ "Reduced_by_FMT",
                                   TRUE ~ "Stable"))  %>%
  select(taxa, Remodeled_fmt)

## Ordering taxa by abundance 
taxa_order <- genus_counts_ra_res %>% 
  dplyr::arrange(intersection, desc(value)) %>% pull(taxa) %>% unique()



## Adding colours
intersect_colours <- genus_counts_ra_res %>%
  filter(taxa %in% norare_taxa) %>% 
  left_join(remodeled_fmt, by="taxa") %>%
  mutate(colours = case_when(Remodeled_fmt == "Increased_after_FMT" ~ "#8da0cb",
                             Remodeled_fmt == "Boosted_by_FMT" ~ "#66c2a5",
                             Remodeled_fmt == "Reduced_by_FMT" ~ "#fc8d62",
                             TRUE ~ "black")) %>%
  select(taxa, Remodeled_fmt, colours) %>%
  distinct() %>%
   mutate(taxa = factor(taxa, taxa_order))  %>%
  arrange(taxa)

# PREPARING THE DATA FRAME FOR VISUALIZATION
GENUS_RA_intersect_vis <- genus_counts_ra_res %>%
  filter(taxa %in% norare_taxa) %>% 
  mutate(value = value*100,
         value = round(value, 2)) %>%
  mutate(taxa = factor(taxa, taxa_order))  %>%
  mutate(Sample = factor(Sample, c("Pre_FMT","Donor","M1","M4","M6")))

# PLOT
ggplot(GENUS_RA_intersect_vis, 
  aes(x = Sample, y = taxa, fill =value)) +
  geom_tile(color = "white") +
  scale_fill_gradientn(colours = brewer.pal(n = 9, name = "YlGnBu")) + # #
  geom_text(aes(Sample, taxa, label = value), size = 2) +
  labs(x="Sample", y="Genus", # Genus Phylum
       title="Relative abundance of taxa per sample") +
  theme_classic() +
  theme(axis.text.y = element_text(color = intersect_colours$colours)) #, size = 16
## Warning: Vectorized input to `element_text()` is not officially supported.
## ℹ Results may be unexpected or may change in future versions of ggplot2.

# ggsave("Results/Fig3_Taxa_Genus_RA_Filtered_FoldChange.pdf", width = 5, height=7) 

Bacteroides species

Relative abundance per sample of the Bacteroides gatekeeper species

taxa_rank <- "Species"
taxa.glom <- tax_glom(physeq_filt, taxrank=taxa_rank)
physeq.glom.agg <- aggregate_taxa(taxa.glom, taxa_rank)
physeq.glom.agg.ra <- microbiome::transform(physeq.glom.agg, "compositional")

species.ra <- data.frame(physeq.glom.agg.ra@otu_table) 

bacteroides.gatekeepers.ra <- species.ra %>% 
  tibble::rownames_to_column(var = "Species") %>% 
  filter(Species %in% c("Bacteroides uniformis","Bacteroides vulgatus")) %>%
  pivot_longer(cols = 2:(dim(.)[2]), names_to = "Sample") %>%
  mutate(Sample = factor(Sample, c("Pre_FMT", "Donor", "M1", "M4", "M6"))) 


ggplot(data = bacteroides.gatekeepers.ra,
     aes(x=Sample, 
         y=value*100,
         fill=Species)) +
  geom_bar(position="stack", stat="identity") +
  theme_classic() +
  scale_y_continuous(expand = c(0, 0)) + #limits = c(0,5), 
  labs(y = "Relative abundance (%)", x="Sample") +
  scale_fill_manual(values=c("#9E499C","#F7CF46"))

# ggsave("Results/Fig3_add_Bacteroides_gatekeepers_relative_abundance.pdf", width = 4, height = 3)