library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning in as.POSIXlt.POSIXct(Sys.time()): unknown timezone 'zone/tz/2019c.1.0/
## zoneinfo/America/New_York'
library(reshape2)
## Warning: package 'reshape2' was built under R version 3.4.3
library(stringr)
## Warning: package 'stringr' was built under R version 3.4.4
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:reshape2':
##
## smiths
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ tibble 2.1.3 ✓ purrr 0.3.3
## ✓ readr 1.3.1 ✓ forcats 0.4.0
## Warning: package 'readr' was built under R version 3.4.4
## Warning: package 'forcats' was built under R version 3.4.4
## ── Conflicts ─────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
options(scipen=999)
DATA <- "../data"
# Read Inputs
sra_metadata <- read.delim(paste0(DATA,"/lactobacillus-results.txt")) %>% distinct(experiment_accession, .keep_all=TRUE)
sra_metadata$sample <- sra_metadata$experiment_accession
sequence_metrics = read.delim(paste0(DATA,"/summary/lactobacillus-report.txt"))
full_metrics = merge(sequence_metrics, sra_metadata, by='sample', all.x = TRUE)
full_metrics$year <- as.data.frame(select(full_metrics, first_public) %>% separate(first_public, into=c('year'), extra='drop'))$year
phyloflash <- read.delim(paste0(DATA,"/phyloflash/phyloflash-summary.txt"))
gtdb <- read.delim(paste0(DATA,"/gtdb/gtdbtk.summary.tsv"))
exclude <- read.delim(paste0(DATA,"/summary/lactobacillus-exclude.txt"))
#' write_plot
#'
#' A wrapper for to validate given vector is multiple ids and proper type. This
#' function should not be directly used by the user.
#'
#' @param plot_object A ggplot object
#' @param name Basename for the output PDF and PNG files
#' @param height The PDF height of the output (Default: 5)
#' @param width The PDF width of the object (Default: 12)
#'
#' @export
#' @return bool TRUE is multiple ids else FALSE.
write_plot <- function(plot_object, name, height = 5, width = 12) {
pdf(paste0(name, ".pdf"), width=width, height=height)
print(plot_object)
dev_null <- dev.off()
png(paste0(name, ".png"), width=width*100, height=height*100)
print(plot_object)
dev_null <- dev.off()
}
Technologies
table(sra_metadata$instrument_platform)
##
## ILLUMINA ION_TORRENT LS454 OXFORD_NANOPORE PACBIO_SMRT
## 1664 58 190 6 79
By Year Plots
Submissions Per Year
submissions <- select(full_metrics, year) %>% count(year)
p <- ggplot(data=submissions, aes(x=year, y=n)) +
xlab("Year") +
ylab("Count") +
geom_bar(stat='identity') +
geom_text(aes(label=n), vjust = -0.5) +
scale_x_discrete(breaks = round(
seq(min(submissions$year), max(submissions$year), by = 1),1
)) +
theme_bw() +
theme(axis.text=element_text(size=12),
axis.title=element_text(size=14,face="bold"))
p

Sequence Summary
Ranks
table(sequence_metrics$rank)
##
## bronze exclude gold silver
## 205 48 967 386
By Rank Table
Gold
sequence_metrics %>%
filter(rank == 'gold') %>%
select(original_coverage, final_coverage, final_qual_mean, final_read_mean, total_contig) %>%
map(summary)
## $original_coverage
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 101.4 160.2 207.4 258.7 301.7 1929.4
##
## $final_coverage
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 100 100 100 100 100 100
##
## $final_qual_mean
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 30.02 34.40 35.12 35.13 36.02 38.58
##
## $final_read_mean
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 95.40 99.88 99.96 127.22 143.04 299.18
##
## $total_contig
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6.00 30.00 52.00 51.18 68.50 100.00
summary(sequence_metrics[sequence_metrics$rank == 'gold',]$total_contig_length/sequence_metrics[sequence_metrics$rank == 'gold',]$estimated_genome_size)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.3656 0.8286 0.9208 0.8776 0.9646 1.0748
Silver
sequence_metrics %>%
filter(rank == 'silver') %>%
select(original_coverage, final_coverage, final_qual_mean, final_read_mean, total_contig) %>%
map(summary)
## $original_coverage
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 51.43 86.32 137.32 166.55 196.13 1413.52
##
## $final_coverage
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 50.95 82.41 100.00 90.72 100.00 100.00
##
## $final_qual_mean
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 27.56 33.83 34.76 34.75 35.85 38.17
##
## $final_read_mean
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 87.31 99.91 99.97 128.84 142.04 298.31
##
## $total_contig
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 8.0 81.0 110.0 105.3 133.0 200.0
summary(sequence_metrics[sequence_metrics$rank == 'silver',]$total_contig_length/sequence_metrics[sequence_metrics$rank == 'silver',]$estimated_genome_size)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.2116 0.8023 0.9258 0.8829 0.9930 1.0586
Bronze
sequence_metrics %>%
filter(rank == 'bronze') %>%
select(original_coverage, final_coverage, final_qual_mean, final_read_mean, total_contig) %>%
map(summary)
## $original_coverage
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 21.83 42.31 102.39 175.16 187.07 1810.33
##
## $final_coverage
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 21.02 38.85 100.00 72.08 100.00 100.00
##
## $final_qual_mean
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 29.28 32.61 33.92 34.10 35.47 38.07
##
## $final_read_mean
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 49.90 94.23 99.94 149.83 219.96 305.94
##
## $total_contig
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 11.0 63.0 90.0 122.5 192.0 481.0
summary(sequence_metrics[sequence_metrics$rank == 'silver',]$total_contig_length/sequence_metrics[sequence_metrics$rank == 'silver',]$estimated_genome_size)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.2116 0.8023 0.9258 0.8829 0.9930 1.0586
Exclude
sequence_metrics %>%
filter(rank == 'exclude') %>%
select(original_coverage, final_coverage, final_qual_mean, final_read_mean, total_contig) %>%
map(summary)
## $original_coverage
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 9.295 18.451 25.736 73.629 95.629 478.288
##
## $final_coverage
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 9.187 17.451 22.476 43.736 77.461 100.000
##
## $final_qual_mean
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 29.43 33.40 33.88 34.46 35.89 38.93
##
## $final_read_mean
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 35.00 65.98 95.18 92.93 110.26 244.38
##
## $total_contig
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 81.0 528.5 705.5 688.2 867.8 1440.0
summary(sequence_metrics[sequence_metrics$rank == 'silver',]$total_contig_length/sequence_metrics[sequence_metrics$rank == 'silver',]$estimated_genome_size)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.2116 0.8023 0.9258 0.8829 0.9930 1.0586
Taxonmic Comparisons
sra_species <- select(full_metrics, experiment_accession, rank, estimated_genome_size, total_contig_length, scientific_name) %>%
separate(scientific_name, into=c('genus', 'species'), extra='drop', fill='right') %>%
unite('sra', c('genus', 'species'), sep=" ")
colnames(sra_species) <- c('sample', 'rank', 'estimated_genome_size', 'assembled_genome_size', 'sra')
phyloflash_species <- filter(phyloflash, !grepl("WARNING", message)) %>%
select(sample, assembly_species) %>%
separate(assembly_species, into=c('genus', 'species'), extra='drop', fill='right') %>%
unite('phyloflash', c('genus', 'species'), sep=" ")
gtdb_species <- select(gtdb, user_genome, classification) %>%
separate(classification, into=c('genus', 'species'), sep=";s__", extra='drop', fill='right') %>%
select(user_genome, genus, species) %>%
separate(genus, into=c('taxon', 'genus'), sep="g__", extra='drop', fill='right') %>%
separate(species, into=c('temp_genus', 'species'), sep=" ", extra='drop', fill='right') %>%
separate(temp_genus, into=c('temp_genus'), sep="_", extra='drop', fill='right') %>%
separate(species, into=c('species'), sep="_", extra='drop', fill='right') %>%
unite('species', c('temp_genus', 'species'), sep=" ") %>%
select(user_genome, genus, species)
colnames(gtdb_species) <- c('sample', 'genus', 'species')
gtdb_species$gtdb <- ifelse(gtdb_species$species == ' NA', gtdb_species$genus, gtdb_species$species)
gtdb_species$is_lacto = ifelse(grepl("Lactobacillus", gtdb_species$gtdb), TRUE, FALSE)
species <- merge(sra_species,
merge(gtdb_species, phyloflash_species, by=c("sample"), all.x = TRUE),
by=c("sample"),
all.x = TRUE
)
species$phyloflash[is.na(species$phyloflash)] <- 'unknown_phyloflash'
species$gtdb[is.na(species$gtdb)] <- 'unknown_gtdb'
species$all_agree <- species$sra == species$phyloflash & species$sra == species$gtdb
species$sra_phyloflash <- species$sra == species$phyloflash
species$sra_gtdb <- species$sra == species$gtdb
species$phyloflash_gtdb <- species$phyloflash == species$gtdb
species$is_lacto <- ifelse(grepl("Lactobacillus", species$gtdb), TRUE, FALSE)
Samples with GTDB assignments
nrow(gtdb_species)
## [1] 1554
Non-Lactobacillus assignments by GTDB
table(gtdb_species$is_lacto)
##
## FALSE TRUE
## 58 1496
table(gtdb_species[!gtdb_species$is_lacto,]$gtdb)
##
## Actinomyces viscosus Aerococcus urinae
## 1 2
## Bifidobacterium vaginale Campylobacter ureolyticus
## 3 2
## Eggerthia catenaformis Facklamia hominis
## 1 1
## KLE1615 sp900066985 Lachnospira rogosae
## 1 1
## Microbacterium sp001595495 Neisseria bacilliformis
## 2 1
## Pseudoglutamicibacter cumminsii Staphylococcus epidermidis
## 2 1
## Streptococcus agalactiae Streptococcus parasanguinis
## 3 1
## Streptococcus pasteurianus Streptococcus pneumoniae
## 1 34
## Winkia sp002849225
## 1
Mismatches between assignments
# unknown_gtdb means gtdb wasn't run on the sample
table(species[species$gtdb != "unknown_gtdb",]$all_agree)
##
## FALSE TRUE
## 505 1049
Mismatches between 16S and GTDB
# comapre samples with phyloflash and gtdb results
nrow(species %>% filter(gtdb != "unknown_gtdb" & phyloflash != "unknown_phyloflash"))
## [1] 1467
gtdb_phyloflash_mismatch <- species %>%
filter(gtdb != "unknown_gtdb" & phyloflash != "unknown_phyloflash" & !phyloflash_gtdb) %>%
select(gtdb, phyloflash) %>%
unite('mismatch', c('gtdb', 'phyloflash'), sep=";")
nrow(gtdb_phyloflash_mismatch)
## [1] 154
table(gtdb_phyloflash_mismatch)
## gtdb_phyloflash_mismatch
## Actinomyces viscosus;Actinomyces oris
## 1
## Bifidobacterium vaginale;Gardnerella vaginalis
## 3
## KLE1615 sp900066985;metagenome NA
## 1
## Lachnospira rogosae;Lactobacillus rogosae
## 1
## Lactobacillus acidifarinae;Lactobacillus zymae
## 2
## Lactobacillus apinorum;Lactobacillus kunkeei
## 1
## Lactobacillus collinoides;Lactobacillus paracollinoides
## 2
## Lactobacillus delbrueckii;Lactobacillus porci
## 1
## Lactobacillus delbrueckii;uncultured bacterium
## 1
## Lactobacillus gasseri;Lactobacillus paragasseri
## 72
## Lactobacillus hilgardii;Lactobacillus buchneri
## 1
## Lactobacillus hordei;Lactobacillus mali
## 2
## Lactobacillus kimbladii;Lactobacillus kullabergensis
## 1
## Lactobacillus kimchicus;Heliconius cydno
## 1
## Lactobacillus kimchiensis;uncultured bacterium
## 1
## Lactobacillus melliventris;uncultured Lactobacillus
## 3
## Lactobacillus nantensis;Lactobacillus heilongjiangensis
## 2
## Lactobacillus panis;uncultured bacterium
## 1
## Lactobacillus paracasei;Lactobacillus casei
## 2
## Lactobacillus pentosus;Lactobacillus plantarum
## 1
## Lactobacillus psittaci;Lactobacillus jensenii
## 3
## Lactobacillus reuteri;Lactobacillus sp
## 1
## Lactobacillus ruminis;Lactobacillus sp
## 1
## Lactobacillus thailandensis;Lactobacillus pantheris
## 1
## Lactobacillus uvarum;Lactobacillus aquaticus
## 2
## Lactobacillus vaginalis;uncultured bacterium
## 24
## Lactobacillus xiangfangensis;Lactobacillus plantarum
## 1
## Lactobacillus zeae;Lactobacillus casei
## 2
## Lactobacillus_G;Lactobacillus kunkeei
## 2
## Lactobacillus_H;Lactobacillus mucosae
## 7
## Lactobacillus;Lactobacillus bombicola
## 4
## Lactobacillus;Lactobacillus iners
## 1
## Lactobacillus;Lactobacillus rodentium
## 1
## Lactobacillus;uncultured Lactobacillus
## 1
## Microbacterium sp001595495;Microbacterium sp
## 1
## Microbacterium sp001595495;Zhihengliuella sp
## 1
## Streptococcus pasteurianus;Streptococcus gallolyticus
## 1
estimated_sizes <- select(species, gtdb, estimated_genome_size) %>%
filter(gtdb != "unknown_gtdb") %>%
group_by(gtdb) %>%
summarise_all(funs(length, min, median, mean, max, max-min, sd))
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## Please use a list of either functions or lambdas:
##
## # Simple named list:
## list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`:
## tibble::lst(mean, median)
##
## # Using lambdas
## list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once per session.
colnames(estimated_sizes) <- c('species', 'count', 'min', 'median', 'mean', 'max', 'range', 'standard_deviation')
filter(estimated_sizes, count >50) %>% arrange(desc(count))
assembly_sizes <- select(species, gtdb, assembled_genome_size) %>%
filter(gtdb != "unknown_gtdb") %>%
group_by(gtdb) %>%
summarise_all(funs(length, min, median, mean, max, max-min, sd))
colnames(assembly_sizes) <- c('species', 'count', 'min', 'median', 'mean', 'max', 'range', 'standard_deviation')
filter(assembly_sizes, count >50) %>% arrange(desc(count))
Supplementary Tables and Figures
Supplementary Table 1 - Supplementary Table 1 - Lactobacillus excluded from analysis
exclude %>% select(sample, reason)
Supplementary Table 2 - Samples with a non-Lactobacillus taxonomic classification
species %>% filter(gtdb != 'unknown_gtdb' & !is_lacto ) %>% select(sample, sra, gtdb)
Antibiotic Resistance
AMR_finder <- read.delim(paste0(DATA,"/summary/amrfinder/amrfinder-gene-summary.txt")) %>%
rename(sample = sample_name) %>%
mutate_at(2:11,as.logical)
CARD <- read.delim(paste0(DATA,"/summary/ariba/ariba-card-summary.txt")) %>%
rename(sample = sample_name) %>%
mutate_at(2:48,as.logical)
Lacto_AMR <- inner_join(gtdb_species,AMR_finder,by="sample") %>%
filter(is_lacto == TRUE) %>%
select(-c(is_lacto)) %>%
mutate(ANY = ifelse(rowSums(.[5:14]),TRUE,FALSE))
## Warning: Column `sample` joining factors with different levels, coercing to
## character vector
Lacto_CARD <- inner_join(gtdb_species,CARD,by="sample") %>%
filter(is_lacto == TRUE) %>%
select(-c(is_lacto)) %>%
mutate(ANY = ifelse(rowSums(.[5:51]),TRUE,FALSE))
## Warning: Column `sample` joining factors with different levels, coercing to
## character vector
count_AMR <- Lacto_AMR %>%
group_by(gtdb) %>%
summarise(count = n())
Lacto_AMR_summary <- Lacto_AMR %>%
group_by(gtdb) %>%
summarise_if(is.logical,sum,na.rm = T)
Lacto_AMR_summary2 <- inner_join(count_AMR,Lacto_AMR_summary, by = "gtdb")
colSums(Lacto_AMR_summary2[,2:13])
## count AMINOGLYCOSIDE BETA.LACTAM
## 1496 11 0
## LINCOSAMIDE LINCOSAMIDE.STREPTOGRAMIN MACROLIDE
## 15 0 16
## PHENICOL QUATERNARY.AMMONIUM STREPTOGRAMIN
## 1 0 2
## TETRACYCLINE TRIMETHOPRIM ANY
## 67 0 79
Lacto_AMR_summary2$total <- rowSums(Lacto_AMR_summary2[3:12])
filter(Lacto_AMR_summary2, total >5) %>% arrange(desc(total))
# Ariba was not run on the single-end sampels, hence different counts then
# the AMRFinder+ counts
count_CARD <- Lacto_CARD %>%
group_by(gtdb) %>%
summarise(count = n())
CARD_summary <- Lacto_CARD %>%
group_by(gtdb) %>%
summarise_if(is.logical,sum,na.rm = T)
CARD_summary2 <- inner_join(count_CARD,CARD_summary, by = "gtdb")
CARD_numbers <- colSums(CARD_summary2[,2:50])
CARD_numbers[CARD_numbers > 0]
## count ANT_4___Ib
## 1415 1
## ANT_6__Ia APH_3___Ia
## 2 1
## Clostridioides_difficile_23S ErmB
## 1 3
## ErmT Lactobacillus_reuteri_cat_TC.
## 1 1
## Staphylococcus_aureus_FosB Staphylococcus_aureus_GlpT
## 1 1
## TEM_. arlR
## 2 1
## fexA lnuA
## 1 4
## lnuC lnuG
## 10 1
## mepA mepR
## 1 1
## mgrA rrsB.
## 1 3
## tet. tetM
## 42 16
## tetO tet_38_
## 2 1
## tet_C_ tet_K_
## 1 1
## vatE ANY
## 2 77
Session Info
sessionInfo()
## R version 3.4.1 (2017-06-30)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS 10.15.3
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] forcats_0.4.0 purrr_0.3.3 readr_1.3.1 tibble_2.1.3
## [5] tidyverse_1.3.0 tidyr_1.0.2 stringr_1.4.0 reshape2_1.4.3
## [9] ggplot2_3.2.1 dplyr_0.8.4
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.0.0 xfun_0.12 haven_2.2.0 lattice_0.20-40
## [5] colorspace_1.4-1 vctrs_0.2.3 generics_0.0.2 htmltools_0.4.0
## [9] yaml_2.2.1 rlang_0.4.4 pillar_1.4.3 glue_1.3.1
## [13] withr_2.1.2 DBI_1.1.0 dbplyr_1.4.2 modelr_0.1.6
## [17] readxl_1.3.1 lifecycle_0.1.0 plyr_1.8.5 munsell_0.5.0
## [21] gtable_0.3.0 cellranger_1.1.0 rvest_0.3.5 evaluate_0.14
## [25] labeling_0.3 knitr_1.28 fansi_0.4.1 broom_0.5.4
## [29] Rcpp_1.0.3 scales_1.1.0 backports_1.1.5 jsonlite_1.6.1
## [33] farver_2.0.3 fs_1.3.1 hms_0.5.3 digest_0.6.25
## [37] stringi_1.4.6 grid_3.4.1 cli_2.0.1 tools_3.4.1
## [41] magrittr_1.5 lazyeval_0.2.2 crayon_1.3.4 pkgconfig_2.0.3
## [45] ellipsis_0.3.0 xml2_1.2.2 reprex_0.3.0 lubridate_1.7.4
## [49] assertthat_0.2.1 rmarkdown_2.1 httr_1.4.1 rstudioapi_0.11
## [53] R6_2.4.1 nlme_3.1-137 compiler_3.4.1