Date generated: 2022-06-10
These are the R packages used to generate this report.
library(stringr)
library(readr)
library(dplyr)
library(tidyr)
library(rlang)
library(ggtree)
library(plotly)
library(patchwork)
library(ggplot2)
library(viridis)
library(formattable)
library(kableExtra)
packages <- (.packages())
clean_pkg_version <- function(x) {
y <- getNamespaceVersion(x)
z <- data.frame("Package" = x,
"Version" = y)
row.names(z) <- NULL
return(z)
}
lapply(packages, function(x) clean_pkg_version(x)) %>%
bind_rows() %>%
kbl() %>%
kable_styling(bootstrap_options = c("striped",
"hover",
"responsive"),
full_width = TRUE)
Package | Version |
---|---|
kableExtra | 1.3.4 |
formattable | 0.2.1 |
viridis | 0.6.2 |
viridisLite | 0.4.0 |
patchwork | 1.1.1 |
plotly | 4.10.0 |
ggplot2 | 3.3.5 |
ggtree | 3.2.1 |
rlang | 1.0.1 |
tidyr | 1.2.0 |
dplyr | 1.0.8 |
readr | 2.1.2 |
stringr | 1.4.0 |
stats | 4.1.2 |
graphics | 4.1.2 |
grDevices | 4.1.2 |
utils | 4.1.2 |
datasets | 4.1.2 |
methods | 4.1.2 |
base | 4.1.2 |
These are the tools used to run the analysis within the core genome workflow of ALPPACA.
tool_data <- data.frame(
Tool = c("Prokka",
"Panaroo",
"Snp-dists",
"Snp-sites",
"IQTree"),
Version = c("1.14.6",
"1.2.9",
"0.8.2",
"2.5.1",
"2.1.4")
)
tool_data %>%
kbl() %>%
kable_styling(bootstrap_options = c("striped",
"hover",
"responsive"),
full_width = TRUE)
Tool | Version |
---|---|
Prokka | 1.14.6 |
Panaroo | 1.2.9 |
Snp-dists | 0.8.2 |
Snp-sites | 2.5.1 |
IQTree | 2.1.4 |
panaroo_ngenes_data <- read_delim(
params$ngenes_data,
delim = "\t",
show_col_types = FALSE
)
panaroo_ncontigs_data <- read_delim(
params$ncontigs_data,
delim = "\t",
show_col_types = FALSE
)
plot_ncontigs <- ggplot(panaroo_ncontigs_data, aes(no_contigs, sample)) +
geom_col() +
scale_x_reverse() +
labs(x = "Number of contigs") +
theme_bw() +
theme(axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
axis.title.y = element_blank())
plot_ngenes <- ggplot(panaroo_ngenes_data, aes(no_genes, sample)) +
geom_col() +
labs(x = "Number of genes") +
theme_bw() +
theme(axis.text.y = element_blank(),
axis.title.y = element_blank(),
axis.ticks.y = element_blank())
plot_names <- ggplot(panaroo_ncontigs_data, aes(1, sample)) +
geom_text(aes(label = sample)) +
scale_x_reverse() +
theme_void()
plot_ncontigs + plot_names + plot_ngenes +
plot_layout(widths = c(4, 2, 4))
panaroo_mds_data <- read_delim(params$mds_data,
delim = "\t",
show_col_types = FALSE)
mds_plot <- ggplot(panaroo_mds_data, aes(coordx, coordy, text = sample)) +
geom_point() +
theme_bw()
ggplotly(mds_plot, tooltip = "text")
read_delim(params$pangenome_data,
delim = "\t",
show_col_types = FALSE,
col_names = FALSE) %>%
rename("Type" = X1,
"Definition" = X2,
"Number of genes" = X3) %>%
kbl() %>%
kable_styling(bootstrap_options = c("striped",
"hover",
"responsive"),
full_width = TRUE)
Type | Definition | Number of genes |
---|---|---|
Core genes | (99% <= strains <= 100%) | 4303 |
Soft core genes | (95% <= strains < 99%) | 0 |
Shell genes | (15% <= strains < 95%) | 2462 |
Cloud genes | (0% <= strains < 15%) | 0 |
Total genes | (0% <= strains <= 100%) | 6765 |
snpdist_data <- read_delim(params$snpdist_report,
delim = "\t",
show_col_types = FALSE) %>%
rename("isol1" = 1) %>%
pivot_longer(cols = -isol1,
names_to = "isol2",
values_to = "value")
snpdist_data %>%
filter(isol1 != isol2) %>%
mutate(Median = median(value),
Mean = round(mean(value), 0),
Range = paste0(min(value), " - ", max(value))) %>%
select(Mean, Median, Range) %>%
head(1) %>%
kbl() %>%
kable_styling(bootstrap_options = c("striped",
"hover",
"responsive"),
full_width = TRUE)
Mean | Median | Range |
---|---|---|
25706 | 27575 | 665 - 28981 |
ggplot(snpdist_data, aes(isol1, isol2, fill = value)) +
geom_tile() +
labs(fill = "SNP distance") +
scale_fill_viridis(direction = -1,
option = "D") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.3),
axis.text = element_text(size = 4),
panel.grid = element_blank(),
axis.title = element_blank()) +
coord_fixed()
Figure 3: SNP distance matrix of all included isolates. A lighter color represents a smaller SNP distance.
# Read IQTree data
iqtree_data <- readLines(params$phylo_data)
aln_info <- sub(
"Input data: ",
"",
iqtree_data[str_detect(iqtree_data,"Input data:")]
)
const_sites <- sub(
".+= (.+) of all sites)",
"\\1",
iqtree_data[str_detect(iqtree_data,"Number of constant sites:")]
)
parsimony_sites <- sub(
"Number of parsimony informative sites: ",
"",
iqtree_data[str_detect(iqtree_data,"Number of parsimony informative sites:")]
)
iqtree_model <- sub(
"Model of substitution: ",
"",
iqtree_data[str_detect(iqtree_data,"Model of substitution: ")]
)
iqtree_cpu <- sub(
"Total CPU time used: ",
"",
iqtree_data[str_detect(iqtree_data,"Total CPU time used:")]
)
iqtree_wallclock <- sub(
"Total wall-clock time used: ",
"",
iqtree_data[str_detect(iqtree_data,"Total wall-clock time used:")]
)
IQTree was run on an alignment composed of 6 sequences with 4158616 nucleotide sites, where 98.5732% of the alignment were constant sites. The number of parsimonious sites were 27488. IQTree detected the evolutionary model GTR+F+I. IQTree used 109.002767 seconds (0h:1m:49s) of CPU time, which converts to 39.77674222 seconds (0h:0m:39s) of wall-clock time.
Here, the phylogenetic tree generated by IQTree is plotted. The figure is interactive, and you can zoom in and out and pan as you see fit.
tree <- read.tree(params$phylo_tree)
tree_plot <- ggtree(tree) +
geom_text(aes(label = label),
hjust = 0,
size = 2) +
geom_treescale()
ggplotly(tree_plot) %>%
plotly::style(textposition = "right")
Figure 4: Maximum likelihood phylogenetic tree generated by IQTree. Bootstrap values are presented for each node. The tree is unrooted