Run information

Date generated: 2022-06-10

R Packages used

These are the R packages used to generate this report.

library(stringr)
library(readr)
library(dplyr)
library(tidyr)
library(rlang)
library(ggtree)
library(plotly)
library(patchwork)
library(ggplot2)
library(viridis)
library(formattable)
library(kableExtra)

packages <- (.packages())

clean_pkg_version <- function(x) {
  y <- getNamespaceVersion(x)
  
  z <- data.frame("Package" = x,
                  "Version" = y)
  
  row.names(z) <- NULL
  
  return(z)
}

lapply(packages, function(x) clean_pkg_version(x)) %>%
  bind_rows() %>%
  kbl() %>%
  kable_styling(bootstrap_options = c("striped",
                                      "hover",
                                      "responsive"),
                full_width = TRUE)
Package Version
kableExtra 1.3.4
formattable 0.2.1
viridis 0.6.2
viridisLite 0.4.0
patchwork 1.1.1
plotly 4.10.0
ggplot2 3.3.5
ggtree 3.2.1
rlang 1.0.1
tidyr 1.2.0
dplyr 1.0.8
readr 2.1.2
stringr 1.4.0
stats 4.1.2
graphics 4.1.2
grDevices 4.1.2
utils 4.1.2
datasets 4.1.2
methods 4.1.2
base 4.1.2


Tool versions

These are the tools used to run the analysis within the core genome workflow of ALPPACA.

tool_data <- data.frame(
  Tool = c("Prokka",
           "Panaroo",
           "Snp-dists",
           "Snp-sites",
           "IQTree"),
  Version = c("1.14.6",
              "1.2.9",
              "0.8.2",
              "2.5.1",
              "2.1.4")
)

tool_data %>%
  kbl() %>%
  kable_styling(bootstrap_options = c("striped",
                                      "hover",
                                      "responsive"),
                full_width = TRUE)
Tool Version
Prokka 1.14.6
Panaroo 1.2.9
Snp-dists 0.8.2
Snp-sites 2.5.1
IQTree 2.1.4



Tool information

Panaroo QC

panaroo_ngenes_data <- read_delim(
  params$ngenes_data,
  delim = "\t",
  show_col_types = FALSE
)

panaroo_ncontigs_data <- read_delim(
  params$ncontigs_data,
  delim = "\t",
  show_col_types = FALSE
)

plot_ncontigs <- ggplot(panaroo_ncontigs_data, aes(no_contigs, sample)) +
  geom_col() +
  scale_x_reverse() +
  labs(x = "Number of contigs") +
  theme_bw() +
  theme(axis.text.y = element_blank(),
        axis.ticks.y = element_blank(),
        axis.title.y = element_blank())

plot_ngenes <- ggplot(panaroo_ngenes_data, aes(no_genes, sample)) +
  geom_col() +
  labs(x = "Number of genes") +
  theme_bw() +
  theme(axis.text.y = element_blank(),
        axis.title.y = element_blank(),
        axis.ticks.y = element_blank())

plot_names <- ggplot(panaroo_ncontigs_data, aes(1, sample)) +
  geom_text(aes(label = sample)) +
  scale_x_reverse() +
  theme_void()

plot_ncontigs + plot_names + plot_ngenes +
  plot_layout(widths = c(4, 2, 4))
<center>***Figure 1:** Number of genes and contigs in all samples.*

Figure 1: Number of genes and contigs in all samples.

panaroo_mds_data <- read_delim(params$mds_data,
                               delim = "\t",
                               show_col_types = FALSE)

mds_plot <- ggplot(panaroo_mds_data, aes(coordx, coordy, text = sample)) +
  geom_point() +
  theme_bw()

ggplotly(mds_plot, tooltip = "text")

Figure 2: MDS analysis based on pairwise mash distances between all samples. Outliers should be investigated.

Pangenome analysis

Number of genes detected in the pangenome analysis. The core genes were aligned and used in the phylogenetic analysis.
read_delim(params$pangenome_data,
           delim = "\t",
           show_col_types = FALSE,
           col_names = FALSE) %>%
  rename("Type" = X1,
         "Definition" = X2,
         "Number of genes" = X3) %>%
  kbl() %>%
  kable_styling(bootstrap_options = c("striped",
                                      "hover",
                                      "responsive"),
                full_width = TRUE)
Type Definition Number of genes
Core genes (99% <= strains <= 100%) 4303
Soft core genes (95% <= strains < 99%) 0
Shell genes (15% <= strains < 95%) 2462
Cloud genes (0% <= strains < 15%) 0
Total genes (0% <= strains <= 100%) 6765


SNPdist

SNP distance summary statistics for all isolates.
snpdist_data <- read_delim(params$snpdist_report,
                           delim = "\t",
                           show_col_types = FALSE) %>%
  rename("isol1" = 1) %>%
  pivot_longer(cols = -isol1,
               names_to = "isol2",
               values_to = "value")


snpdist_data %>%
  filter(isol1 != isol2) %>%
  mutate(Median = median(value),
         Mean = round(mean(value), 0),
         Range = paste0(min(value), " - ", max(value))) %>%
  select(Mean, Median, Range) %>%
  head(1) %>%
  kbl() %>%
  kable_styling(bootstrap_options = c("striped",
                                      "hover",
                                      "responsive"),
                full_width = TRUE)
Mean Median Range
25706 27575 665 - 28981
ggplot(snpdist_data, aes(isol1, isol2, fill = value)) +
  geom_tile() +
  labs(fill = "SNP distance") +
  scale_fill_viridis(direction = -1,
                     option = "D") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.3),
    axis.text = element_text(size = 4),
        panel.grid = element_blank(),
        axis.title = element_blank()) +
  coord_fixed()
**Figure 3**: SNP distance matrix of all included isolates. A lighter color represents a smaller SNP distance.

Figure 3: SNP distance matrix of all included isolates. A lighter color represents a smaller SNP distance.


IQTree

# Read IQTree data
iqtree_data <- readLines(params$phylo_data)

aln_info <- sub(
  "Input data: ",
  "",
  iqtree_data[str_detect(iqtree_data,"Input data:")]
)

const_sites <- sub(
  ".+= (.+) of all sites)",
  "\\1",
  iqtree_data[str_detect(iqtree_data,"Number of constant sites:")]
)

parsimony_sites <- sub(
  "Number of parsimony informative sites: ",
  "",
  iqtree_data[str_detect(iqtree_data,"Number of parsimony informative sites:")]
)

iqtree_model <- sub(
  "Model of substitution: ",
  "",
  iqtree_data[str_detect(iqtree_data,"Model of substitution: ")]
)

iqtree_cpu <- sub(
  "Total CPU time used: ",
  "",
  iqtree_data[str_detect(iqtree_data,"Total CPU time used:")]
)

iqtree_wallclock <- sub(
  "Total wall-clock time used: ",
  "",
  iqtree_data[str_detect(iqtree_data,"Total wall-clock time used:")]
)

IQTree was run on an alignment composed of 6 sequences with 4158616 nucleotide sites, where 98.5732% of the alignment were constant sites. The number of parsimonious sites were 27488. IQTree detected the evolutionary model GTR+F+I. IQTree used 109.002767 seconds (0h:1m:49s) of CPU time, which converts to 39.77674222 seconds (0h:0m:39s) of wall-clock time.



Phylogenetic tree

Here, the phylogenetic tree generated by IQTree is plotted. The figure is interactive, and you can zoom in and out and pan as you see fit.

tree <- read.tree(params$phylo_tree)

tree_plot <- ggtree(tree) +
  geom_text(aes(label = label),
            hjust = 0,
        size = 2) +
  geom_treescale()

ggplotly(tree_plot) %>%
  plotly::style(textposition = "right")

Figure 4: Maximum likelihood phylogenetic tree generated by IQTree. Bootstrap values are presented for each node. The tree is unrooted