Version information

ALPPACA version 1.0.0.

R Packages

These are the R packages used to generate this report.

library(stringr)
library(readr)
library(dplyr)
library(tidyr)
library(rlang)
library(ggtree)
library(plotly)
library(ggplot2)
library(viridis)
library(formattable)
library(kableExtra)

packages <- (.packages())

clean_pkg_version <- function(x) {
  y <- getNamespaceVersion(x)
  
  z <- data.frame("Package" = x,
                  "Version" = y)
  
  row.names(z) <- NULL
  
  return(z)
}

lapply(packages, function(x) clean_pkg_version(x)) %>%
  bind_rows() %>%
  kbl() %>%
  kable_styling(bootstrap_options = c("striped",
                                      "hover",
                                      "responsive"),
                full_width = TRUE)
Package Version
kableExtra 1.3.4
formattable 0.2.1
viridis 0.6.2
viridisLite 0.4.0
plotly 4.10.0
ggplot2 3.3.5
ggtree 3.2.1
rlang 1.0.1
tidyr 1.2.0
dplyr 1.0.8
readr 2.1.2
stringr 1.4.0
stats 4.1.2
graphics 4.1.2
grDevices 4.1.2
utils 4.1.2
datasets 4.1.2
methods 4.1.2
base 4.1.2


Tool versions

These are the tools used to run the analysis within the core genome workflow of ALPPACA.

tool_data <- data.frame(
  Tool = c("ParSNP",
           "Gubbins",
           "MaskRC",
           "Snp-dists",
           "Snp-sites",
           "IQTree"),
  Version = c("1.6.1",
              "3.1.3",
              "0.5",
              "0.8.2",
              "2.5.1",
              "2.1.4")
)

tool_data %>%
  kbl() %>%
  kable_styling(bootstrap_options = c("striped",
                                      "hover",
                                      "responsive"),
                full_width = TRUE)
Tool Version
ParSNP 1.6.1
Gubbins 3.1.3
MaskRC 0.5
Snp-dists 0.8.2
Snp-sites 2.5.1
IQTree 2.1.4



Tool information

ParSNP

ParSNP average coverage report. The “Type” column refers to which of the samples were used as a reference in the analysis. The “Percent Coverage” column represents the % coverage of each genome that was used to generate the core genome alignment. The green bar is relative to 100%. The total coverage is the average across all samples. Samples with expected high similarity (e.g. outbreak or same ST) are expected to have a high percent coverage.
# Courtesy of Eve Zeyl Fiskebeck for the script!
# Read PARSNP results file
parsnp_data <- readLines(params$parsnp_report)

# Extract the data needed
parsnp_cov <- parsnp_data[str_detect(parsnp_data, "(Cluster coverage in sequence)|(Total coverage among all sequences)")]
parsnp_samples <- parsnp_data[str_detect(parsnp_data, "(Sequence)|(q value)")]

df_samples <- as.data.frame(cbind(parsnp_samples)) %>%
  head(-1) %>%
  separate("parsnp_samples", into = c("Sequence", "Sample"), sep = " :") %>%
  mutate(Sample = basename(Sample))

# Get number of samples in the data
number_samples <- nrow(df_samples)

# Create nice table
as.data.frame(cbind(parsnp_cov)) %>%
  separate("parsnp_cov", into = c("Sequence","Percent Coverage"), sep = ":   ") %>%
  add_row(Sequence = "control_100", `Percent Coverage` = "100%") %>%
  mutate(Sequence = sub(".+(sequence .+)", "\\1", Sequence),
         Sequence = ifelse(grepl("coverage", Sequence),
                           "Total coverage",
                           sub("sequence ", "Sequence ", Sequence)),
         `Percent Coverage` = sub("%", "", `Percent Coverage`),
         `Percent Coverage` = color_bar("lightgreen")(`Percent Coverage`)) %>%
  filter(Sequence != "control_100") %>%
  left_join(df_samples, by = "Sequence") %>%
  mutate(Sample = ifelse(Sequence == "Total coverage", "", Sample),
         type_ref = case_when(Sequence == "Sequence 1" ~ "Reference",
                              Sequence == "Total coverage" ~ "",
                              TRUE ~ "Sample"),
         Type = cell_spec(type_ref,
                          color = "white",
                          background = factor(
                            type_ref,
                            c("Reference", "Sample", ""),
                            c("#fdbf6f", "#a6cee3", "")
                          ))) %>%
  select(Sequence, Sample, Type, `Percent Coverage`) %>%
  kbl(escape = FALSE) %>%
  row_spec(number_samples + 1, bold = TRUE) %>%
  kable_styling(bootstrap_options = c("striped",
                                      "hover",
                                      "responsive"),
                full_width = TRUE)
Sequence Sample Type Percent Coverage
Sequence 1 isolate6.fasta.ref Reference 87.2
Sequence 2 isolate1.fasta Sample 85.0
Sequence 3 isolate2.fasta Sample 81.4
Sequence 4 isolate3.fasta Sample 87.4
Sequence 5 isolate4.fasta Sample 87.6
Sequence 6 isolate5.fasta Sample 88.3
Sequence 7 isolate6.fasta Sample 87.2
Total coverage 85.9


SNPdist

SNP distance summary statistics for all isolates.
snpdist_data <- read_delim(params$snpdist_report,
                           delim = "\t",
                           show_col_types = FALSE) %>%
  rename("isol1" = 1) %>%
  pivot_longer(cols = -isol1,
               names_to = "isol2",
               values_to = "value")


snpdist_data %>%
  filter(isol1 != isol2) %>%
  mutate(Median = median(value),
         Mean = round(mean(value), 0),
         Range = paste0(min(value), " - ", max(value))) %>%
  select(Mean, Median, Range) %>%
  head(1) %>%
  kbl() %>%
  kable_styling(bootstrap_options = c("striped",
                                      "hover",
                                      "responsive"),
                full_width = TRUE)
Mean Median Range
20502 24159 0 - 24623
ggplot(snpdist_data, aes(isol1, isol2, fill = value)) +
  geom_tile() +
  labs(fill = "SNP distance") +
  scale_fill_viridis(direction = -1,
                     option = "D") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.3),
    axis.text = element_text(size = 4),
        panel.grid = element_blank(),
        axis.title = element_blank()) +
  coord_fixed()
**Figure 1**: SNP distance matrix of all included isolates. A lighter color represents a smaller SNP distance.

Figure 1: SNP distance matrix of all included isolates. A lighter color represents a smaller SNP distance.


IQTree

# Read IQTree data
iqtree_data <- readLines(params$phylo_data)

aln_info <- sub(
  "Input data: ",
  "",
  iqtree_data[str_detect(iqtree_data,"Input data:")]
)

const_sites <- sub(
  ".+= (.+) of all sites)",
  "\\1",
  iqtree_data[str_detect(iqtree_data,"Number of constant sites:")]
)

parsimony_sites <- sub(
  "Number of parsimony informative sites: ",
  "",
  iqtree_data[str_detect(iqtree_data,"Number of parsimony informative sites:")]
)

iqtree_model <- sub(
  "Model of substitution: ",
  "",
  iqtree_data[str_detect(iqtree_data,"Model of substitution: ")]
)

iqtree_cpu <- sub(
  "Total CPU time used: ",
  "",
  iqtree_data[str_detect(iqtree_data,"Total CPU time used:")]
)

iqtree_wallclock <- sub(
  "Total wall-clock time used: ",
  "",
  iqtree_data[str_detect(iqtree_data,"Total wall-clock time used:")]
)

IQTree was run on an alignment composed of 7 sequences with 4591524 nucleotide sites, where 98.852% of the alignment were constant sites. The number of parsimonious sites were 23708. IQTree detected the evolutionary model GTR+F+I. IQTree used 121.858981 seconds (0h:2m:1s) of CPU time, which converts to 48.13189391 seconds (0h:0m:48s) of wall-clock time.



Phylogenetic tree

Here, the phylogenetic tree generated by IQTree is plotted. The figure is interactive, and you can zoom in and out and pan as you see fit.

tree <- read.tree(params$phylo_tree)

tree_plot <- ggtree(tree) +
  geom_text(aes(label = label),
            hjust = 0,
        size = 2) +
  geom_treescale()

ggplotly(tree_plot) %>%
  plotly::style(textposition = "right")

Figure 2: Maximum likelihood phylogenetic tree generated by IQTree. Bootstrap values are presented for each node. The tree is unrooted