ALPPACA version 1.0.0.
These are the R packages used to generate this report.
library(stringr)
library(readr)
library(dplyr)
library(tidyr)
library(rlang)
library(ggtree)
library(plotly)
library(ggplot2)
library(viridis)
library(formattable)
library(kableExtra)
packages <- (.packages())
clean_pkg_version <- function(x) {
y <- getNamespaceVersion(x)
z <- data.frame("Package" = x,
"Version" = y)
row.names(z) <- NULL
return(z)
}
lapply(packages, function(x) clean_pkg_version(x)) %>%
bind_rows() %>%
kbl() %>%
kable_styling(bootstrap_options = c("striped",
"hover",
"responsive"),
full_width = TRUE)
Package | Version |
---|---|
kableExtra | 1.3.4 |
formattable | 0.2.1 |
viridis | 0.6.2 |
viridisLite | 0.4.0 |
plotly | 4.10.0 |
ggplot2 | 3.3.5 |
ggtree | 3.2.1 |
rlang | 1.0.1 |
tidyr | 1.2.0 |
dplyr | 1.0.8 |
readr | 2.1.2 |
stringr | 1.4.0 |
stats | 4.1.2 |
graphics | 4.1.2 |
grDevices | 4.1.2 |
utils | 4.1.2 |
datasets | 4.1.2 |
methods | 4.1.2 |
base | 4.1.2 |
These are the tools used to run the analysis within the core genome workflow of ALPPACA.
tool_data <- data.frame(
Tool = c("ParSNP",
"Gubbins",
"MaskRC",
"Snp-dists",
"Snp-sites",
"IQTree"),
Version = c("1.6.1",
"3.1.3",
"0.5",
"0.8.2",
"2.5.1",
"2.1.4")
)
tool_data %>%
kbl() %>%
kable_styling(bootstrap_options = c("striped",
"hover",
"responsive"),
full_width = TRUE)
Tool | Version |
---|---|
ParSNP | 1.6.1 |
Gubbins | 3.1.3 |
MaskRC | 0.5 |
Snp-dists | 0.8.2 |
Snp-sites | 2.5.1 |
IQTree | 2.1.4 |
# Courtesy of Eve Zeyl Fiskebeck for the script!
# Read PARSNP results file
parsnp_data <- readLines(params$parsnp_report)
# Extract the data needed
parsnp_cov <- parsnp_data[str_detect(parsnp_data, "(Cluster coverage in sequence)|(Total coverage among all sequences)")]
parsnp_samples <- parsnp_data[str_detect(parsnp_data, "(Sequence)|(q value)")]
df_samples <- as.data.frame(cbind(parsnp_samples)) %>%
head(-1) %>%
separate("parsnp_samples", into = c("Sequence", "Sample"), sep = " :") %>%
mutate(Sample = basename(Sample))
# Get number of samples in the data
number_samples <- nrow(df_samples)
# Create nice table
as.data.frame(cbind(parsnp_cov)) %>%
separate("parsnp_cov", into = c("Sequence","Percent Coverage"), sep = ": ") %>%
add_row(Sequence = "control_100", `Percent Coverage` = "100%") %>%
mutate(Sequence = sub(".+(sequence .+)", "\\1", Sequence),
Sequence = ifelse(grepl("coverage", Sequence),
"Total coverage",
sub("sequence ", "Sequence ", Sequence)),
`Percent Coverage` = sub("%", "", `Percent Coverage`),
`Percent Coverage` = color_bar("lightgreen")(`Percent Coverage`)) %>%
filter(Sequence != "control_100") %>%
left_join(df_samples, by = "Sequence") %>%
mutate(Sample = ifelse(Sequence == "Total coverage", "", Sample),
type_ref = case_when(Sequence == "Sequence 1" ~ "Reference",
Sequence == "Total coverage" ~ "",
TRUE ~ "Sample"),
Type = cell_spec(type_ref,
color = "white",
background = factor(
type_ref,
c("Reference", "Sample", ""),
c("#fdbf6f", "#a6cee3", "")
))) %>%
select(Sequence, Sample, Type, `Percent Coverage`) %>%
kbl(escape = FALSE) %>%
row_spec(number_samples + 1, bold = TRUE) %>%
kable_styling(bootstrap_options = c("striped",
"hover",
"responsive"),
full_width = TRUE)
Sequence | Sample | Type | Percent Coverage |
---|---|---|---|
Sequence 1 | isolate6.fasta.ref | Reference | 87.2 |
Sequence 2 | isolate1.fasta | Sample | 85.0 |
Sequence 3 | isolate2.fasta | Sample | 81.4 |
Sequence 4 | isolate3.fasta | Sample | 87.4 |
Sequence 5 | isolate4.fasta | Sample | 87.6 |
Sequence 6 | isolate5.fasta | Sample | 88.3 |
Sequence 7 | isolate6.fasta | Sample | 87.2 |
Total coverage | 85.9 |
snpdist_data <- read_delim(params$snpdist_report,
delim = "\t",
show_col_types = FALSE) %>%
rename("isol1" = 1) %>%
pivot_longer(cols = -isol1,
names_to = "isol2",
values_to = "value")
snpdist_data %>%
filter(isol1 != isol2) %>%
mutate(Median = median(value),
Mean = round(mean(value), 0),
Range = paste0(min(value), " - ", max(value))) %>%
select(Mean, Median, Range) %>%
head(1) %>%
kbl() %>%
kable_styling(bootstrap_options = c("striped",
"hover",
"responsive"),
full_width = TRUE)
Mean | Median | Range |
---|---|---|
20502 | 24159 | 0 - 24623 |
ggplot(snpdist_data, aes(isol1, isol2, fill = value)) +
geom_tile() +
labs(fill = "SNP distance") +
scale_fill_viridis(direction = -1,
option = "D") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.3),
axis.text = element_text(size = 4),
panel.grid = element_blank(),
axis.title = element_blank()) +
coord_fixed()
Figure 1: SNP distance matrix of all included isolates. A lighter color represents a smaller SNP distance.
# Read IQTree data
iqtree_data <- readLines(params$phylo_data)
aln_info <- sub(
"Input data: ",
"",
iqtree_data[str_detect(iqtree_data,"Input data:")]
)
const_sites <- sub(
".+= (.+) of all sites)",
"\\1",
iqtree_data[str_detect(iqtree_data,"Number of constant sites:")]
)
parsimony_sites <- sub(
"Number of parsimony informative sites: ",
"",
iqtree_data[str_detect(iqtree_data,"Number of parsimony informative sites:")]
)
iqtree_model <- sub(
"Model of substitution: ",
"",
iqtree_data[str_detect(iqtree_data,"Model of substitution: ")]
)
iqtree_cpu <- sub(
"Total CPU time used: ",
"",
iqtree_data[str_detect(iqtree_data,"Total CPU time used:")]
)
iqtree_wallclock <- sub(
"Total wall-clock time used: ",
"",
iqtree_data[str_detect(iqtree_data,"Total wall-clock time used:")]
)
IQTree was run on an alignment composed of 7 sequences with 4591524 nucleotide sites, where 98.852% of the alignment were constant sites. The number of parsimonious sites were 23708. IQTree detected the evolutionary model GTR+F+I. IQTree used 121.858981 seconds (0h:2m:1s) of CPU time, which converts to 48.13189391 seconds (0h:0m:48s) of wall-clock time.
Here, the phylogenetic tree generated by IQTree is plotted. The figure is interactive, and you can zoom in and out and pan as you see fit.
tree <- read.tree(params$phylo_tree)
tree_plot <- ggtree(tree) +
geom_text(aes(label = label),
hjust = 0,
size = 2) +
geom_treescale()
ggplotly(tree_plot) %>%
plotly::style(textposition = "right")
Figure 2: Maximum likelihood phylogenetic tree generated by IQTree. Bootstrap values are presented for each node. The tree is unrooted