AncientMetagenomeDir is a repository listing published ancient metagenome (and related) samples. It contains critical metadata for ancient metagenome studies, and acts as a ‘sign post’ towards important data to help facilitate more robust and efficient comparative data.
This page provides summary statistics of the current status of the directory.
library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
[37m── [1mAttaching packages[22m ────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──[39m
[37m[32m✓[37m [34mggplot2[37m 3.3.2 [32m✓[37m [34mpurrr [37m 0.3.4
[32m✓[37m [34mtibble [37m 3.0.3 [32m✓[37m [34mdplyr [37m 1.0.2
[32m✓[37m [34mtidyr [37m 1.1.2 [32m✓[37m [34mstringr[37m 1.4.0
[32m✓[37m [34mreadr [37m 1.3.1 [32m✓[37m [34mforcats[37m 0.5.0[39m
[37m── [1mConflicts[22m ───────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31mx[37m [34mdplyr[37m::[32mfilter()[37m masks [34mstats[37m::filter()
[31mx[37m [34mdplyr[37m::[32mlag()[37m masks [34mstats[37m::lag()[39m
library(scales)
Attaching package: ‘scales’
The following object is masked from ‘package:purrr’:
discard
The following object is masked from ‘package:readr’:
col_factor
library(lubridate)
Attaching package: ‘lubridate’
The following objects are masked from ‘package:base’:
date, intersect, setdiff, union
library(maps)
Attaching package: ‘maps’
The following object is masked from ‘package:purrr’:
map
library(patchwork)
sessionInfo()
R version 3.6.3 (2020-02-29)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 20.04.1 LTS
Matrix products: default
BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
locale:
[1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8 LC_MONETARY=en_GB.UTF-8
[6] LC_MESSAGES=en_GB.UTF-8 LC_PAPER=en_GB.UTF-8 LC_NAME=C LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] patchwork_1.0.1 maps_3.3.0 lubridate_1.7.9 scales_1.1.1 forcats_0.5.0 stringr_1.4.0 dplyr_1.0.2 purrr_0.3.4
[9] readr_1.3.1 tidyr_1.1.2 tibble_3.0.3 ggplot2_3.3.2 tidyverse_1.3.0
loaded via a namespace (and not attached):
[1] Rcpp_1.0.5 cellranger_1.1.0 pillar_1.4.6 compiler_3.6.3 dbplyr_1.4.4 tools_3.6.3 jsonlite_1.7.0 lifecycle_0.2.0
[9] gtable_0.3.0 pkgconfig_2.0.3 rlang_0.4.7 reprex_0.3.0 cli_2.0.2 DBI_1.1.0 rstudioapi_0.11 haven_2.3.1
[17] xfun_0.16 withr_2.2.0 xml2_1.3.2 httr_1.4.2 knitr_1.29 fs_1.5.0 generics_0.0.2 vctrs_0.3.4
[25] hms_0.5.3 grid_3.6.3 tidyselect_1.1.0 glue_1.4.2 R6_2.4.1 fansi_0.4.1 readxl_1.3.1 modelr_0.1.8
[33] blob_1.2.1 magrittr_1.5 backports_1.1.9 ellipsis_0.3.1 rvest_0.3.6 assertthat_0.2.1 colorspace_1.4-1 stringi_1.4.6
[41] munsell_0.5.0 broom_0.7.0 crayon_1.3.4
We will prepare some custom functions
## Load and standardise date across tables
load_thedir_data <- function(path, name) {
read_tsv(path, col_types = cols()) %>%
mutate(List = name) %>%
select(List, everything())
}
stats_pub_timeline <- function(...) {
x <- list(...)
## Get only relevent columns
lapply(x, FUN = function(y) {
select(
y,
List, publication_doi,
publication_year
) %>% distinct()
}) %>%
bind_rows() %>%
mutate(List = factor(List, levels = names(colours)))
}
plot_pub_timeline <- function(dat) {
## Get range so we plot x-axis nicely
spanning_years <- dat %>% summarise(min = min(publication_year), max = max(publication_year))
ggplot(dat, aes(publication_year, fill = List)) +
scale_fill_manual(values = colours, guide = guide_legend(ncol = 1)) +
scale_y_continuous(labels = scales::number_format(accuracy = 1)) +
ylab("Number of publications") +
xlab("Publication year") +
geom_bar(bins = spanning_years$max - spanning_years$min, binwidth = 1) +
scale_x_continuous(breaks = seq(spanning_years$min, spanning_years$max, 2)) +
theme_classic() +
theme(legend.position = "bottom") +
facet_wrap(~List, ncol = 1) +
labs(fill = NULL)
}
stats_cumulative_timeline <- function(...) {
## Takes a list of AncientMetagenomeDir TSVs
x <- list(...)
## Get only relevent columns
dat <- lapply(x, FUN = function(y) {
select(
y,
List, sample_name, publication_year,
publication_year
) %>% distinct()
}) %>%
bind_rows() %>%
mutate(List = factor(List, levels = names(colours)))
spanning_years <- dat %>%
ungroup() %>%
summarise(min = min(publication_year), max = max(publication_year))
## Make fake base table to ensure all years for all lists
## Currently manually defined
base_table <- list(
seq(spanning_years$min, spanning_years$max, 1),
seq(spanning_years$min, spanning_years$max, 1),
seq(spanning_years$min, spanning_years$max, 1)
)
names(base_table) <- levels(dat$List)
base_table <- base_table %>%
enframe(name = "List", value = "publication_year") %>%
unnest(publication_year)
dat <- dat %>%
group_by(List, publication_year) %>%
summarise(count = n())
dat %>%
right_join(base_table, by = c("List", "publication_year")) %>%
replace_na(list(count = 0)) %>%
arrange(List, publication_year) %>%
mutate(List = factor(List, levels = names(colours))) %>%
group_by(List) %>%
mutate(cumulative_sum = cumsum(count))
}
plot_cumulative_timeline <- function(x) {
spanning_years <- list(min_year = min(x$publication_year), max_year = max(x$publication_year))
## Get range so we plot x-axis nicely
ggplot(x, aes(publication_year, cumulative_sum, fill = List)) +
geom_bar(stat = "identity") +
scale_y_continuous(labels = scales::number_format(accuracy = 1)) +
scale_x_continuous(breaks = seq(spanning_years$min_year, spanning_years$max_year, 2)) +
theme_classic() +
xlab("Publication year") +
ylab("Number of samples (cumulative sum)") +
scale_fill_manual(values = colours, guide = guide_legend(ncol = 1)) +
theme(legend.position = "bottom") +
facet_wrap(~List, ncol = 1) +
labs(fill = NULL)
}
stats_map <- function(...){
x <- list(raw_environmental, raw_hostmetagenome, raw_hostsinglegenome)
dat <- lapply(x, FUN = function(y) {
select(
y,
List, sample_name, geo_loc_name, latitude, longitude,
publication_year
) %>% distinct()
}) %>%
bind_rows() %>%
mutate(List = factor(List, levels = names(colours)))
dat <- dat %>%
group_by(List, geo_loc_name, latitude, longitude) %>%
summarise(count = n()) %>%
mutate(List = factor(List, levels = names(colours)))
}
plot_map <- function(dat){
world_map <- map_data("world")
ggplot() +
geom_polygon(data = world_map, aes(x = long, y = lat, group = group), fill = "white", colour = "grey") +
geom_point(data = dat, aes(x = longitude, y = latitude, fill = List, size = count), shape = 21, alpha = 0.5) +
scale_fill_brewer(palette ="Set1") +
theme_linedraw() +
facet_wrap(~List, ncol = 1) +
theme_classic() +
scale_fill_manual(values = colours, guide = guide_legend(ncol = 1)) +
theme(legend.position = "bottom", legend.direction="vertical") +
labs(fill = "Sample Type", size = "Sample Count") +
xlab("Longitude") +
ylab("Latitude")
}
We will load and analyse each list separately, as they contain slightly different metadata depending on context.
raw_hostmetagenome <- load_thedir_data("../../ancientmetagenome-hostassociated/ancientmetagenome-hostassociated.tsv", "Host Associated Metagenome")
raw_hostsinglegenome <- load_thedir_data("../../ancientsinglegenome-hostassociated/ancientsinglegenome-hostassociated.tsv", "Host Associated Single Genome")
raw_environmental <- load_thedir_data("../../ancientmetagenome-environmental/ancientmetagenome-environmental.tsv", "Environmental Metagenome")
#raw_anthropogenic <- load_thedir_data("../../ancientmetagenome-anthropogenic/ancientmetagenome-anthropogenic.tsv", "Anthropogenic Metagenome")
colours <- c(`Host Associated Metagenome` = "#73cff3",
`Host Associated Single Genome` = "#d74182",
`Environmental Metagenome` = "#2da46a")
notused_colours <- c(`Anthropogenic Metagenome` = "#d74182")
figure_publication_time <- stats_pub_timeline(raw_hostmetagenome, raw_hostsinglegenome, raw_environmental) %>% plot_pub_timeline()
Ignoring unknown parameters: bins, binwidth
figure_publication_time
Summary stats
stats_pub_timeline(raw_hostmetagenome, raw_hostsinglegenome, raw_environmental) %>%
select(publication_doi) %>%
arrange %>%
distinct %>%
summarise(n = n())
figure_cumulative_samples <- stats_cumulative_timeline(raw_hostmetagenome, raw_hostsinglegenome, raw_environmental) %>% plot_cumulative_timeline()
`summarise()` regrouping output by 'List' (override with `.groups` argument)
figure_cumulative_samples
Summary stats
stats_cumulative_timeline(raw_hostmetagenome, raw_hostsinglegenome, raw_environmental) %>%
group_by(List) %>%
summarise(total = sum(count))
`summarise()` regrouping output by 'List' (override with `.groups` argument)
`summarise()` ungrouping output (override with `.groups` argument)
figure_timelines <- figure_publication_time + figure_cumulative_samples + plot_layout(ncol = 2) + plot_annotation(tag_levels = 'a')
figure_timelines
ggsave("AncientMetagenomeDir-PublicationSample_Timeline.pdf",
figure_timelines,
device = cairo_pdf(),
units = "in",
width = 3.5,
height = 4,
scale = 2
)
ggsave("AncientMetagenomeDir-PublicationSample_Timeline.png",
figure_timelines,
device = "png",
units = "in",
width = 3.5,
height = 4,
scale = 2
)
With known coordinates!
figure_map <- stats_map(raw_hostmetagenome, raw_hostsinglegenome, raw_environmental) %>% plot_map
`summarise()` regrouping output by 'List', 'geo_loc_name', 'latitude' (override with `.groups` argument)
Scale for 'fill' is already present. Adding another scale for 'fill', which will replace the existing scale.
figure_map
ggsave("AncientMetagenomeDir-Sample_Map.pdf",
figure_map,
device = cairo_pdf(),
units = "in",
width = 1,
height = 2,
scale = 4
)
ggsave("AncientMetagenomeDir-Sample_Map.png",
figure_map,
device = "png",
units = "in",
width = 1,
height = 2,
scale = 4
)
How many countries does this cover?
stats_map(raw_hostmetagenome, raw_hostsinglegenome, raw_environmental) %>%
ungroup() %>%
select(geo_loc_name) %>%
distinct() %>%
summarise(n = n())
`summarise()` regrouping output by 'List', 'geo_loc_name', 'latitude' (override with `.groups` argument)