Synoptic Arctic Survey 2021 - metatranscriptomics and metabarcoding

Author

george.westmeijer@umu.se

Published

April 28, 2026

Load libraries

Expand code
cite_packages(output = "table", pkgs = "Session", out.dir = getwd())
    Package Version   Citation
1      base   4.5.3      @base
2 patchwork   1.3.2 @patchwork
3 tidyverse   2.0.0 @tidyverse
Expand code
cols.layer <- c(
  "10m" = "#78B7C5",
  "chlMax" = "#046C9A"
)

Read input files RNA

The raw data was procesesd using nf-core/metatdenovo (v1.0.1): https://nf-co.re/metatdenovo/1.0.1/ using Prokka as orf caller, GTDB release 220 for taxonomic annotation, and EggNOG mapper for functional annotation.

Expand code
tax <- read_tsv('data/metatdenovo/gtdb_ncbi_merged.tsv.gz', col_types = 'cccccccc')
Expand code
tpm <- read_tsv("data/metatdenovo/user_assembly.prokka.counts.tsv.gz", show_col_types = F) %>% 
  mutate(sample = gsub("R", ".", sample)) %>%
  mutate(sample = factor(sample, levels = c(
    "S1.1", "S1.2", "S2.1", "S2.2", "S3.1", "S3.2", "S4.1", "S4.2", "S5.1", "S6.1", 
    "S5.2", "S6.2", "S7.1", "S7.2", "S8.1", "S8.2", "S9.1", "S9.2", "S10.1", "S10.2"
  ))) %>%
  select(-start, -end,- strand, -length, -chr)

meta <- read_tsv("data/sample_key.tsv", show_col_types = FALSE) %>%
  mutate(station = as.character(station)) %>%
  mutate(sample = gsub("R", ".", sample)) %>%
  mutate(sample = factor(sample, levels = c(
    "S1.1", "S1.2", "S2.1", "S2.2", "S3.1", "S3.2", "S4.1", "S4.2", "S5.1", "S6.1", 
    "S5.2", "S6.2", "S7.1", "S7.2", "S8.1", "S8.2", "S9.1", "S9.2", "S10.1", "S10.2"
  ))) %>%
  mutate(layer = factor(layer, levels = c("10m", "chlMax")))

In total, there are 20 metatranscriptomes from five stations. At every station, samples were taken in duplicates from 10 m depth and from chl. max. On average, each sample has 81.2 million reads (sd 8.7, n = 20). Off all the transcripts, 53.0 % was functionally annotated (Eggnog mapper), 36.5 % was functionally annotated including a KO reference, 51.7 % of the transcripts was annotated on phylum level, 48.0 % on order level, 43.5 % on genus level, and 32.7 % on species level.

63.6 % of the transcript were functionally annotated with Kofam scan. A summary of the bioinformatic pipeline will soon be added to this introduction.

Expand code
kofam <- read_tsv("data/metatdenovo/user_assembly.prokka.kofamscan-uniq.tsv.gz", show_col_types = F) %>%
  select(-thrshld, -score)
Expand code
emapper <- read_tsv("data/metatdenovo/user_assembly.prokka.emapper.tsv.gz", show_col_types = F) %>% 
  select(orf, cog = cog_category)

Data exploration

Percentage of annotated transcripts

Expand code
i <- c("domain","phylum","class","order","family","genus")

for (level in i) {
  ## The (!!as.name(level)) converts the variable level to e.g. 'genus'
  d <- tpm %>% left_join(tax, by = "orf") %>% filter(!is.na((!!as.name(level))))
  format(nrow(d) / nrow(tpm) * 100, digits = 3) %>% paste(stringr::str_to_title(level), ": ", ., " %", sep = "") %>% print()
}
[1] "Domain: 55.4 %"
[1] "Phylum: 51.7 %"
[1] "Class: 50.1 %"
[1] "Order: 48 %"
[1] "Family: 47 %"
[1] "Genus: 43.5 %"
Expand code
i <- c("domain","phylum","class","order","family","genus")

for (level in i) {
  ## The (!!as.name(level)) converts the variable level to e.g. 'genus'
  d <- tpm %>% left_join(tax, by = "orf") %>% inner_join(kofam, by = "orf") %>% filter(!is.na((!!as.name(level))))
  format(nrow(d) / nrow(tpm) * 100, digits = 3) %>% paste(stringr::str_to_title(level), ": ", ., " %", sep = "") %>% print()
}
[1] "Domain: 46.7 %"
[1] "Phylum: 43.2 %"
[1] "Class: 41.8 %"
[1] "Order: 40.2 %"
[1] "Family: 39.2 %"
[1] "Genus: 36 %"

Community composition on domain level

Expand code
tpm %>%
  left_join(tax, by = "orf") %>%
  replace_na(list("domain" = "Unknown")) %>%
  group_by(domain) %>% summarise(tpm = sum(tpm) / 20, .groups = "drop") %>%
  arrange(desc(tpm))
Table 1
# A tibble: 5 × 2
  domain        tpm
  <chr>       <dbl>
1 Unknown   407152.
2 Bacteria  377129.
3 Eukaryota 101564.
4 Archaea    94688.
5 Viruses    19467.

Figure 1 depicts the ratio between bacteria and archaea for both depth layers. Note how the abundance of archaea is slightly higher in most of the deeper samples (chlorophyll max depth).

Expand code
tpm %>%
  inner_join(tax, by = "orf") %>%
  inner_join(meta, by = "sample") %>%
  group_by(sample, domain) %>% summarise(tpm = sum(tpm), .groups = "drop") %>%
  mutate(domain = factor(domain, levels = c("Viruses", "Eukaryota", "Archaea", "Bacteria"))) %>%
  ggplot(aes(
    x = sample,
    y = tpm,
    fill = domain
  )) +
  geom_col() + theme_tidy() +
  scale_y_continuous(breaks = c(0, 2e5, 4e5, 6e5), labels = c(0, 0.2, 0.4, 0.6)) +
  labs(y = expression('Relative abundance (tpm '%*% '10'^'6'*')')) + 
  scale_fill_manual(
    values = c("Bacteria" = "#C7B19C", "Archaea" = "#A2A475", "Viruses" = "#D8B70A", "Eukaryota" = "#FAEFD1")
    ) +
  theme(
    aspect.ratio = 0.5,
    legend.title = element_blank(),
    axis.title.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.text.x = element_blank()
  ) +
  annotate("segment", x =  0.55, xend =  2.45, y = -30000, yend = -30000, color = "#78B7C5") +
  annotate("segment", x =  2.55, xend =  4.45, y = -30000, yend = -30000, color = "#046C9A") +
  annotate("segment", x =  4.55, xend =  6.45, y = -30000, yend = -30000, color = "#78B7C5") +
  annotate("segment", x =  6.55, xend =  8.45, y = -30000, yend = -30000, color = "#046C9A") +
  annotate("segment", x =  8.55, xend = 10.45, y = -30000, yend = -30000, color = "#78B7C5") +
  annotate("segment", x = 10.55, xend = 12.45, y = -30000, yend = -30000, color = "#046C9A") +
  annotate("segment", x = 12.55, xend = 14.45, y = -30000, yend = -30000, color = "#78B7C5") +
  annotate("segment", x = 14.55, xend = 16.45, y = -30000, yend = -30000, color = "#046C9A") +
  annotate("segment", x = 16.55, xend = 18.45, y = -30000, yend = -30000, color = "#78B7C5") +
  annotate("segment", x = 18.55, xend = 20.45, y = -30000, yend = -30000, color = "#046C9A") +
  annotate("label", x = 2.5, y = -30000, label = "5", label.size = 0, size = 6/.pt) +
  annotate("label", x = 6.5, y = -30000, label = "26", label.size = 0, size = 6/.pt) +
  annotate("label", x =10.5, y = -30000, label = "42", label.size = 0, size = 6/.pt) +
  annotate("label", x =14.5, y = -30000, label = "53", label.size = 0, size = 6/.pt) +
  annotate("label", x =18.5, y = -30000, label = "56", label.size = 0, size = 6/.pt)
Warning in annotate("label", x = 2.5, y = -30000, label = "5", label.size = 0,
: Ignoring unknown parameters: `label.size`
Warning in annotate("label", x = 6.5, y = -30000, label = "26", label.size = 0,
: Ignoring unknown parameters: `label.size`
Warning in annotate("label", x = 10.5, y = -30000, label = "42", label.size =
0, : Ignoring unknown parameters: `label.size`
Warning in annotate("label", x = 14.5, y = -30000, label = "53", label.size =
0, : Ignoring unknown parameters: `label.size`
Warning in annotate("label", x = 18.5, y = -30000, label = "56", label.size =
0, : Ignoring unknown parameters: `label.size`
Figure 1: Community overview on domain level, removing transcripts without a taxonomic annotation.

Community composition

Community composition - order

Expand code
t <- tpm %>%
  inner_join(tax, by = "orf") %>% filter(domain %in% c("Bacteria", "Archaea")) %>%
  group_by(order, sample) %>%
  # Sum the abundance of each phylum within a sample
  summarise(tpm = sum(tpm), .groups = 'drop_last') %>%
  # Calculate the mean abundance of each phylum over the categories
  summarise(mean.tpm = sum(tpm), .groups = 'drop') %>%
  filter(!is.na(order)) %>%
  top_n(9, mean.tpm)

t %>% arrange(desc(mean.tpm))
# A tibble: 9 × 2
  order             mean.tpm
  <chr>                <dbl>
1 Pseudomonadales   1270276.
2 Poseidoniales     1188108.
3 Pelagibacterales   775835.
4 PS1                651304.
5 Nitrososphaerales  632899.
6 Flavobacteriales   626936.
7 Enterobacterales   333125.
8 SAR86              248401.
9 Rhodobacterales    204835.

Figure 2 explores the data by plotting the most abundant orders over all samples, regardless if the RNA transcript was functionally annotated or not. Note that transcripts without a taxonomic annotation (the unidentified fraction in Figure 3) are not included in this figure. It clearly shows how the Nitrosphaerales are more abundant in the deeper layer.

Expand code
p.order <- tpm %>%
  left_join(tax, by = "orf") %>%
  mutate(order = case_when(
    order %in% t$order ~ order,
    domain == "Eukaryota" ~ "Eukaryotes",
    is.na(order) ~ "Unidentified",
    !order %in% t$order ~ "Other"
  )) %>%
  filter(order != "Unidentified") %>%
  # Summarize in order to have the sum for each category and topphylum
  group_by(sample, order) %>% 
  summarise(tpm = sum(tpm), .groups = 'drop') %>%
  # Call the plot
  ggplot(aes(x = sample, 
             y = tpm, 
             fill = fct_relevel(order, rev(names(cols.order)))
             )
         ) +
  labs(x = '', y = expression('Relative abundance (TPM '%*% '10'^'6'*')'), fill = "Order") +
  geom_col() +
  scale_y_continuous(breaks = c(0, 2e5, 4e5, 6e5, 8e5, 1e6),
                labels = c("0.0", "0.2", "0.4", "0.6", "0.8", "1.0")
                ) +
  scale_fill_manual(values = cols.order) +
  theme_tidy(legend = "right", fontsize = 5) +
  #guides(fill = guide_legend(nrow = 3, reverse = TRUE)) +
  theme(
    axis.ticks.x = element_blank(),
    legend.key.width = unit(3.5, "mm"),
    legend.key.height = unit(4, "mm"),
    legend.margin = margin(t = -10),
    legend.text = element_text(margin = margin(l = 2)),
    axis.text.x = element_blank()
    ) +
  annotate("segment", x =  0.55, xend =  2.45, y = -25000, yend = -25000, color = "#78B7C5") +
  annotate("segment", x =  2.55, xend =  4.45, y = -25000, yend = -25000, color = "#046C9A") +
  annotate("segment", x =  4.55, xend =  6.45, y = -25000, yend = -25000, color = "#78B7C5") +
  annotate("segment", x =  6.55, xend =  8.45, y = -25000, yend = -25000, color = "#046C9A") +
  annotate("segment", x =  8.55, xend = 10.45, y = -25000, yend = -25000, color = "#78B7C5") +
  annotate("segment", x = 10.55, xend = 12.45, y = -25000, yend = -25000, color = "#046C9A") +
  annotate("segment", x = 12.55, xend = 14.45, y = -25000, yend = -25000, color = "#78B7C5") +
  annotate("segment", x = 14.55, xend = 16.45, y = -25000, yend = -25000, color = "#046C9A") +
  annotate("segment", x = 16.55, xend = 18.45, y = -25000, yend = -25000, color = "#78B7C5") +
  annotate("segment", x = 18.55, xend = 20.45, y = -25000, yend = -25000, color = "#046C9A") +
  annotate("label", x = 2.5, y = -25000, label = "Station 5", label.size = 0, size = 5/.pt) +
  annotate("label", x = 6.5, y = -25000, label = "26", label.size = 0, size = 5/.pt) +
  annotate("label", x =10.5, y = -25000, label = "42", label.size = 0, size = 5/.pt) +
  annotate("label", x =14.5, y = -25000, label = "53", label.size = 0, size = 5/.pt) +
  annotate("label", x =18.5, y = -25000, label = "56", label.size = 0, size = 5/.pt)
Warning in annotate("label", x = 2.5, y = -25000, label = "Station 5",
label.size = 0, : Ignoring unknown parameters: `label.size`
Warning in annotate("label", x = 6.5, y = -25000, label = "26", label.size = 0,
: Ignoring unknown parameters: `label.size`
Warning in annotate("label", x = 10.5, y = -25000, label = "42", label.size =
0, : Ignoring unknown parameters: `label.size`
Warning in annotate("label", x = 14.5, y = -25000, label = "53", label.size =
0, : Ignoring unknown parameters: `label.size`
Warning in annotate("label", x = 18.5, y = -25000, label = "56", label.size =
0, : Ignoring unknown parameters: `label.size`
Expand code
p.order
Figure 2: Community composition of all RNA transcripts on order level. The orders are sorted according to tpm while grouping low-abundant orders as ‘Other’. Transcripts without taxonomic annotation are not included. For each station, the two bars left of the label were sampled at 10 m depth, while the two bars on the right were sampled at chlorophyll max (between 20 and 45 m depth)

Community composition - phylum

Expand code
t <- tpm %>%
  inner_join(tax, by = "orf") %>% filter(domain %in% c("Bacteria", "Archaea")) %>%
  group_by(phylum, sample) %>%
  # Sum the abundance of each phylum within a sample
  summarise(tpm = sum(tpm), .groups = 'drop_last') %>%
  # Calculate the mean abundance of each phylum over the categories
  summarise(mean.tpm = sum(tpm), .groups = 'drop') %>%
  filter(!is.na(phylum)) %>%
  top_n(8, mean.tpm)

t %>% arrange(desc(mean.tpm))
# A tibble: 8 × 2
  phylum           mean.tpm
  <chr>               <dbl>
1 Pseudomonadota   4848562.
2 Thermoplasmatota 1203098.
3 Bacteroidota      791534.
4 Thermoproteota    644346.
5 Cyanobacteriota   604239.
6 Marinisomatota    225380.
7 Chloroflexota     202820.
8 SAR324            147854.

Figure 3 explores the data by plotting the most abundant phyla over all samples, regardless if the RNA transcript was functionally annotated or not. Note that the taxonomy is outdated and needs to be updated (e.g. Pseudomonadota instead of Proteobacteria). 87.7 % of the annotated ORFs are affiliated with bacteria, while 13.1 % is affiliated with Archaea.

Expand code
p.phylum <- tpm %>%
  left_join(tax, by = "orf") %>%
  mutate(phylum = case_when(
    phylum %in% t$phylum ~ phylum,
    domain == "Eukaryota" ~ "Eukaryotes",
    is.na(phylum) ~ "Unidentified",
    !phylum %in% t$phylum ~ "Other"
  )) %>%
  filter(phylum != "Unidentified") %>%
  # Summarize in order to have the sum for each category and topphylum
  group_by(sample, phylum) %>% 
  summarise(tpm = sum(tpm), .groups = 'drop') %>%
  # Call the plot
  ggplot(aes(x = sample, 
             y = tpm, 
             fill = fct_relevel(phylum, names(cols.phylum) %>% rev())
             )
         ) +
  labs(x = '', y = expression('Relative abundance (tpm '%*% '10'^'6'*')'), fill = "") +
  geom_col() +
  scale_y_continuous(breaks = c(0, 2e5, 4e5, 6e5, 8e5, 1e6),
                labels = c("0.0", "0.2", "0.4", "0.6", "0.8", "1.0")
                ) +
  scale_fill_manual(values = cols.phylum) +
  theme_tidy(fontsize = 5) + 
  theme(
    aspect.ratio = 0.5,
    axis.ticks.x = element_blank(),
    legend.key.height = unit(3.5, "mm"),
    legend.margin = margin(),
    panel.border = element_rect(fill = "transparent", linewidth = 0.5),
    axis.text.x = element_blank()
    ) +
  annotate("segment", x =  0.55, xend =  2.45, y = -30000, yend = -30000, color = "#78B7C5") +
  annotate("segment", x =  2.55, xend =  4.45, y = -30000, yend = -30000, color = "#046C9A") +
  annotate("segment", x =  4.55, xend =  6.45, y = -30000, yend = -30000, color = "#78B7C5") +
  annotate("segment", x =  6.55, xend =  8.45, y = -30000, yend = -30000, color = "#046C9A") +
  annotate("segment", x =  8.55, xend = 10.45, y = -30000, yend = -30000, color = "#78B7C5") +
  annotate("segment", x = 10.55, xend = 12.45, y = -30000, yend = -30000, color = "#046C9A") +
  annotate("segment", x = 12.55, xend = 14.45, y = -30000, yend = -30000, color = "#78B7C5") +
  annotate("segment", x = 14.55, xend = 16.45, y = -30000, yend = -30000, color = "#046C9A") +
  annotate("segment", x = 16.55, xend = 18.45, y = -30000, yend = -30000, color = "#78B7C5") +
  annotate("segment", x = 18.55, xend = 20.45, y = -30000, yend = -30000, color = "#046C9A") +
  annotate("label", x = 2.5, y = -30000, label = "Station 5", label.size = 0, size = 5/.pt) +
  annotate("label", x = 6.5, y = -30000, label = "26", label.size = 0, size = 5/.pt) +
  annotate("label", x =10.5, y = -30000, label = "42", label.size = 0, size = 5/.pt) +
  annotate("label", x =14.5, y = -30000, label = "53", label.size = 0, size = 5/.pt) +
  annotate("label", x =18.5, y = -30000, label = "56", label.size = 0, size = 5/.pt)
Warning in annotate("label", x = 2.5, y = -30000, label = "Station 5",
label.size = 0, : Ignoring unknown parameters: `label.size`
Warning in annotate("label", x = 6.5, y = -30000, label = "26", label.size = 0,
: Ignoring unknown parameters: `label.size`
Warning in annotate("label", x = 10.5, y = -30000, label = "42", label.size =
0, : Ignoring unknown parameters: `label.size`
Warning in annotate("label", x = 14.5, y = -30000, label = "53", label.size =
0, : Ignoring unknown parameters: `label.size`
Warning in annotate("label", x = 18.5, y = -30000, label = "56", label.size =
0, : Ignoring unknown parameters: `label.size`
Expand code
p.phylum
Figure 3: Community composition of all RNA transcripts on phylum level. The phyla are sorted according to relative abundance while grouping low-abundant phyla as ‘Other’ and transcripts without any taxonomic annotation as ‘Unidentified’

Community composition - genus

Expand code
t <- tpm %>%
  inner_join(tax, by = "orf") %>% filter(domain %in% c("Bacteria", "Archaea")) %>%
  group_by(genus, sample) %>%
  # Sum the abundance of each phylum within a sample
  summarise(tpm = sum(tpm), .groups = 'drop_last') %>%
  # Calculate the mean abundance of each phylum over the categories
  summarise(mean.tpm = sum(tpm), .groups = 'drop') %>%
  filter(!is.na(genus)) %>%
  top_n(10, mean.tpm)

t %>% left_join(tax, by = "genus") %>% 
  select(genus, order, phylum, mean.tpm) %>% 
  distinct() %>%
  arrange(desc(mean.tpm)) %>%
  rename(Genus = genus, Order = order, Phylum = phylum) %>%
  knitr::kable()
Twenty most abundant genera over all samples, based on tpm. The order and phylum of the respective genera are also displayed.
Genus Order Phylum mean.tpm
MGIIb-O3 Poseidoniales Thermoplasmatota 952516.8
Pseudothioglobus PS1 Pseudomonadota 587638.1
Pelagibacter Pelagibacterales Pseudomonadota 545749.0
Nitrosopumilus Nitrososphaerales Thermoproteota 436459.6
HTCC2207 Pseudomonadales Pseudomonadota 325660.1
UBA9145 Pseudomonadales Pseudomonadota 302374.6
Lutibacter Flavobacteriales Bacteroidota 242030.1
Photorhabdus Enterobacterales Pseudomonadota 214127.5
ASP10-02a Pseudomonadales Pseudomonadota 204665.4
Arctic96AD-7 SAR324 SAR324 141825.1

Figure 4 shows the most abundant genera. It reveals how some genera (e.g. Pelagibacter and Ketobacter) are more abundant (and active) in the 10 m depth, while for example Nitrosopumilus is more abundant in the deeper sites. Note from the metadata that the depth of chlorophyll max strongly varies between the stations.

Expand code
p.genus <- tpm %>%
  left_join(tax, by = "orf") %>%
  mutate(genus = case_when(
    genus %in% t$genus ~ genus,
    is.na(genus) ~ "Unidentified",
    !genus %in% t$genus ~ "Other"
  )) %>%
  filter(genus != "Unidentified") %>%
  # Summarize in order to have the sum for each category and topphylum
  group_by(sample, genus) %>% 
  summarise(tpm = sum(tpm), .groups = 'drop') %>%
  # Call the plot
  ggplot(aes(x = sample, 
             y = tpm, 
             fill = fct_relevel(genus, rev(names(cols.genus)))
             )
         ) +
  labs(x = '', y = expression('Relative abundance (tpm '%*% '10'^'6'*')'), fill = "Genus") +
  geom_col() +
  scale_y_continuous(breaks = c(0, 2e5, 4e5, 6e5),
                labels = c("0.0", "0.2", "0.4", "0.6")
                ) +
  scale_fill_manual(values = cols.genus) +
  theme_tidy(legend = "right", fontsize = 5) +
  theme(
    aspect.ratio = 0.5,
    axis.ticks.x = element_blank(),
    legend.key.height = unit(3.5, "mm"),
    axis.text.x = element_blank(),
    legend.title = element_blank(),
    legend.margin = margin()
    ) +
  annotate("segment", x =  0.55, xend =  2.45, y = -30000, yend = -30000, color = "#78B7C5") +
  annotate("segment", x =  2.55, xend =  4.45, y = -30000, yend = -30000, color = "#046C9A") +
  annotate("segment", x =  4.55, xend =  6.45, y = -30000, yend = -30000, color = "#78B7C5") +
  annotate("segment", x =  6.55, xend =  8.45, y = -30000, yend = -30000, color = "#046C9A") +
  annotate("segment", x =  8.55, xend = 10.45, y = -30000, yend = -30000, color = "#78B7C5") +
  annotate("segment", x = 10.55, xend = 12.45, y = -30000, yend = -30000, color = "#046C9A") +
  annotate("segment", x = 12.55, xend = 14.45, y = -30000, yend = -30000, color = "#78B7C5") +
  annotate("segment", x = 14.55, xend = 16.45, y = -30000, yend = -30000, color = "#046C9A") +
  annotate("segment", x = 16.55, xend = 18.45, y = -30000, yend = -30000, color = "#78B7C5") +
  annotate("segment", x = 18.55, xend = 20.45, y = -30000, yend = -30000, color = "#046C9A") +
  annotate("label", x = 2.5, y = -30000, label = "Station 5", label.size = 0, size = 5/.pt) +
  annotate("label", x = 6.5, y = -30000, label = "26", label.size = 0, size = 5/.pt) +
  annotate("label", x =10.5, y = -30000, label = "42", label.size = 0, size = 5/.pt) +
  annotate("label", x =14.5, y = -30000, label = "53", label.size = 0, size = 5/.pt) +
  annotate("label", x =18.5, y = -30000, label = "56", label.size = 0, size = 5/.pt)
Warning in annotate("label", x = 2.5, y = -30000, label = "Station 5",
label.size = 0, : Ignoring unknown parameters: `label.size`
Warning in annotate("label", x = 6.5, y = -30000, label = "26", label.size = 0,
: Ignoring unknown parameters: `label.size`
Warning in annotate("label", x = 10.5, y = -30000, label = "42", label.size =
0, : Ignoring unknown parameters: `label.size`
Warning in annotate("label", x = 14.5, y = -30000, label = "53", label.size =
0, : Ignoring unknown parameters: `label.size`
Warning in annotate("label", x = 18.5, y = -30000, label = "56", label.size =
0, : Ignoring unknown parameters: `label.size`
Expand code
p.genus
Figure 4: Community composition of all RNA transcripts on genus level. Only the top ten most abundant genera are shown, accounting for ~ 20 % of the tpm on average.
Expand code
p.phylum / p.genus + plot_annotation(tag_levels = "a") & 
  theme(
    plot.tag.location = "panel",
    plot.tag.position = c(0.03, 0.95)
  )

Expand code
ggsave(filename = "figures/community-genus-phylum.pdf", width = 180, height = 160, units = "mm")

Redundancy analysis (RDA)

Expand code
hydro <- readxl::read_excel("data/primarydata.xlsx", sheet = 1) %>%
  mutate(station = as.character(station)) %>%
  inner_join(meta, by = c("station", "layer", "replicate")) %>%
  rename_with(tolower) %>%
  mutate(sample = factor(sample, levels = c(
    "S1.1", "S1.2", "S2.1", "S2.2", "S3.1", "S3.2", "S4.1", "S4.2", "S5.1", "S6.1", 
    "S5.2", "S6.2", "S7.1", "S7.2", "S8.1", "S8.2", "S9.1", "S9.2", "S10.1", "S10.2"
  ))) %>% arrange(sample)
Expand code
m <- tpm %>%
  inner_join(kofam, by = "orf") %>% group_by(sample, ko) %>% summarise(tpm = sum(tpm), .groups = "drop") %>%
  # Standard Vegan transformation: Turn table with with samples as *rows*
  select(sample, ko, tpm) %>%
  pivot_wider(names_from = "ko", values_from = "tpm", values_fill = 0) %>%
  # Turn into a numeric matrix
  column_to_rownames('sample') %>% vegan::decostand(method = "hellinger") %>%
  rownames_to_column("sample") %>%
  pivot_longer(-1, names_to = "ko", values_to = "hellinger")

m <- m %>% # Create a matrix
  spread(ko, hellinger) %>%
  mutate(sample = factor(sample, levels = c(
    "S1.1", "S1.2", "S2.1", "S2.2", "S3.1", "S3.2", "S4.1", "S4.2", "S5.1", "S6.1", 
    "S5.2", "S6.2", "S7.1", "S7.2", "S8.1", "S8.2", "S9.1", "S9.2", "S10.1", "S10.2"
  ))) %>% arrange(sample) %>%
  column_to_rownames("sample")
Expand code
set.seed(999)

vegan::rda(
  m ~ depth + doc + tdp + tdn + chlach, 
  correlation = T,
  data = hydro %>%
    column_to_rownames('sample') 
) -> rda

# This one will contain the proportions explained which we get by dividing the
# eigenvalue of each component with the sum of eigenvalues for all components.
p <- c(rda$CCA$eig, rda$CA$eig)/sum(c(rda$CCA$eig, rda$CA$eig))
# This one is collecting the coordinates for arrows that will depict how the 
# factors in our model point in the coordinate
a <- rda$CCA$biplot %>% data.frame() %>% tibble::rownames_to_column('factor')

lab <- tibble(factor = a$factor,
            label = c("Depth", "", "TDP & DOC", "TDN", "Chl.")
) %>% inner_join(a, by = "factor")

summary(rda)

Call:
rda(formula = m ~ depth + doc + tdp + tdn + chlach, data = hydro %>%      column_to_rownames("sample"), correlation = T) 

Partitioning of variance:
              Inertia Proportion
Total         0.12901     1.0000
Constrained   0.08715     0.6756
Unconstrained 0.04185     0.3244

Eigenvalues, and their contribution to the variance 

Importance of components:
                         RDA1    RDA2     RDA3     RDA4    RDA5     PC1
Eigenvalue            0.05428 0.01333 0.008325 0.007262 0.00396 0.01153
Proportion Explained  0.42073 0.10331 0.064530 0.056293 0.03070 0.08935
Cumulative Proportion 0.42073 0.52404 0.588568 0.644861 0.67556 0.76491
                           PC2      PC3      PC4      PC5      PC6      PC7
Eigenvalue            0.006923 0.006138 0.004627 0.003283 0.002629 0.001858
Proportion Explained  0.053661 0.047575 0.035864 0.025451 0.020377 0.014403
Cumulative Proportion 0.818568 0.866144 0.902008 0.927459 0.947836 0.962239
                           PC8       PC9      PC10      PC11      PC12
Eigenvalue            0.001149 0.0009144 0.0007716 0.0005756 0.0005587
Proportion Explained  0.008908 0.0070881 0.0059811 0.0044620 0.0043310
Cumulative Proportion 0.971147 0.9782351 0.9842161 0.9886781 0.9930091
                           PC13      PC14
Eigenvalue            0.0004823 0.0004195
Proportion Explained  0.0037389 0.0032520
Cumulative Proportion 0.9967480 1.0000000

Accumulated constrained eigenvalues
Importance of components:
                         RDA1    RDA2     RDA3     RDA4    RDA5
Eigenvalue            0.05428 0.01333 0.008325 0.007262 0.00396
Proportion Explained  0.62278 0.15292 0.095521 0.083328 0.04544
Cumulative Proportion 0.62278 0.77571 0.871229 0.954557 1.00000
Expand code
vegan::RsquareAdj(rda)
$r.squared
[1] 0.6755601

$adj.r.squared
[1] 0.5596887
Expand code
p.rda <- rda$CCA$wa %>% data.frame() %>% tibble::rownames_to_column('sample') %>%
  inner_join(hydro, by = 'sample') %>%
  ggplot(aes(x = RDA1, y = RDA2)) +
  geom_vline(xintercept = 0, colour = "grey", linetype = "dotted") +
  geom_hline(yintercept = 0, colour = "grey", linetype = "dotted") +
  geom_point(aes(colour = layer, fill = layer), size = 5, shape = 21, stroke = 0.75) +
  xlab(sprintf("RDA1 (%2.1f%% explained)", p[[1]] * 100)) +
  ylab(sprintf("RDA2 (%2.1f%% explained)", p[[2]] * 100)) +
  geom_text(aes(label = station), size = 5/.pt, color = "black") +
  geom_segment(
    data = a, mapping = aes(xend = RDA1/4, yend = RDA2/4), linewidth = 0.3,
    x = 0, y = 0, arrow = arrow(length = unit(1.5, "mm"), type = "closed") 
    ) +
  geom_label(
    data = lab, mapping = aes(x = RDA1/8, y = RDA2/8, label = label),
    size = 5/.pt, parse = F, label.size = 0
    ) + 
  # Add R-squared
  annotate("text", x = Inf, y = -Inf, 
           label = "paste(R ^ 2-adj, \" = 0.56\")", 
           size = 5/.pt, vjust = -0.5, hjust = 1.05, parse = TRUE) +
  scale_color_manual(values = cols.layer, labels = c("10 m", "Chl. max"), name = "") +  
  scale_fill_manual(values = alpha(cols.layer, alpha = 0.2), guide = "none") +      
  theme_tidy(fontsize = 5) +
  theme(
    legend.background = element_rect(fill = NA),
    legend.text = element_text(margin = margin(r = 12, unit = "pt")),
    legend.position = "inside",
    legend.position.inside  = c(0.89,0.93)
    )
Warning: The `label.size` argument of `geom_label()` is deprecated as of ggplot2 3.5.0.
ℹ Please use the `linewidth` argument instead.
Expand code
p.rda

Expand code
(p.order + p.rda) + plot_annotation(tag_levels = "a") & theme(
  plot.margin = margin(r = 2, t = 2),
  plot.tag.location = "panel",
  plot.tag.position = c(0.03, 0.97),
  )
Figure 5
Expand code
ggsave(filename = "figures/rda.pdf", width = 180, height = 80, units = "mm")

Broad overview community functions using COG categories

Expand code
cols.order <- c( # Sorted on abundance
  "Pseudomonadales" = "#C7B19C",
  "Poseidoniales" = "#D69C4E",
  "Pelagibacterales" = "#972D15",
  "PS1" = "#FAEFD1",
  "Flavobacteriales" = "#D8B70A",
  "Nitrososphaerales" = "#A2A475",
  "Cyanobacteriales" = "#00A08A",
  "SAR86" = "#1B5331",
  "Rhodobacterales" = "#9986A5",
  "Other" = "#D5D5D3",
  "Not annotated" = "#046C9A"
) %>% rev()
Expand code
cogs <- read_tsv("data/cogs.tsv", show_col_types = FALSE) %>%
  rename(cog = cog_category) %>%
  filter(cog %in% c('C','J','P','G','U','M'))

i <- emapper %>% 
  inner_join(cogs, by = "cog") %>%
  inner_join(tpm, by = "orf") %>%
  inner_join(meta, by = "sample") %>%
  inner_join(tax, by = "orf") %>%
  replace_na(list(order = 'Not annotated')) %>%
  mutate(order = case_when(
    order %in% names(cols.order) ~ order,
    !order %in% names(cols.order) ~ 'Other',
    .default = NA
  )) %>%
  group_by(cat, station, layer, order) %>% 
  summarise(tpm = sum(tpm), .groups = "drop") %>%
  # Prepare the labels
  mutate(layer = if_else(layer == '10m', '10 m', 'chl max.')) %>%
  mutate(layer = paste(station, ' (', layer, ')', sep = '')) %>%
  mutate(layer = factor(layer, levels = 
                          c('5 (10 m)', '5 (chl max.)', 
                            '26 (10 m)', '26 (chl max.)', 
                            '42 (10 m)', '42 (chl max.)',
                            '53 (10 m)', '53 (chl max.)',
                            '56 (10 m)', '56 (chl max.)'
                        )))
Expand code
i %>%
  mutate(cat = gsub(' \\[[A-Z]\\]', '', cat)) %>%
  ggplot(aes(
    x = tpm,
    y = fct_rev(layer),
    fill = fct_relevel(order, names(cols.order))
  )) +
  geom_col() + facet_wrap(~ cat, scales = 'free_x') + labs(x = 'Relative abundance (TPM)', fill = 'Order') + 
  scale_x_continuous(labels = scales::label_comma()) +
  scale_fill_manual(values = cols.order) +
  theme_tidy(fontsize = 5) + 
  theme(
    axis.title.y = element_blank(),
    axis.ticks.y = element_blank(),
    legend.key.width = unit(4, 'mm'), legend.key.height = unit(5, 'mm')
  )
Figure 6: The abundance (in tpm) of COG categories summed for each depth layer.
Expand code
ggsave('figures/cogs-stations.pdf', width = 180, height = 140, units = 'mm')

Functional annotation - most abundant KOs

Expand code
kofam %>%
  inner_join(tpm, by = "orf") %>%
  group_by(ko, ko_definition) %>% summarise(tpm = sum(tpm), .groups = "drop") %>%
  slice_max(tpm, n = 100) %>%
  knitr::kable(digits = 0, format = "simple")
The most abundant KEGG orthologs (KO), their description (from Eggnog), and the summed tpm of each KO.
ko ko_definition tpm
K02703 photosystem II P680 reaction center D1 protein [EC:1.10.3.9] 627346
K14178 amorpha-4,11-diene synthase [EC:4.2.3.24] 331289
K07374 tubulin alpha 313355
K03320 ammonium transporter, Amt family 284483
K03283 heat shock 70kDa protein 1/2/6/8 207749
K16116 pristinamycin I synthetase 1 165412
K02002 glycine betaine/proline transport system substrate-binding protein 118536
K11294 nucleolin 104259
K02358 elongation factor Tu 100407
K02256 cytochrome c oxidase subunit 1 [EC:7.1.1.9] 88388
K21395 TRAP-type transport system periplasmic protein 78857
K04643 sensory rhodopsin 76747
K00336 NADH-quinone oxidoreductase subunit G [EC:7.1.1.2] 70494
K10946 methane/ammonia monooxygenase subunit C 69480
K02710 photosystem II PsbI protein 67351
K02707 photosystem II cytochrome b559 subunit alpha 65452
K25467 secretoglobin family 2A (mammaglobin) 63538
K14016 ubiquitin fusion degradation protein 1 61662
K05284 GPI mannosyltransferase 1 subunit M [EC:2.4.1.-] 58292
K01601 ribulose-bisphosphate carboxylase large chain [EC:4.1.1.39] 55658
K14393 cation/acetate symporter 55339
K02704 photosystem II CP47 chlorophyll apoprotein 54066
K02711 photosystem II PsbJ protein 51502
K02705 photosystem II CP43 chlorophyll apoprotein 49359
K02718 photosystem II PsbT protein 47237
K01074 palmitoyl-protein thioesterase [EC:3.1.2.22] 45535
K08720 outer membrane protein OmpU 45487
K02697 photosystem I subunit IX 44336
K02110 F-type H+-transporting ATPase subunit c 41208
K03704 cold shock protein 40770
K10408 dynein axonemal heavy chain 39878
K04078 chaperonin GroES 38936
K02128 F-type H+-transporting ATPase subunit c 37358
K02014 iron complex outermembrane recepter protein 37218
K00412 ubiquinol-cytochrome c reductase cytochrome b subunit 36617
K15210 snRNA-activating protein complex subunit 3 35474
K03040 DNA-directed RNA polymerase subunit alpha [EC:2.7.7.6] 34003
K02126 F-type H+-transporting ATPase subunit a 32951
K00410 ubiquinol-cytochrome c reductase cytochrome b/c1 subunit 32685
K15551 taurine transport system substrate-binding protein 32549
K02950 small subunit ribosomal protein S12 32518
K09969 general L-amino acid transport system substrate-binding protein 32510
K15738 ABC transport system ATP-binding/permease protein 32359
K02261 cytochrome c oxidase subunit 2 31900
K04659 thrombospondin 2/3/4/5 31820
K02262 cytochrome c oxidase subunit 3 31691
K02055 putative spermidine/putrescine transport system substrate-binding protein 30024
K07335 basic membrane protein A and related proteins 28741
K13049 carboxypeptidase PM20D1 [EC:3.4.17.-] 28712
K02712 photosystem II PsbK protein 28349
K02992 small subunit ribosomal protein S7 26946
K02027 multiple sugar transport system substrate-binding protein 26593
K02864 large subunit ribosomal protein L10 26404
K02108 F-type H+-transporting ATPase subunit a 26390
K03530 DNA-binding protein HU-beta 26093
K02114 F-type H+-transporting ATPase subunit epsilon 25872
K02706 photosystem II P680 reaction center D2 protein [EC:1.10.3.9] 25397
K02935 large subunit ribosomal protein L7/L12 25388
K04968 transient receptor potential cation channel subfamily C member 5 25164
K02274 cytochrome c oxidase subunit I [EC:7.1.1.9] 24163
K00368 nitrite reductase (NO-forming) [EC:1.7.2.1] 23730
K02109 F-type H+-transporting ATPase subunit b 23473
K07795 putative tricarboxylic transport membrane protein 22989
K02640 cytochrome b6-f complex subunit 5 22808
K11244 cell wall integrity and stress response component 22290
K04077 chaperonin GroEL [EC:5.6.1.7] 22116
K10944 methane/ammonia monooxygenase subunit A [EC:1.14.18.3 1.14.99.39] 20857
K07375 tubulin beta 20511
K02058 simple sugar transport system substrate-binding protein 20436
K02690 photosystem I P700 chlorophyll a apoprotein A2 20313
K17644 norsolorinic acid ketoreductase [EC:1.1.1.349] 20203
K02996 small subunit ribosomal protein S9 20109
K02952 small subunit ribosomal protein S13 20018
K01602 ribulose-bisphosphate carboxylase small chain [EC:4.1.1.39] 20002
K02874 large subunit ribosomal protein L14 19822
K02887 large subunit ribosomal protein L20 19548
K03935 NADH dehydrogenase (ubiquinone) Fe-S protein 2 [EC:7.1.1.2] 19480
K02982 small subunit ribosomal protein S3 19430
K02945 small subunit ribosomal protein S1 19321
K02111 F-type H+/Na+-transporting ATPase subunit alpha [EC:7.1.2.2 7.2.2.1] 19255
K00053 ketol-acid reductoisomerase [EC:1.1.1.86] 19188
K02906 large subunit ribosomal protein L3 19139
K06806 cadherin 19, type 2 18980
K02035 peptide/nickel transport system substrate-binding protein 18898
K02954 small subunit ribosomal protein S14 18745
K02133 F-type H+-transporting ATPase subunit beta [EC:7.1.2.2] 18269
K00138 aldehyde dehydrogenase [EC:1.2.1.-] 18228
K19824 rubrerythrin 18202
K16815 calcium-independent phospholipase A2-gamma 17831
K00344 NADPH:quinone reductase [EC:1.6.5.5] 17640
K02965 small subunit ribosomal protein S19 17463
K03574 8-oxo-dGTP diphosphatase [EC:3.6.1.55] 17142
K05849 solute carrier family 8 (sodium/calcium exchanger) 17097
K20410 target of rapamycin complex 2 subunit MAPKAP1/AVO1 16779
K01999 branched-chain amino acid transport system substrate-binding protein 16584
K02899 large subunit ribosomal protein L27 16160
K04564 superoxide dismutase, Fe-Mn family [EC:1.15.1.1] 16108
K04641 bacteriorhodopsin 16015
K11253 histone H3 15958
K02638 plastocyanin 15927

The gene list is from Royo-Llonch et al. [1]. It contains the KEGG ortholog (KO) and the pathway the gene is involved in. KOs of interest in the data that are not represented in the gene list are manually categorized. Shows the RNA transcripts for the carbon fixation pathways for each depth layer. To do so, one KO catalyzing the rate-limiting reaction was selected for each pathway. The Wood-Ljungdahl pathway was also included in this analysis but the representing KO was not detected, or the detected KO did not have a taxonomic annotation.

Expand code
cols.ko <- c( # Sorted on abundance
  "Nitrososphaerales" = "#A2A475",
  "Cyanobacteriales" = "#D69C4E",
  "Flavobacteriales" = "#D8B70A",
  "Burkholderiales" = "#78B7C5",
  "Pseudomonadales" = "#C7B19C",
  "Chitinophagales" = "#00A08A",
  "Rhodobacterales" = "#046C9A",
  "Campylobacterales" = "#1B5331",
  "Pelagibacterales" = "#972D15",
  "PS1" = "#FAEFD1",
  "Other" = "#D5D5D3",
  "Not annotated" = "#8D8680"
) %>% rev()
Expand code
k <- read_tsv('data/genelist.tsv', col_types = 'cccc') %>%
  filter(pathway != 'Remodelling')
Expand code
kos <- kofam %>% 
  filter(evalue < 1e-3) %>%
  # Use only the KOs from the gene list
  inner_join(k, by = 'ko') %>%
  left_join(tax, by = "orf") %>%
  replace_na(list("domain" = "Bacteria", "order" = "Not annotated", "phylum" = "Not annotated")) %>%
  filter(domain %in% c("Bacteria", "Archaea")) %>%
  # Add the tpm for each ORF and add the metadata
  inner_join(tpm, by = "orf") %>%
  inner_join(meta, by = "sample") %>%
  mutate(
    order = if_else(order %in% names(cols.ko), order, "Other"),
    order = factor(order, levels = names(cols.ko))
    ) %>%
  # Sum the tpm 
  group_by(pathway, metabolism, layer, order) %>% summarise(tpm = sum(tpm), .groups = "drop")
Warning in inner_join(., k, by = "ko"): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 20998 of `x` matches multiple rows in `y`.
ℹ Row 2 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship =
  "many-to-many"` to silence this warning.
Warning in inner_join(., tpm, by = "orf"): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 1 of `x` matches multiple rows in `y`.
ℹ Row 23645 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship =
  "many-to-many"` to silence this warning.
Expand code
kofam %>% 
  filter(evalue < 1e-3) %>%
  # Use only the KOs from the gene list
  inner_join(k, by = 'ko') %>%
  left_join(tax, by = "orf") %>%
  replace_na(list("domain" = "Bacteria", "order" = "Not annotated", "phylum" = "Not annotated")) %>%
  filter(domain %in% c("Bacteria", "Archaea")) %>%
  # Add the tpm for each ORF and add the metadata
  inner_join(tpm, by = "orf") %>%
  filter(metabolism %in% c('N metabolism','P metabolism','Central C metabolism')) %>%
  group_by(order) %>% summarise(n = sum(tpm)) %>% slice_max(n, n = 11)
Warning in inner_join(., k, by = "ko"): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 20998 of `x` matches multiple rows in `y`.
ℹ Row 2 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship =
  "many-to-many"` to silence this warning.
Warning in inner_join(., tpm, by = "orf"): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 1 of `x` matches multiple rows in `y`.
ℹ Row 23645 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship =
  "many-to-many"` to silence this warning.
# A tibble: 11 × 2
   order                    n
   <chr>                <dbl>
 1 Nitrososphaerales  442029.
 2 PS1                 32018.
 3 Not annotated       25433.
 4 Pelagibacterales    21718.
 5 Pseudomonadales     17107.
 6 Rhodospirillales_A  11230.
 7 SAR86                3300.
 8 Burkholderiales      2091.
 9 SAR324               1977.
10 Flavobacteriales     1799.
11 HIMB59               1700.
Expand code
p1 <- kos %>% 
  filter(metabolism == "Central C metabolism") %>%
  filter(!pathway %in% c('Uronate pathway (udh)','ED pathway (edd)','PRPP biosynthesis (RPPK)')) %>%
  ggplot(aes(
    x = tpm, 
    y = paste(pathway, layer) %>% fct_rev(),
    fill = fct_relevel(order, names(cols.ko))
    )) +
  geom_col(show.legend = TRUE) + 
  scale_fill_manual(values = cols.ko, drop = FALSE) +
  scale_y_discrete(
    labels = . %>% gsub('.* chlMax', '', .) %>% gsub(' 10m', '', .)
    ) +
  scale_x_continuous(labels = scales::label_comma()) +
  labs(x = "TPM", y = "") + 
  theme_tidy(fontsize = 5)

p1
Figure 7

The genes listed below are typically catalyze the rate limiting step of the pathway. Like all other analysis, filtering is done based on the KOs and grouping of the KOs is avoided.

Expand code
i <- k %>%
  filter(pathway == 'N fixation') %>%
  mutate(
    layer = '10m',
    order = 'Other',
    tpm = 0
  ) %>%
  mutate(layer = if_else(ko == 'K02588', 'chlMax', layer)) %>%
  select(-ko, -ko_definition)
  

p2 <- kos %>% 
  filter(metabolism == "N metabolism") %>%
  rbind(i) %>%
  mutate(pathway = if_else(pathway == 'Denitrification', 'Nitrite reductase (nirK)', pathway)) %>%
  filter(pathway != 'Urea metabolism') %>%
  ggplot(aes(
    x = tpm, 
    y = paste(pathway, layer) %>% fct_rev(),
    fill = fct_relevel(order, names(cols.ko))
    )) +
  geom_col(show.legend = TRUE) + 
  scale_fill_manual(values = cols.ko, drop = FALSE) +
  scale_y_discrete(
    labels = . %>% gsub('.* chlMax', '', .) %>% gsub(' 10m', '', .)
    ) +
  scale_x_continuous(labels = scales::label_comma()) +
  labs(x = "TPM", y = "") + 
  theme_tidy(fontsize = 5)

p2
Figure 8
Expand code
p3 <- kos %>% 
  filter(metabolism == "P metabolism") %>%
  ggplot(aes(
    x = tpm, 
    y = paste(pathway, layer) %>% fct_rev(),
    fill = fct_relevel(order, names(cols.ko))
    )) +
  geom_col(show.legend = TRUE) + 
  scale_fill_manual(values = cols.ko, drop = FALSE) +
  scale_y_discrete(
    labels = . %>% gsub('.* chlMax', '', .) %>% gsub(' 10m', '', .)
    ) +
  scale_x_continuous(labels = scales::label_comma()) +
  labs(x = "TPM", y = "") +
  theme_tidy(fontsize = 5)

p3
Figure 9

Figure 10 shows the tpm for each representative KO summed over all the samples (n = 20). N transformation, central C metabolism, and photosynthesis plus aerobic respiration. Udh uronate dehydrogenase, edd phosphogluconate dehydratase, icd isocitrate dehydrogenase, ICL isocitrate lyase, G6PD glucose-6-phosphate 1-dehydrogenase, PRPP phosphoribosyl pyrophosphate, RPPK ribose-phosphate pyrophosphokinase, ED: Entner–Doudoroff. Note how the transcripts involved in carbon fixation are predominantly represented by the Streptomycetales (Streptomyces sp.) and the Burkholderiales (Nitrosomonas sp.) while only few transcripts were detected affiliated with cyanobacteria. The oxidation from ammonia to nitrite (nitrification) is performed by Nitrosopumilus sp. while the further oxidation from nitrite to nitrate (K00370) is done by a representative from the family Nitrospinaceae.

Expand code
(p1 + p2 + p3) + plot_annotation(tag_levels = list(c('c','d','e'))) + plot_layout(guides = 'collect') &
  theme(
    axis.ticks.y = element_blank(),
    legend.position = 'bottom',
    legend.key.height = unit(3.5, "mm"), legend.key.width = unit(2.5, "mm"),
    legend.text = element_text(margin = margin(l = 2)),
    legend.margin = margin(t = -2),
    plot.margin = margin(t = 10),
    plot.tag.location = "panel",
    plot.tag.position = c(0.93, 0.95),
    legend.title = element_blank(),
    aspect.ratio = 1.4,
    axis.text.y = element_text(margin = margin())
    ) & guides(fill = guide_legend(reverse = TRUE))
Figure 10: KO plot
Expand code
ggsave("figures/ko-modules.pdf", width = 180, height = 160, units = "mm")

Linking metat to morphology (SEM)

Expand code
cols.order <- c( # Sorted on abundance
  "Pseudomonadales" = "#C7B19C",
  "Pelagibacterales" = "#972D15",
  "SAR324" = "#D69C4E",
  "Marinisomatales" = "#046C9A",
  "Puniceispirillales_A" = "#9986A5",
  "PS1" = "#FAEFD1",
  "Rhodobacterales" = "#00A08A",
  "SAR86" = "#D5D5D3",
  "Burkholderiales" = "#78B7C5",
  "UBA8366" = "#A2A475",
  "Flavobacteriales" = "#D8B70A",
  "Other" = "#D5D5D3"
) %>% rev()
Expand code
kos <- read_tsv('data/morphology_genes.tsv.gz', col_types = 'ccdccc')

kos <- kos %>%
  inner_join(tax, by = "orf") %>% 
  inner_join(tpm, by = "orf") %>% 
  inner_join(meta, by = "sample") %>%
  replace_na(list("order" = "Other")) %>%
  mutate(order = if_else(order %in% names(cols.order), order, "Other")) %>%
  group_by(layer, label, order, category) %>% summarise(tpm = sum(tpm), .groups = "drop") %>%
  mutate(order = factor(order, names(cols.order))) %>%
  mutate(i = if_else(
     layer == "10m", true = label, false = paste(label, layer)
   ))
Expand code
kos %>%
  group_by(order) %>%
  summarise(n = sum(tpm)) %>% filter(!is.na(order)) %>%
  arrange(desc(n))
# A tibble: 12 × 2
   order                     n
   <fct>                 <dbl>
 1 Pseudomonadales      17963.
 2 Other                16800.
 3 Pelagibacterales     10111.
 4 SAR324                5653.
 5 Marinisomatales       5321.
 6 PS1                   2997.
 7 Puniceispirillales_A  2628.
 8 SAR86                 1367.
 9 Burkholderiales       1355.
10 Flavobacteriales      1328.
11 Rhodobacterales       1188.
12 UBA8366                802.
Expand code
lab <- kos %>%
  filter(category == "EPS biosynthesis") %>%
  arrange(i) %>%
  select(i) %>%
  distinct() %>%
  mutate(i = if_else(grepl("chlMax", i), "", i)) %>%
  pull(i) %>% rev()

p1 <- kos %>%
  filter(category == "EPS biosynthesis") %>%
  ggplot(aes(
    x = tpm, 
    y = fct_rev(i),
    fill = order
    )) +
  geom_col(width = 0.85, show.legend = TRUE) +
  scale_fill_manual(values = cols.order, drop = FALSE) +
  labs(x = "Relative abundance (tpm)", y = "") + 
  theme_tidy(legend = "right", fontsize = 5) +
  scale_x_continuous(labels = scales::label_comma()) +
  scale_y_discrete(labels = lab) +
  guides(fill = guide_legend(reverse = TRUE)) +
  theme(
    legend.title = element_blank(),
    axis.ticks.y = element_blank(),
    legend.text = element_text(margin = margin(l = 2)),
    legend.key.height = unit(3.5, "mm"), legend.key.width = unit(2.5, "mm"),

  )

p1

Expand code
lab <- kos %>%
  filter(category == "Vesicle formation") %>%
  arrange(i) %>%
  select(i) %>%
  distinct() %>%
  mutate(i = if_else(grepl("chlMax", i), "", i)) %>%
  pull(i) %>% rev()

p2 <- kos %>%
  filter(category == "Vesicle formation") %>%
  ggplot(aes(
    x = tpm, 
    y = fct_rev(i),
    fill = order
    )) +
  geom_col(width = 0.85, show.legend = TRUE) +
  scale_fill_manual(values = cols.order, drop = FALSE) +
  labs(x = "Relative abundance (tpm)", y = "") + 
  scale_y_discrete(labels = lab) +
  theme_tidy(legend = "right", fontsize = 5) +
  scale_x_continuous(labels = scales::label_comma()) +
  guides(fill = guide_legend(reverse = TRUE)) +
  theme(
    legend.title = element_blank(),
    axis.ticks.y = element_blank(),
    legend.text = element_text(margin = margin(l = 2)),
    legend.key.height = unit(3.5, "mm"), legend.key.width = unit(2.5, "mm"),

  )

p2

Expand code
lab <- kos %>%
  filter(category == "External appendages and membrane extensions") %>%
  arrange(i) %>%
  select(i) %>%
  distinct() %>%
  mutate(i = if_else(grepl("chlMax", i), "", i)) %>%
  pull(i) %>% rev()

p3 <- kos %>%
  filter(category == "External appendages and membrane extensions") %>%
  ggplot(aes(
    x = tpm, 
    y = fct_rev(i),
    fill = order
    )) +
  geom_col(width = 0.85, show.legend = TRUE) +
  scale_fill_manual(values = cols.order, drop = FALSE) +
  labs(x = "Relative abundance (tpm)", y = "") + 
  theme_tidy(legend = "right", fontsize = 5) +
  guides(fill = guide_legend(reverse = TRUE)) +
  scale_y_discrete(labels = lab) +
  scale_x_continuous(labels = scales::label_comma()) +
  theme(
    legend.title = element_blank(),
    axis.ticks.y = element_blank(),
    legend.text = element_text(margin = margin(l = 2)),
    legend.key.height = unit(3.5, "mm"), legend.key.width = unit(2.5, "mm"),

  )

p3

Osmoregulation

Expand code
d <- read_tsv('data/genelist_maintenance.tsv', col_types = 'c') 
Expand code
d <- d %>%
  left_join(kofam, by = "ko", relationship = "many-to-many") %>%
  inner_join(tax, by = "orf") %>%
  inner_join(tpm, by = "orf") %>%
  inner_join(meta, by = "sample") %>%
  mutate(layer = factor(layer, c("10m", "chlMax"))) %>%
  mutate(i = if_else(
     layer == "10m", true = subcategory, paste(subcategory, layer)
   )) %>%
  mutate(order = if_else(is.na(order), "Other", order)) %>%
  mutate(order = if_else(!order %in% names(cols.order), "Other", order)) %>%
  mutate(order = factor(order, levels = names(cols.order))) %>%
  group_by(layer, order, i, category) %>% summarise(tpm = sum(tpm), .groups = "drop")
Warning in inner_join(., tpm, by = "orf"): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 1 of `x` matches multiple rows in `y`.
ℹ Row 5186 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship =
  "many-to-many"` to silence this warning.
Expand code
lab <- d %>%
  filter(category == "Defense system") %>%
  arrange(i) %>%
  select(i) %>%
  distinct() %>%
  mutate(i = if_else(grepl("chlMax", i), "", i)) %>%
  pull(i) %>% rev()

px <- d %>%
  filter(category == "Defense system") %>%
  ggplot(aes(
    x = tpm, 
    y = fct_rev(i),
    fill = fct_relevel(order, names(cols.order))
    )) +
  geom_col(show.legend = TRUE, width = 0.8) + 
  scale_fill_manual(values = cols.order, drop = FALSE) +
  scale_y_discrete(labels = lab) +
  scale_x_continuous(labels = scales::label_comma()) +
  labs(x = "Relative abundance (tpm)", y = "", fill = "") + 
  theme_tidy(fontsize = 5, legend = "right") +
  guides(fill = guide_legend(reverse = T)) +
  theme(
     axis.ticks.y = element_blank(),
     legend.key.height = unit(3.5, "mm"),
     aspect.ratio = 1.4,
     panel.border = element_rect(fill = "transparent", linewidth = 0.75)
   )

px
Figure 11: KEGG orthologs involved in osmoregulation or the regulation of osmoregulation. Each bar shows the summed tpm (n = 10) for the depth layers. Showing the ten most relevant orders while grouping low-abundant lineages and transcripts without taxonomy as “Other”. The x-axis was transformed with a square-root to also visualize KOs with a low tpm.
Expand code
lab <- d %>%
  filter(category == "Osmoregulation") %>%
  arrange(i) %>%
  select(i) %>%
  distinct() %>%
  mutate(i = if_else(grepl("chlMax", i), "", i)) %>%
  pull(i) %>% rev()

py <- d %>%
  filter(category == "Osmoregulation") %>%
  ggplot(aes(
    x = tpm, 
    y = fct_rev(i),
    fill = fct_relevel(order, names(cols.order))
    )) +
  geom_col(show.legend = TRUE, width = 0.8) + 
  scale_fill_manual(values = cols.order, drop = FALSE) +
  scale_y_discrete(labels = lab) +
  scale_x_continuous(labels = scales::label_comma()) +
  labs(x = "Relative abundance (tpm)", y = "", fill = "") + 
  theme_tidy(fontsize = 5, legend = "right") +
  guides(fill = guide_legend(reverse = T)) +
  theme(
     axis.ticks.y = element_blank(),
     legend.key.height = unit(3.5, "mm"),
     aspect.ratio = 1.4,
     panel.border = element_rect(fill = "transparent", linewidth = 0.75)
   )

py
Figure 12: KEGG orthologs involved in osmoregulation or the regulation of osmoregulation. Each bar shows the summed tpm (n = 10) for the depth layers. Showing the ten most relevant orders while grouping low-abundant lineages and transcripts without taxonomy as “Other”. The x-axis was transformed with a square-root to also visualize KOs with a low tpm.
Expand code
lab <- d %>%
  filter(category == "DNA repair and recombination") %>%
  arrange(i) %>%
  select(i) %>%
  distinct() %>%
  mutate(i = if_else(grepl("chlMax", i), "", i)) %>%
  pull(i) %>% rev()

pz <- d %>%
  filter(category == "DNA repair and recombination") %>%
  ggplot(aes(
    x = tpm, 
    y = fct_rev(i),
    fill = fct_relevel(order, names(cols.order))
    )) +
  geom_col(show.legend = TRUE, width = 0.8) + 
  scale_fill_manual(values = cols.order, drop = FALSE) +
  scale_x_continuous(labels = scales::label_comma()) +
  scale_y_discrete(labels = lab) +
  labs(x = "Relative abundance (tpm)", y = "", fill = "") + 
  theme_tidy(fontsize = 5, legend = "right") +
  guides(fill = guide_legend(reverse = T)) +
  theme(
     axis.ticks.y = element_blank(),
     legend.key.height = unit(3.5, "mm"),
     aspect.ratio = 1.4,
     panel.border = element_rect(fill = "transparent", linewidth = 0.75)
   )

pz
Figure 13: KEGG orthologs involved in osmoregulation or the regulation of osmoregulation. Each bar shows the summed tpm (n = 10) for the depth layers. Showing the ten most relevant orders while grouping low-abundant lineages and transcripts without taxonomy as “Other”. The x-axis was transformed with a square-root to also visualize KOs with a low tpm.
Expand code
(p1 + p2 + p3) / (px + py + pz) + 
  plot_annotation(tag_levels = list(c('a: EPS biosynthesis', 'b: Vesicle formation', 'c: External appendages\nand membrane extensions','d: Defense system', 'e: Osmoregulation', 'f: DNA repair and recombination'))) + 
  plot_layout(guides = "collect") & guides(fill = guide_legend(reverse = TRUE)) &
  theme(
    legend.key.height = unit(3.5, "mm"), legend.key.width = unit(2.5, "mm"),
    plot.margin = margin(t = 10),
    plot.tag.location = "panel",
    plot.tag.position = c(0.5, 1.05), plot.tag = element_text(size = 5, face = "plain"),
    axis.ticks.y = element_blank(),
    legend.position = "right", legend.title = element_blank(),
    legend.margin = margin(t = -10), legend.text = element_text(margin = margin(l = 2))
    )
Figure 14: KO plot
Expand code
ggsave(filename = "figures/maintenance.pdf", width = 180, height = 140, units = "mm")

References

1.
Royo-Llonch M, Sánchez P, Ruiz-González C, et al (2021) Compendium of 530 metagenome-assembled bacterial and archaeal genomes from the polar arctic ocean. Nature Microbiology 6(12):1561–1574