PR2 version 5.0.0
Update diatom taxonomy

Author

Daniel Vaulot

Published

February 19, 2023

Aim

  • Update diatom taxonomy using the correct classes.
  • The latest taxonomy is read from Worms web site using the worms package
  • This is then refined manually from AlgaeBase web site

Initialization

source('../R/PR2_init.R', echo=FALSE)

Set up files

  pr2.env$date = format(Sys.time(), "%Y-%m-%d")
  dir_pr2_update <- "B - diatom_classes"
  
  full_path <- function(file_name){
    str_c(dir_pr2_update,"/", file_name)
    }

  file_pr2_taxonomy <- full_path(str_c("pr2_diatoms_", pr2.env$date, ".xlsx"))  
  file_worms_taxonomy <- full_path("worms_diatoms_2023-02-11.xlsx") 
  file_merged_taxonomy <- full_path(str_c("merged_diatoms_", pr2.env$date, ".xlsx"))  

Read diatom taxonomy

Read from PR2 database

pr2_db <- db_info("pr2_google")
pr2_db_con <- db_connect(pr2_db)

pr2_taxo <- tbl(pr2_db_con, "pr2_taxonomy") %>%
  filter (is.na(taxo_removed_version),
          class_9 == "Diatomeae") %>% 
  collect() 


db_disconnect(pr2_db_con)

Save into file

rio::export(pr2_taxo, 
            file_pr2_taxonomy, 
            firstActiveRow = 2, 
            firstActiveCol = 2)

Summarize taxonomy at genus level

pr2_grouped <- pr2_taxo %>% 
  filter(!str_detect(genus_9, "_")) %>% 
  count(genus_8, genus_9) 

# Check that genus_8 and genus_9 are similar

pr2_grouped %>% filter(genus_8 != genus_9)

# Display list of genera

DT::datatable(pr2_grouped)

Extract information from Worms

DO ONLY ONCE

  • Use the WORMS package
  • Search based on genus
  • Need to slice by groups of 50 because WORMS will not take more request at once
# Slice file by 50
slice_size = 50

df <- pr2_grouped

df_number = nrow(df)

df <- df %>%
  mutate(slice =ggplot2::cut_width(1:df_number,labels=FALSE, width=slice_size, boundary = 1) )

slice_number = max(pull(df, slice))

df_list <- list()

for (i in 1:slice_number) {

  vector <- df %>%
    filter(slice == i) %>%
    pull(genus_9)
  
  df_list[[i]] = worms::wormsbymatchnames(vector, marine_only = FALSE)
}

worms_taxo <- df_list %>%
  reduce(bind_rows) %>% 
  filter(class == "Bacillariophyceae")

rio::export(worms_taxo, 
            file_worms_taxonomy, 
            firstActiveRow = 2, 
            firstActiveCol = 2)

Merge WORMS and pr2

Read WORMS file and merge with taxo file

genus_worms <- rio::import(file_worms_taxonomy) %>% 
  select(order, family, genus) %>% 
  rename_with(~str_c(., "_worms"))

Just select class_9 to genus_8 from PR2

genus_pr2 <- pr2_taxo %>% 
  select(class_9, order_9, family_9, genus_9) %>% 
  mutate(class_9 = order_9,
         order_9 = "") %>% 
  distinct()

Left join between pr2 and worms

genus_merged <- genus_pr2 %>% 
  left_join(genus_worms, by = join_by(genus_9 == genus_worms)) %>% 
  arrange(across(any_of(pr2.env$taxo_levels_9))) %>% 
  rename_with(~str_c(., "_old"), contains("_9")) %>% 
  mutate( class_9 = class_9_old,
          order_9 = order_worms,
          family_9 = family_worms,
          genus_9 = genus_9_old,
         ) %>% 
  relocate(any_of(pr2.env$taxo_levels_9))

Save the resulting file

rio::export(genus_merged, 
            file_merged_taxonomy, 
            firstActiveRow = 2, 
            firstActiveCol = 2)

Edit Merged file (Manually)

This is done using the AlgaeBase web site: https://www.algaebase.org:

  • For each line the genus is searched and the correct class, order and family checked and eventually corrected
  • In some case the genus_9 needs to be changed (e.g. genus_9 Bacillariophyceae_XXX to replace Bacillariophyceae_XX). In that case:
    • genus_9 and species_9 changed in pr2_taxonomy
    • species_9 changed in pr2_main

Make file to update pr2_taxonomy

  • Join the genus merged file and the pr2_taxonomy file
  • Replace class_9, order_9, family_9 with the Worms and Algaebase corrected names
  • The resulting file will be re-imported in to the pr2_taxonomy table
genus_merged_corrected <- rio::import(full_path("merged_diatoms_2023-02-16_fixed.xlsx")) %>% 
  select(any_of(pr2.env$taxo_levels_9)) 

pr2_taxo_updated <- pr2_taxo %>% 
  rename_with(~str_replace(., "_9", "_9_old"),
              c("class_9", "order_9", "family_9")) %>% 
  left_join(genus_merged_corrected, 
            by = join_by(genus_9)) %>% 
  relocate(any_of(c("class_9", "order_9", "family_9")), .before = genus_9) %>% 
  relocate(any_of(contains("old")), .after = last_col()) %>%  
  arrange(across(any_of(pr2.env$taxo_levels_9))) 

rio::export(pr2_taxo_updated, 
            full_path("pr2_taxo_diatoms_final.xlsx"), 
            firstActiveRow = 2, 
            firstActiveCol = 10)