PR2 version 5.0.0
New taxonomy with 9 ranks

Author

Daniel Vaulot

Published

March 23, 2023

Aim

Check the 9-level taxonomy

Initialization

source(here::here("R",'PR2_init.R'), echo=FALSE)

Set up files

  # Date stamp (YYYY-MM-DD) stored in the shared pr2.env object; it is used
  # further down to build the name of the exported taxonomy workbook.
  pr2.env$date <- format(x = Sys.time(), format = "%Y-%m-%d")

  # Directory holding the files of this update, built with here::here().
  dir_pr2_update <- here::here("5.0", "A - new taxonomy - 2023")
  
  # Build the path of a file located in the update directory.
  #
  # file_name: name (or relative path) of a file inside `dir_pr2_update`;
  #            a character vector of several names is accepted.
  # Returns a character vector with `dir_pr2_update` and `file_name` joined
  # by "/". `dir_pr2_update` is not an argument: it is looked up in the
  # enclosing environment each time the function is called.
  full_path <- function(file_name) {
    # file.path() is the base R tool for joining path components
    file.path(dir_pr2_update, file_name)
  }

  # Dated Excel workbook, inside the update directory, that receives the
  # taxonomy tables.
  file_pr2_taxonomy <- full_path(
    str_c("pr2_taxonomy_", pr2.env$date, ".xlsx")
  )

  # Create the "taxo" output sub-directory; the warning raised when it
  # already exists is silenced.
  dir.create(path = full_path("taxo"), showWarnings = FALSE)

Initial move from 8 to 9 levels

Read taxonomy and main from database

# Open a connection to the PR2 database ("pr2_google" configuration)
pr2_db <- db_info("pr2_google")
pr2_db_con <- db_connect(pr2_db)

# Taxonomy entries that have not been removed
pr2_taxo <- tbl(pr2_db_con, "pr2_taxonomy") %>%
  filter(is.na(taxo_removed_version)) %>%
  collect()

# Sequences that have not been removed and that have an 8-level species name
pr2_main <- tbl(pr2_db_con, "pr2_main") %>%
  filter(
    is.na(removed_version),
    !is.na(species_8)
  ) %>%
  collect()

# Both tables have been collected into memory, so the connection is closed
# right away: a failure in the local join below can no longer leave it open.
db_disconnect(pr2_db_con)

# Attach the taxonomy to each sequence (matched on both species names), drop
# the 8-level ranks from kingdom_8 to genus_8, move the "_9" and "_8" columns
# to the front with species_9 just before species_8, and sort the rows.
# NOTE(review): assumes pr2.env$taxo_levels_9 is a character vector naming the
# columns to sort by (defined outside this file) - confirm.
pr2 <- pr2_main %>%
  left_join(pr2_taxo, by = join_by(species_8, species_9)) %>%
  select(-(kingdom_8:genus_8)) %>%
  relocate(contains("_9"), species_9, contains("_8"), .before = NULL) %>%
  relocate(species_9, .before = species_8) %>%
  # arrange_at() is superseded; pick() + all_of() is the current equivalent
  arrange(pick(all_of(pr2.env$taxo_levels_9)))

Summarize taxonomy at different levels

Class

# Number of taxonomy entries for every combination of the 9-level ranks down
# to class and the matching 8-level ranks, leaving out the entries whose
# domain_9 is Bacteria or Archaea.
pr2_taxo_class <- pr2_taxo %>%
  filter(!domain_9 %in% c("Bacteria", "Archaea")) %>%
  count(
    supergroup_9, division_9, subdivision_9, class_9,
    supergroup_8, division_8, class_8
  )

# Display the summary as an interactive table
DT::datatable(data = pr2_taxo_class)

Find inconsistencies between pr2_main and pr2_taxonomy

Note: the checks below must be run in this order.

Find entries with no taxonomy

# Distinct (species_8, species_9) pairs among the rows of pr2 whose domain_9
# is NA, i.e. presumably sequences for which the join found no taxonomy entry.
missing_species <- distinct(
  select(
    filter(pr2, is.na(domain_9)),
    species_8, species_9
  )
)

# Print the pairs
missing_species

List duplicate species_9 entries

# Taxonomy rows whose species_9 name occurs more than once in pr2_taxo.
# The names are counted first, those seen more than once are kept, and the
# full taxonomy rows are joined back; `n` holds the number of occurrences.
duplicate_species_9 <- pr2_taxo %>%
  count(species_9) %>%
  filter(n > 1) %>%
  # The key is stated explicitly: a natural join prints a message and would
  # silently use any other column the two tables happen to share.
  left_join(pr2_taxo, by = join_by(species_9)) %>%
  relocate(species_8, .after = species_9)

# Print the duplicated entries
duplicate_species_9

Check taxonomy entries which have no associated sequences

# Taxonomy entries whose species_9 does not appear among the species_9 values
# of pr2, with the two species columns moved to the front.
orphan_species <- relocate(
  filter(pr2_taxo, !species_9 %in% unique(pr2$species_9)),
  species_8, species_9
)

# Print the entries
orphan_species

Check that the 9-level taxonomy follows PR2 rules

# Keep the columns from domain_9 to species_9, remove the first "_9" found in
# each column name and drop duplicated rows.
pr2_taxo_9 <- pr2_taxo %>%
  select(domain_9:species_9) %>%
  rename_with(
    .fn = function(col_name) str_replace(col_name, "_9", ""),
    .cols = everything()
  ) %>%
  distinct()

# Run the dvutils taxonomy check on the renamed table.
# NOTE(review): judging by the argument names, the rank names are taken from
# pr2.env$taxo_levels[[9]] and output goes to the "taxo" directory - confirm
# against the dvutils documentation.
dvutils::pr2_taxo_check(
  pr2_taxo_9,
  taxo_levels = pr2.env$taxo_levels[[9]],
  dir_taxo = full_path("taxo")
)

Save into file

  • Do this only once after each update
# Tables to save, gathered in a named list; the element names are the labels
# passed to rio::export() along with each table.
# NOTE(review): "pr2_species_without_pr2_sequences" is 33 characters long;
# Excel limits sheet names to 31, so the writer may truncate or reject it -
# confirm.
pr2_list <- list(
  pr2_taxo = pr2_taxo,
  pr2_taxo_class = pr2_taxo_class,
  pr2_merged_species_8 = duplicate_species_9,
  pr2_species_without_pr2_sequences = orphan_species
)

# Write the list to the dated workbook.
# NOTE(review): firstActiveRow / firstActiveCol are presumably forwarded to
# the xlsx writer to freeze the first row and column - confirm.
rio::export(
  x = pr2_list,
  file = file_pr2_taxonomy,
  firstActiveRow = 2,
  firstActiveCol = 2
)

Merge some info from Javier's file - 2023-03-23

source(here::here("R",'PR2_read_google_quick.R'), echo=FALSE)

Update bacteria with supergroup

# Read the annotated taxonomy workbook from the update directory
pr2_taxo_javier <- rio::import(
  file = full_path("pr2_taxonomy_2023-02-19.MJ_jdc.xlsx"),
  guess_max = 10000
)

# Rows of domain Bacteria or Archaea with Marked equal to 1, reduced to the
# columns from domain to division and counted per
# (domain, supergroup, division) combination.
pr2_taxo_bact <- pr2_taxo_javier %>%
  filter(domain %in% c("Bacteria", "Archaea")) %>%
  filter(Marked == 1) %>%
  select(domain:division) %>%
  count(domain, supergroup, division)

# Save the counts next to the other files of the update
rio::export(
  x = pr2_taxo_bact,
  file = full_path("pr2_taxonomy_jdc_bacteria.xlsx")
)