source(here::here("R",'PR2_init.R'), echo=FALSE)
PR2 version 5.0.0
New taxonomy with 9 ranks
Aim
Check 9-levels taxonomy
Initialization
Set up files
# Date stamp (YYYY-MM-DD) embedded in the name of the exported taxonomy file.
# NOTE(review): pr2.env is not created in this section - presumably it comes
# from the sourced PR2_init.R; confirm.
pr2.env$date <- format(Sys.time(), "%Y-%m-%d")

# Directory (relative to the project root) holding the files of this update.
dir_pr2_update <- here::here("5.0", "A - new taxonomy - 2023")

# Build the path of a file located inside the update directory.
#   file_name: file or sub-directory name, relative to dir_pr2_update.
#   Returns dir_pr2_update and file_name joined with "/".
full_path <- function(file_name) {
  str_c(dir_pr2_update, "/", file_name)
}

# Dated Excel workbook that receives the taxonomy tables.
file_pr2_taxonomy <- full_path(str_c("pr2_taxonomy_", pr2.env$date, ".xlsx"))

# create the directory for taxonomy output
# (showWarnings = FALSE keeps this silent when the directory already exists)
dir.create(full_path("taxo"), showWarnings = FALSE)
Initial move from 8 to 9 levels
Read taxonomy and main from database
# Open a connection to the PR2 database.
# db_info(), db_connect() and db_disconnect() are helpers defined outside
# this section.
pr2_db <- db_info("pr2_google")
pr2_db_con <- db_connect(pr2_db)

# Taxonomy entries that have not been removed (no removal version recorded).
pr2_taxo <- tbl(pr2_db_con, "pr2_taxonomy") %>%
  filter(is.na(taxo_removed_version)) %>%
  collect()

# Sequence records that have not been removed and carry an 8-level species.
pr2_main <- tbl(pr2_db_con, "pr2_main") %>%
  filter(is.na(removed_version),
         !is.na(species_8)) %>%
  collect()

# Attach the taxonomy to every sequence record, drop the 8-level ranks above
# species, put species_9 just before species_8, and sort by the columns
# listed in pr2.env$taxo_levels_9.
pr2 <- left_join(pr2_main, pr2_taxo, by = join_by(species_8, species_9)) %>%
  select(-(kingdom_8:genus_8)) %>%
  relocate(contains("_9"), species_9, contains("_8"), .before = NULL) %>%
  relocate(species_9, .before = species_8) %>%
  arrange_at(pr2.env$taxo_levels_9)

# Both tables are collected locally, so the connection can be closed.
db_disconnect(pr2_db_con)
Summarize taxonomy at different levels
Class
# Number of taxonomy entries for each combination of 9-level ranks (down to
# class) and the corresponding 8-level ranks, excluding Bacteria and Archaea.
pr2_taxo_class <- pr2_taxo %>%
  filter(!(domain_9 %in% c("Bacteria", "Archaea"))) %>%
  count(supergroup_9, division_9, subdivision_9, class_9,
        supergroup_8, division_8, class_8)

# Show the summary as an interactive table.
DT::datatable(pr2_taxo_class)
Find inconsistencies between pr2_main and pr2_taxonomy
Note: must be done in this order
Find entries with no taxonomy
# Species pairs present in the sequence table for which the join with the
# taxonomy table brought back no 9-level taxonomy (domain_9 is NA).
missing_species <- pr2 %>%
  filter(is.na(domain_9)) %>%
  select(species_8, species_9) %>%
  distinct()

missing_species
List duplicate species_9 entries
# species_9 names that appear in more than one taxonomy row, together with
# the full taxonomy rows concerned (species_8 placed right after species_9).
duplicate_species_9 <- pr2_taxo %>%
  count(species_9) %>%
  filter(n > 1) %>%
  # Explicit key: species_9 is the only column the counts share with pr2_taxo
  left_join(pr2_taxo, by = join_by(species_9)) %>%
  relocate(species_8, .after = "species_9")

duplicate_species_9
Check taxonomy entries which have no associated sequences
# Taxonomy rows whose species_9 is not used by any sequence in pr2;
# species_8 and species_9 are moved to the front for readability.
orphan_species <- pr2_taxo %>%
  filter(!(species_9 %in% unique(pr2$species_9))) %>%
  relocate(species_8, species_9)

orphan_species
Check that the 9-level taxonomy follows PR2 rules
# Distinct 9-level taxonomy paths, with the "_9" suffix stripped from the
# column names.
pr2_taxo_9 <- pr2_taxo %>%
  select(domain_9:species_9) %>%
  rename_with(~str_replace(., "_9", ""), everything()) %>%
  distinct()

# Run the taxonomy checks; the "taxo" directory is passed as dir_taxo.
# NOTE(review): presumably the check reports are written there - confirm
# against dvutils::pr2_taxo_check().
dvutils::pr2_taxo_check(pr2_taxo_9,
                        taxo_levels = pr2.env$taxo_levels[[9]],
                        dir_taxo = full_path("taxo"))
Save into file
- Do this only once after each update
# Tables to export, one list element per table; the element names are the
# labels passed to the writer.
# NOTE(review): "pr2_species_without_pr2_sequences" is 33 characters long,
# above Excel's 31-character sheet-name limit - confirm how the writer
# handles it.
pr2_list <- list(
  "pr2_taxo" = pr2_taxo,
  "pr2_taxo_class" = pr2_taxo_class,
  "pr2_merged_species_8" = duplicate_species_9,
  "pr2_species_without_pr2_sequences" = orphan_species
)

# Write everything into the dated taxonomy workbook.
# firstActiveRow / firstActiveCol are forwarded to the underlying xlsx writer.
rio::export(pr2_list,
            file_pr2_taxonomy,
            firstActiveRow = 2,
            firstActiveCol = 2)
Merge some info from Javier's file - 2023-03-23
source(here::here("R",'PR2_read_google_quick.R'), echo=FALSE)
Update bacteria with supergroup
# Read the annotated taxonomy workbook from the update directory.
pr2_taxo_javier <- rio::import(
  full_path("pr2_taxonomy_2023-02-19.MJ_jdc.xlsx"),
  guess_max = 10000
)

# Bacteria / Archaea rows flagged with Marked == 1, reduced to the ranks from
# domain to division and counted per (domain, supergroup, division).
pr2_taxo_bact <- pr2_taxo_javier %>%
  filter(domain %in% c("Bacteria", "Archaea"),
         Marked == 1) %>%
  select(domain:division) %>%
  count(domain, supergroup, division)

rio::export(pr2_taxo_bact, full_path("pr2_taxonomy_jdc_bacteria.xlsx"))