source('../R/PR2_init.R', echo=FALSE)
PR2 version 5.0.0
Update diatom taxonomy
Aim
- Update diatom taxonomy using the correct classes.
- The latest taxonomy is read from Worms web site using the
worms
package - This is then refined manually from AlgaeBase web site
Initialization
Set up files
$date = format(Sys.time(), "%Y-%m-%d")
pr2.env<- "B - diatom_classes"
dir_pr2_update
<- function(file_name){
full_path str_c(dir_pr2_update,"/", file_name)
}
<- full_path(str_c("pr2_diatoms_", pr2.env$date, ".xlsx"))
file_pr2_taxonomy <- full_path("worms_diatoms_2023-02-11.xlsx")
file_worms_taxonomy <- full_path(str_c("merged_diatoms_", pr2.env$date, ".xlsx")) file_merged_taxonomy
Read diatom taxonomy
Read from PR2 database
<- db_info("pr2_google")
pr2_db <- db_connect(pr2_db)
pr2_db_con
<- tbl(pr2_db_con, "pr2_taxonomy") %>%
pr2_taxo filter (is.na(taxo_removed_version),
== "Diatomeae") %>%
class_9 collect()
db_disconnect(pr2_db_con)
Save into file
::export(pr2_taxo,
rio
file_pr2_taxonomy, firstActiveRow = 2,
firstActiveCol = 2)
Summarize taxonomy at genus level
<- pr2_taxo %>%
pr2_grouped filter(!str_detect(genus_9, "_")) %>%
count(genus_8, genus_9)
# Check that genus_8 and genus_9 are similar
%>% filter(genus_8 != genus_9)
pr2_grouped
# Display list of genera
::datatable(pr2_grouped) DT
Extract information from Worms
DO ONLY ONCE
- Use the WORMS package
- Search based on genus
- Need to slice by groups of 50 because WORMS will not take more request at once
# Slice file by 50
= 50
slice_size
<- pr2_grouped
df
= nrow(df)
df_number
<- df %>%
df mutate(slice =ggplot2::cut_width(1:df_number,labels=FALSE, width=slice_size, boundary = 1) )
= max(pull(df, slice))
slice_number
<- list()
df_list
for (i in 1:slice_number) {
<- df %>%
vector filter(slice == i) %>%
pull(genus_9)
= worms::wormsbymatchnames(vector, marine_only = FALSE)
df_list[[i]]
}
<- df_list %>%
worms_taxo reduce(bind_rows) %>%
filter(class == "Bacillariophyceae")
::export(worms_taxo,
rio
file_worms_taxonomy, firstActiveRow = 2,
firstActiveCol = 2)
Merge WORMS and pr2
Read WORMS file and merge with taxo file
<- rio::import(file_worms_taxonomy) %>%
genus_worms select(order, family, genus) %>%
rename_with(~str_c(., "_worms"))
Just select class_9 to genus_8 from PR2
<- pr2_taxo %>%
genus_pr2 select(class_9, order_9, family_9, genus_9) %>%
mutate(class_9 = order_9,
order_9 = "") %>%
distinct()
Left join between pr2 and worms
<- genus_pr2 %>%
genus_merged left_join(genus_worms, by = join_by(genus_9 == genus_worms)) %>%
arrange(across(any_of(pr2.env$taxo_levels_9))) %>%
rename_with(~str_c(., "_old"), contains("_9")) %>%
mutate( class_9 = class_9_old,
order_9 = order_worms,
family_9 = family_worms,
genus_9 = genus_9_old,
%>%
) relocate(any_of(pr2.env$taxo_levels_9))
Save the resulting file
::export(genus_merged,
rio
file_merged_taxonomy, firstActiveRow = 2,
firstActiveCol = 2)
Edit Merged file (Manually)
This is done using the AlgaeBase web site: https://www.algaebase.org:
- For each line the genus is searched and the correct class, order and family checked and eventually corrected
- In some case the genus_9 needs to be changed (e.g. genus_9 Bacillariophyceae_XXX to replace Bacillariophyceae_XX). In that case:
- genus_9 and species_9 changed in pr2_taxonomy
- species_9 changed in pr2_main
Make file to update pr2_taxonomy
- Join the genus merged file and the pr2_taxonomy file
- Replace class_9, order_9, family_9 with the Worms and Algaebase corrected names
- The resulting file will be re-imported in to the pr2_taxonomy table
<- rio::import(full_path("merged_diatoms_2023-02-16_fixed.xlsx")) %>%
genus_merged_corrected select(any_of(pr2.env$taxo_levels_9))
<- pr2_taxo %>%
pr2_taxo_updated rename_with(~str_replace(., "_9", "_9_old"),
c("class_9", "order_9", "family_9")) %>%
left_join(genus_merged_corrected,
by = join_by(genus_9)) %>%
relocate(any_of(c("class_9", "order_9", "family_9")), .before = genus_9) %>%
relocate(any_of(contains("old")), .after = last_col()) %>%
arrange(across(any_of(pr2.env$taxo_levels_9)))
::export(pr2_taxo_updated,
riofull_path("pr2_taxo_diatoms_final.xlsx"),
firstActiveRow = 2,
firstActiveCol = 10)