PR2 version 4.13.0
New taxonomy with 9 ranks
1 Init
Load the variables common to the different scripts and the necessary libraries
2 Reference
- Adl SM et al. 2019. Revisions to the classification, nomenclature, and diversity of eukaryotes. J. Eukaryot. Microbiol. 66(1):4-119.
- Burki F, Roger AJ, Brown MW, Simpson AGB. 2020. The New Tree of Eukaryotes. Trends Ecol. Evol. 35(1):43-55.
3 Set up the files
# Name recorded as editor for this update
pr2.env$editor <- "M. Jamy"

# Directory that holds every input and output file of this taxonomy update
dir_pr2_update <- "../updates/2020 4.13.0 new taxonomy"

# Prefix a file name with the update directory.
#   file_name: name (or relative path) of a file inside dir_pr2_update
#   returns:   dir_pr2_update and file_name joined with "/"
full_path <- function(file_name) {
  str_c(dir_pr2_update, "/", file_name)
}

# Excel workbook containing the taxonomy changes
file_pr2_update_excel <- full_path("pr2_new_taxonomy.xlsx")

# Output directory for the taxonomy files; no warning if it already exists
dir.create(path = full_path("taxo"), showWarnings = FALSE)
4 Read taxonomy update
5 Change at each level
5.1 Change the fields to _old
5.2 Species
# Species level: update rows that name an old species.
one_level <- "species"

# Rows of the update sheet for which species_old is filled in
pr2_update_list[[one_level]] <- filter(pr2_update, !is.na(species_old))
cat("N rows Excel - level ",one_level, "-", nrow(pr2_update_list[[one_level]]), "\n")

# Entries of the current taxonomy matched by those rows.
# No `by` is given, so the join uses every column the two tables share.
pr2_taxo_list[[one_level]] <- inner_join(
  pr2_taxo_new,
  pr2_update_list[[one_level]]
)
cat("N rows Taxo - level ",one_level, "-", nrow(pr2_taxo_list[[one_level]]), "\n")

# Update rows whose species_old does not occur in pr2_taxo_new at all
df <- pr2_update_list[[one_level]] %>%
  select(supergroup_old:species_old) %>%
  filter(!(species_old %in% pr2_taxo_new$species_old))
kable(df, caption = "Species not yet in PR2 \n")

# Update rows whose species_old is absent from the joined table
df <- filter(
  pr2_update_list[[one_level]],
  !(species_old %in% pr2_taxo_list[[one_level]]$species_old)
)
kable(df, caption = str_c(one_level, " for which the old taxo is wrong \n"))

# taxo_id values that appear more than once after the join
pr2_taxo_updated_duplicated <- pr2_taxo_list[[one_level]] %>%
  add_count(taxo_id) %>%
  filter(n > 1)
df <- pr2_taxo_updated_duplicated
kable(df, caption = str_c(one_level, " for which the taxo duplicated \n"))

# Entries handled at this level are removed so that the coarser levels
# processed afterwards do not pick them up again
pr2_taxo_new <- filter(
  pr2_taxo_new,
  !(taxo_id %in% pr2_taxo_list[[one_level]]$taxo_id)
)
5.3 Genus
# Genus level: update rows given for a genus (genus_old filled, species_old empty).
one_level <- "genus"

# species_old is dropped from the update rows so that it is not used as a join key
pr2_update_list[[one_level]] <- pr2_update %>%
  filter(is.na(species_old) & !is.na(genus_old)) %>%
  select(-species_old)
cat("N rows Excel - level ",one_level, "-", nrow(pr2_update_list[[one_level]]), "\n")

# Join on every shared column (no `by` given). When the update row gives no
# new species name, the old species name is carried over.
pr2_taxo_list[[one_level]] <- pr2_taxo_new %>%
  inner_join(pr2_update_list[[one_level]]) %>%
  mutate(species = case_when(is.na(species) ~ species_old,
                             TRUE ~ species))
cat("N rows Taxo - level ",one_level, "-", nrow(pr2_taxo_list[[one_level]]), "\n")

# Update rows whose genus_old does not occur in the remaining taxonomy
cat(one_level," not in PR2 \n")
pr2_update_list[[one_level]] %>%
  select(supergroup_old:genus_old) %>%
  filter(!genus_old %in% pr2_taxo_new$genus_old)

# Update rows whose genus_old is absent from the joined table
cat(one_level, " for which the old taxo is wrong \n")
pr2_update_list[[one_level]] %>%
  filter(!genus_old %in% pr2_taxo_list[[one_level]]$genus_old)

# taxo_id values matched more than once. The table is now printed: previously
# the header above it was written but the result was only stored.
cat(one_level, " for which the taxo duplicated \n")
pr2_taxo_updated_duplicated <- pr2_taxo_list[[one_level]] %>%
  add_count(taxo_id) %>%
  filter(n > 1)
pr2_taxo_updated_duplicated

# Remove the entries updated at this level from the remaining taxonomy
pr2_taxo_new <- pr2_taxo_new %>%
  filter(!taxo_id %in% pr2_taxo_list[[one_level]]$taxo_id)
5.4 Family
# Family level: update rows given for a family
# (family_old filled, genus_old and species_old empty).
one_level <- "family"

# The finer-level *_old columns are dropped so they are not used as join keys
pr2_update_list[[one_level]] <- pr2_update %>%
  filter(is.na(species_old) & is.na(genus_old) & !is.na(family_old)) %>%
  select(-species_old, -genus_old)
cat("N rows Excel - level ",one_level, "-", nrow(pr2_update_list[[one_level]]), "\n")

# Join on every shared column (no `by` given). Where the update row gives no
# new species/genus name, the old name is carried over.
# The join is computed once: the former scratch copy `junk` repeated exactly
# the same join and was not used afterwards.
pr2_taxo_list[[one_level]] <- pr2_taxo_new %>%
  inner_join(pr2_update_list[[one_level]]) %>%
  mutate(species = case_when(is.na(species) ~ species_old,
                             TRUE ~ species),
         genus = case_when(is.na(genus) ~ genus_old,
                           TRUE ~ genus))
cat("N rows Taxo - level ",one_level, "-", nrow(pr2_taxo_list[[one_level]]), "\n")

# Update rows whose family_old does not occur in the remaining taxonomy
cat(one_level," not in PR2 \n")
pr2_update_list[[one_level]] %>%
  select(supergroup_old:family_old) %>%
  filter(!family_old %in% pr2_taxo_new$family_old)

# Update rows whose family_old is absent from the joined table
cat(one_level, " for which the old taxo is wrong \n")
pr2_update_list[[one_level]] %>%
  filter(!family_old %in% pr2_taxo_list[[one_level]]$family_old)

# taxo_id values matched more than once. The table is now printed: previously
# the header above it was written but the result was only stored.
cat(one_level, " for which the taxo duplicated \n")
pr2_taxo_updated_duplicated <- pr2_taxo_list[[one_level]] %>%
  add_count(taxo_id) %>%
  filter(n > 1)
pr2_taxo_updated_duplicated

# Remove the entries updated at this level from the remaining taxonomy
pr2_taxo_new <- pr2_taxo_new %>%
  filter(!taxo_id %in% pr2_taxo_list[[one_level]]$taxo_id)
5.5 Order
# Order level: update rows given for an order
# (order_old filled, family_old, genus_old and species_old empty).
one_level <- "order"

# The finer-level *_old columns are dropped so they are not used as join keys
pr2_update_list[[one_level]] <- pr2_update %>%
  filter(is.na(species_old) & is.na(genus_old) & is.na(family_old) &
           !is.na(order_old)) %>%
  select(-species_old, -genus_old, -family_old)
cat("N rows Excel - level ",one_level, "-", nrow(pr2_update_list[[one_level]]), "\n")

# Join on every shared column (no `by` given). Where the update row gives no
# new species/genus/family name, the old name is carried over.
pr2_taxo_list[[one_level]] <- pr2_taxo_new %>%
  inner_join(pr2_update_list[[one_level]]) %>%
  mutate(species = case_when(is.na(species) ~ species_old,
                             TRUE ~ species),
         genus = case_when(is.na(genus) ~ genus_old,
                           TRUE ~ genus),
         family = case_when(is.na(family) ~ family_old,
                            TRUE ~ family))
cat("N rows Taxo - level ",one_level, "-", nrow(pr2_taxo_list[[one_level]]), "\n")

# Update rows whose order_old does not occur in the remaining taxonomy
cat(one_level," not in PR2 \n")
pr2_update_list[[one_level]] %>%
  select(supergroup_old:order_old) %>%
  filter(!order_old %in% pr2_taxo_new$order_old)

# Update rows whose order_old is absent from the joined table
cat(one_level, " for which the old taxo is wrong \n")
pr2_update_list[[one_level]] %>%
  filter(!order_old %in% pr2_taxo_list[[one_level]]$order_old)

# taxo_id values matched more than once. The table is now printed: previously
# the header above it was written but the result was only stored.
cat(one_level, " for which the taxo duplicated \n")
pr2_taxo_updated_duplicated <- pr2_taxo_list[[one_level]] %>%
  add_count(taxo_id) %>%
  filter(n > 1)
pr2_taxo_updated_duplicated

# Remove the entries updated at this level from the remaining taxonomy
pr2_taxo_new <- pr2_taxo_new %>%
  filter(!taxo_id %in% pr2_taxo_list[[one_level]]$taxo_id)
5.6 Class
# Class level: update rows given for a class
# (class_old filled, order_old, family_old, genus_old and species_old empty).
one_level <- "class"

# The finer-level *_old columns are dropped so they are not used as join keys
pr2_update_list[[one_level]] <- pr2_update %>%
  filter(is.na(species_old) & is.na(genus_old) & is.na(family_old) &
           is.na(order_old) & !is.na(class_old)) %>%
  select(-species_old, -genus_old, -family_old, -order_old)
cat("N rows Excel - level ",one_level, "-", nrow(pr2_update_list[[one_level]]), "\n")

# Join on every shared column (no `by` given). Where the update row gives no
# new species/genus/family/order name, the old name is carried over.
pr2_taxo_list[[one_level]] <- pr2_taxo_new %>%
  inner_join(pr2_update_list[[one_level]]) %>%
  mutate(species = case_when(is.na(species) ~ species_old,
                             TRUE ~ species),
         genus = case_when(is.na(genus) ~ genus_old,
                           TRUE ~ genus),
         family = case_when(is.na(family) ~ family_old,
                            TRUE ~ family),
         order = case_when(is.na(order) ~ order_old,
                           TRUE ~ order))
cat("N rows Taxo - level ",one_level, "-", nrow(pr2_taxo_list[[one_level]]), "\n")

# Update rows whose class_old does not occur in the remaining taxonomy
cat(one_level," not in PR2 \n")
pr2_update_list[[one_level]] %>%
  select(supergroup_old:class_old) %>%
  filter(!class_old %in% pr2_taxo_new$class_old)

# Update rows whose class_old is absent from the joined table
cat(one_level, " for which the old taxo is wrong \n")
pr2_update_list[[one_level]] %>%
  filter(!class_old %in% pr2_taxo_list[[one_level]]$class_old)

# taxo_id values matched more than once. The table is now printed: previously
# the header above it was written but the result was only stored.
cat(one_level, " for which the taxo duplicated \n")
pr2_taxo_updated_duplicated <- pr2_taxo_list[[one_level]] %>%
  add_count(taxo_id) %>%
  filter(n > 1)
pr2_taxo_updated_duplicated

# Remove the entries updated at this level from the remaining taxonomy
pr2_taxo_new <- pr2_taxo_new %>%
  filter(!taxo_id %in% pr2_taxo_list[[one_level]]$taxo_id)
5.7 Assembling it all
# Stack the per-level tables and drop rows that are exact duplicates
pr2_taxo_updated <- reduce(pr2_taxo_list, bind_rows) %>%
  distinct(.keep_all = TRUE)

# Entries still in pr2_taxo_new whose taxo_id was not updated at any level
pr2_taxo_missing <- filter(
  pr2_taxo_new,
  !(taxo_id %in% pr2_taxo_updated$taxo_id)
)

# Distinct old lineages (kingdom to class) of those remaining entries
pr2_class_missing <- pr2_taxo_missing %>%
  select(kingdom_old:class_old) %>%
  distinct()

kable(pr2_class_missing)
kable(pr2_taxo_missing)

# Append the remaining entries, sort by the old lineage and place the
# new-taxonomy columns just before taxo_id
pr2_taxo_updated <- bind_rows(pr2_taxo_updated, pr2_taxo_missing) %>%
  arrange(
    kingdom_old, supergroup_old, division_old, class_old,
    order_old, family_old, genus_old, species_old
  ) %>%
  relocate(!! pr2.env$taxo_new_levels, .before = taxo_id)

# taxo_id values that occur more than once in the assembled table
pr2_taxo_updated_duplicated <- pr2_taxo_updated %>%
  add_count(taxo_id) %>%
  filter(n > 1)
6 Check Xs are OK
To speed things up, first use `filter_all()` to keep only the rows that contain an `_X` in at least one column, fix those rows, then bind the two data frames back together. See https://stackoverflow.com/questions/55478879/filtering-multiple-columns-with-str-detect
# Make the number of X's in placeholder names ("_X", "_XX", ...) consistent
# along each lineage, then export the table to Excel.
taxo_levels <- pr2.env$taxo_new_levels
taxo_levels_number <- length(taxo_levels)

# Rows with "_X" in at least one column are processed row by row below;
# all other rows are set aside and re-attached afterwards.
pr2_taxo_updated_1 <- pr2_taxo_updated %>%
  filter_all(any_vars(str_detect(., "_X")))

pr2_taxo_updated_2 <- pr2_taxo_updated %>%
  filter(!(taxo_id %in% pr2_taxo_updated_1$taxo_id))

nrow(pr2_taxo_updated_1)

# seq_len() instead of 1:n so that nothing is iterated when no row contains
# "_X" (1:0 would run the body for i = 1 and i = 0).
for (i in seq_len(nrow(pr2_taxo_updated_1))) {
  for (j in seq_len(max(taxo_levels_number - 1, 0))) {
    get_out <- FALSE

    # Check first if level j contains "_X". Cells are extracted as scalars
    # with [[ ]]; isTRUE() makes an NA cell count as "no _X" instead of
    # aborting the loop with "missing value where TRUE/FALSE needed".
    if (isTRUE(str_detect(pr2_taxo_updated_1[[taxo_levels[j]]][i], "_X"))) {
      pr2_taxo_updated_1[i, "X_flag"] <- 1

      # Walk down the finer levels and give each one the right number of X
      for (k in (j + 1):taxo_levels_number) {
        # Stop as soon as a level does not contain _X (or is NA)
        if (!isTRUE(str_detect(pr2_taxo_updated_1[[taxo_levels[k]]][i], "_X"))) break

        # One more X per level below j; the last level (species) gets the
        # same number of X as the level above it (genus)
        n_X <- if (k < taxo_levels_number) k - j + 1 else k - j

        # Replace the existing run of X by exactly n_X of them
        pr2_taxo_updated_1[[taxo_levels[k]]][i] <- str_replace(
          pr2_taxo_updated_1[[taxo_levels[k]]][i],
          "_[X]+",
          str_c("_", str_dup("X", n_X))
        )
        get_out <- TRUE
      }
    }

    # Once a run of X's has been rewritten, this row is done
    if (get_out) break
  }
}

# Re-attach the untouched rows and restore the ordering by old lineage
pr2_taxo_updated <- bind_rows(pr2_taxo_updated_1, pr2_taxo_updated_2) %>%
  arrange(kingdom_old, supergroup_old, division_old, class_old,
          order_old, family_old, genus_old, species_old)

openxlsx::write.xlsx(pr2_taxo_updated, full_path("pr2_new_taxonomy_species.xlsx"),
                     firstActiveRow = 2, firstActiveCol = 9, zoom=85)
7 Check taxonomy
8 Check duplicated taxa
# Disabled (never runs while the condition is FALSE): list the entries of the
# updated taxonomy whose name at class ... species level appears in the
# duplicated-taxa workbook, and export them to Excel.
if (FALSE) {
  td <- read_xlsx(full_path("taxa_duplicated.xlsx"))

  # One table per value of `level`; the filter below pairs 9 with species,
  # 8 with genus, 7 with family, 6 with order and 5 with class
  td9 <- td %>% filter(level == 9)
  td8 <- td %>% filter(level == 8)
  td7 <- td %>% filter(level == 7)
  td6 <- td %>% filter(level == 6)
  td5 <- td %>% filter(level == 5)

  # Keep a row as soon as its name at any of these ranks is listed
  duplicates <- filter(
    pr2_taxo_updated,
    (species %in% td9$name) |
      (genus %in% td8$name) |
      (family %in% td7$name) |
      (order %in% td6$name) |
      (class %in% td5$name)
  )

  openxlsx::write.xlsx(
    duplicates,
    full_path("duplicated_species.xlsx"),
    firstActiveRow = 2, firstActiveCol = 9, zoom = 85
  )
}