PR2 version 4.13.0
New taxonomy with 9 ranks

1 Init

Load the variables common to the different scripts and the necessary libraries

  # Default chunk options for this notebook. cache.extra is given the
  # file.info() of the Excel workbook holding the taxonomy changes.
  knitr::opts_chunk$set(
    tidy = FALSE,
    eval = FALSE,
    cache = TRUE,
    cache.extra = file.info("../updates/2020 4.13.0 new taxonomy/pr2_new_taxonomy.xlsx")
  )

  # Shared initialisation script (common variables and libraries)
  source("PR2_init.R", echo = FALSE)

2 Reference

  • Adl SM et al. 2019. Revisions to the classification, nomenclature, and diversity of eukaryotes. J. Eukaryot. Microbiol. 66(1):4-119.
  • Burki F, Roger AJ, Brown MW, Simpson AGB. 2020. The New Tree of Eukaryotes. Trends. Ecol. Evol. 35(1):43-55.

3 Set up the files

  # Directory of this update: the Excel workbook is read from it and the
  # output files are written to it (see full_path() below)
  dir_pr2_update <- "../updates/2020 4.13.0 new taxonomy"
  
  # Editor name stored in pr2.env (object defined outside this section,
  # presumably by PR2_init.R - TODO confirm)
  pr2.env$editor <- "M. Jamy"

  # Build the path of a file (or sub-directory) located in the update directory.
  #   file_name : name relative to dir_pr2_update (character vector)
  # Returns "<dir_pr2_update>/<file_name>".
  # dir_pr2_update is looked up when the function is called, not when it is
  # defined, so it must exist in the calling session.
  full_path <- function(file_name) {
    file.path(dir_pr2_update, file_name)
  }

  # Sub-directory "taxo" of the update directory, used later as output
  # directory of the taxonomy check; no warning if it already exists
  dir.create(full_path("taxo"), showWarnings = FALSE)

  # Excel workbook containing the taxonomy changes
  file_pr2_update_excel <- full_path("pr2_new_taxonomy.xlsx")

4 Read taxonomy update

# Read the sheet listing the taxonomy changes; empty cells and "-" are read
# as NA. The pr2_accession and sequence columns are dropped.
pr2_update <- select(
  read_excel(
    file_pr2_update_excel,
    sheet = "pr2_taxonomy_changes",
    guess_max = 200000,
    na = c("", "-")
  ),
  -c(pr2_accession, sequence)
)

# Displayed in the notebook output
str_c("Number of lines : ", nrow(pr2_update))

5 Change at each level

5.1 Change the fields to _old

  # Working copy of the current taxonomy: keep the taxonomy level columns and
  # taxo_id, then append "_old" to the name of every level column
  pr2_taxo_new <- rename_with(
    dplyr::select(pr2_taxo, all_of(pr2.env$taxo_levels), taxo_id),
    function(level_name) str_c(level_name, "_old"),
    .cols = any_of(pr2.env$taxo_levels)
  )

  # Containers filled level by level (species, genus, family, order, class):
  # matched taxonomy rows and the corresponding lines of the Excel sheet
  pr2_taxo_list <- list()
  pr2_update_list <- list()

5.2 Species

   # Species level: lines of the Excel sheet whose species_old is filled in
   one_level <- "species"

   pr2_update_list[[one_level]] <- filter(pr2_update, !is.na(species_old))

   cat("N rows Excel - level ",one_level, "-", nrow(pr2_update_list[[one_level]]), "\n")

   # Taxonomy rows matched by the sheet. No `by` is given, so the join uses
   # every column the two tables have in common.
   pr2_taxo_list[[one_level]] <- inner_join(pr2_taxo_new,
                                            pr2_update_list[[one_level]])

   cat("N rows Taxo - level ",one_level, "-", nrow(pr2_taxo_list[[one_level]]), "\n")

   # Check 1: species_old values of the sheet absent from the current taxonomy
   df <- pr2_update_list[[one_level]] %>%
     filter(!(species_old %in% pr2_taxo_new$species_old)) %>%
     select(supergroup_old:species_old)
   kable(df, caption = "Species not yet in PR2 \n")

   # Check 2: lines of the sheet whose species_old is not among the joined rows
   df <- filter(pr2_update_list[[one_level]],
                !(species_old %in% pr2_taxo_list[[one_level]]$species_old))
   kable(df, caption = str_c(one_level, " for which the  old taxo is wrong \n"))

   # Check 3: taxo_id present more than once after the join
   pr2_taxo_updated_duplicated <- pr2_taxo_list[[one_level]] %>%
     add_count(taxo_id) %>%
     filter(n > 1)
   df <- pr2_taxo_updated_duplicated
   kable(df, caption = str_c(one_level, " for which the  taxo duplicated \n"))

   # Drop the rows just updated so that the next levels do not match them again
   pr2_taxo_new <- filter(pr2_taxo_new,
                          !(taxo_id %in% pr2_taxo_list[[one_level]]$taxo_id))

5.3 Genus

   # Genus level: lines of the Excel sheet with genus_old filled in and
   # species_old empty
   one_level <- "genus"

   pr2_update_list[[one_level]] <- pr2_update %>%
     filter(!is.na(genus_old), is.na(species_old)) %>%
     select(-species_old)

   cat("N rows Excel - level ",one_level, "-", nrow(pr2_update_list[[one_level]]), "\n")

   # Join on the columns shared by both tables (no `by` given); when the sheet
   # gives no new species name, the old species name is kept
   pr2_taxo_list[[one_level]] <- pr2_taxo_new %>%
     inner_join(pr2_update_list[[one_level]]) %>%
     mutate(species = coalesce(species, species_old))

   cat("N rows Taxo - level ",one_level, "-", nrow(pr2_taxo_list[[one_level]]), "\n")

   # genus_old values of the sheet absent from the remaining taxonomy rows
   cat(one_level," not in PR2 \n")
   pr2_update_list[[one_level]] %>%
     filter(!(genus_old %in% pr2_taxo_new$genus_old)) %>%
     select(supergroup_old:genus_old)

   # Lines of the sheet whose genus_old is not among the joined rows
   cat(one_level, " for which the  old taxo is wrong \n")
   filter(pr2_update_list[[one_level]],
          !(genus_old %in% pr2_taxo_list[[one_level]]$genus_old))

   # taxo_id present more than once after the join (stored, not displayed)
   cat(one_level, " for which the  taxo duplicated \n")
   pr2_taxo_updated_duplicated <- filter(
     add_count(pr2_taxo_list[[one_level]], taxo_id),
     n > 1
   )

   # Drop the taxonomy rows that have just been updated at this level
   pr2_taxo_new <- filter(pr2_taxo_new,
                          !(taxo_id %in% pr2_taxo_list[[one_level]]$taxo_id))

5.4 Family

   # Family level: lines of the Excel sheet with family_old filled in and
   # genus_old / species_old empty
   one_level <- "family"

   pr2_update_list[[one_level]] <- pr2_update %>%
     filter(is.na(species_old) & is.na(genus_old) & !is.na(family_old)) %>%
     select(-species_old, -genus_old)

   cat("N rows Excel - level ",one_level, "-", nrow(pr2_update_list[[one_level]]), "\n")

   # Join on the columns shared by both tables (no `by` given); ranks below the
   # family that the sheet leaves empty keep their old name
   pr2_taxo_list[[one_level]] <- pr2_taxo_new %>%
     inner_join(pr2_update_list[[one_level]]) %>%
     mutate(species = coalesce(species, species_old),
            genus = coalesce(genus, genus_old))

   cat("N rows Taxo - level ",one_level, "-", nrow(pr2_taxo_list[[one_level]]), "\n")

   # family_old values of the sheet absent from the remaining taxonomy rows
   cat(one_level," not in PR2 \n")
   pr2_update_list[[one_level]] %>%
     select(supergroup_old:family_old) %>%
     filter(!family_old %in% pr2_taxo_new$family_old)

   # Lines of the sheet whose family_old is not among the joined rows
   cat(one_level, " for which the  old taxo is wrong \n")
   pr2_update_list[[one_level]] %>%
     filter(!family_old %in% pr2_taxo_list[[one_level]]$family_old)

   # taxo_id present more than once after the join (stored, not displayed)
   cat(one_level, " for which the  taxo duplicated \n")
   pr2_taxo_updated_duplicated <- pr2_taxo_list[[one_level]] %>%
     add_count(taxo_id) %>%
     filter(n > 1)

   # Drop the taxonomy rows that have just been updated at this level
   pr2_taxo_new <- pr2_taxo_new %>%
     filter(!taxo_id %in% pr2_taxo_list[[one_level]]$taxo_id)

5.5 Order

   # Order level: lines of the Excel sheet with order_old filled in and all
   # lower ranks (family, genus, species) empty
   one_level <- "order"

   pr2_update_list[[one_level]] <- pr2_update %>%
     filter(!is.na(order_old),
            is.na(family_old),
            is.na(genus_old),
            is.na(species_old)) %>%
     select(-c(species_old, genus_old, family_old))

   cat("N rows Excel - level ",one_level, "-", nrow(pr2_update_list[[one_level]]), "\n")

   # Join on the columns shared by both tables (no `by` given); ranks below the
   # order that the sheet leaves empty keep their old name
   pr2_taxo_list[[one_level]] <- pr2_taxo_new %>%
     inner_join(pr2_update_list[[one_level]]) %>%
     mutate(species = coalesce(species, species_old),
            genus = coalesce(genus, genus_old),
            family = coalesce(family, family_old))

   cat("N rows Taxo - level ",one_level, "-", nrow(pr2_taxo_list[[one_level]]), "\n")

   # order_old values of the sheet absent from the remaining taxonomy rows
   cat(one_level," not in PR2 \n")
   pr2_update_list[[one_level]] %>%
     filter(!(order_old %in% pr2_taxo_new$order_old)) %>%
     select(supergroup_old:order_old)

   # Lines of the sheet whose order_old is not among the joined rows
   cat(one_level, " for which the  old taxo is wrong \n")
   filter(pr2_update_list[[one_level]],
          !(order_old %in% pr2_taxo_list[[one_level]]$order_old))

   # taxo_id present more than once after the join (stored, not displayed)
   cat(one_level, " for which the  taxo duplicated \n")
   pr2_taxo_updated_duplicated <- filter(
     add_count(pr2_taxo_list[[one_level]], taxo_id),
     n > 1
   )

   # Drop the taxonomy rows that have just been updated at this level
   pr2_taxo_new <- filter(pr2_taxo_new,
                          !(taxo_id %in% pr2_taxo_list[[one_level]]$taxo_id))

5.6 Class

   # Class level: lines of the Excel sheet with class_old filled in and all
   # lower ranks (order, family, genus, species) empty
   one_level <- "class"

   pr2_update_list[[one_level]] <- pr2_update %>%
     filter(!is.na(class_old),
            is.na(order_old),
            is.na(family_old),
            is.na(genus_old),
            is.na(species_old)) %>%
     select(-c(species_old, genus_old, family_old, order_old))

   cat("N rows Excel - level ",one_level, "-", nrow(pr2_update_list[[one_level]]), "\n")

   # Join on the columns shared by both tables (no `by` given); ranks below the
   # class that the sheet leaves empty keep their old name
   pr2_taxo_list[[one_level]] <- pr2_taxo_new %>%
     inner_join(pr2_update_list[[one_level]]) %>%
     mutate(species = coalesce(species, species_old),
            genus = coalesce(genus, genus_old),
            family = coalesce(family, family_old),
            order = coalesce(order, order_old))

   cat("N rows Taxo - level ",one_level, "-", nrow(pr2_taxo_list[[one_level]]), "\n")

   # class_old values of the sheet absent from the remaining taxonomy rows
   cat(one_level," not in PR2 \n")
   pr2_update_list[[one_level]] %>%
     filter(!(class_old %in% pr2_taxo_new$class_old)) %>%
     select(supergroup_old:class_old)

   # Lines of the sheet whose class_old is not among the joined rows
   cat(one_level, " for which the  old taxo is wrong \n")
   filter(pr2_update_list[[one_level]],
          !(class_old %in% pr2_taxo_list[[one_level]]$class_old))

   # taxo_id present more than once after the join (stored, not displayed)
   cat(one_level, " for which the  taxo duplicated \n")
   pr2_taxo_updated_duplicated <- filter(
     add_count(pr2_taxo_list[[one_level]], taxo_id),
     n > 1
   )

   # Drop the taxonomy rows that have just been updated at this level
   pr2_taxo_new <- filter(pr2_taxo_new,
                          !(taxo_id %in% pr2_taxo_list[[one_level]]$taxo_id))

5.7 Assembling it all

  # Stack the rows updated at each level and drop exact duplicate rows
  pr2_taxo_updated <- distinct(bind_rows(pr2_taxo_list))

  # Taxonomy rows that none of the levels above has updated
  pr2_taxo_missing <- filter(pr2_taxo_new,
                             !(taxo_id %in% pr2_taxo_updated$taxo_id))

  # Distinct old lineages (kingdom to class) of these rows
  pr2_class_missing <- pr2_taxo_missing %>%
    select(kingdom_old:class_old) %>%
    distinct()

  kable(pr2_class_missing)
  kable(pr2_taxo_missing)

  # Append the non-updated rows, sort on the old taxonomy and move the new
  # taxonomy columns just before taxo_id
  pr2_taxo_updated <- bind_rows(pr2_taxo_updated, pr2_taxo_missing) %>%
    arrange(kingdom_old, supergroup_old, division_old, class_old,
            order_old, family_old, genus_old, species_old) %>%
    relocate(all_of(pr2.env$taxo_new_levels), .before = taxo_id)

  # taxo_id present more than once in the assembled table (stored, not displayed)
  pr2_taxo_updated_duplicated <- filter(add_count(pr2_taxo_updated, taxo_id),
                                        n > 1)

6 Check Xs are OK

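To speed things up, first use filter_all() to select only the rows that contain an _X in at least one column, correct the number of X in these rows, and then merge the two data frames (rows with and without _X) back together. See https://stackoverflow.com/questions/55478879/filtering-multiple-columns-with-str-detect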

  # Make the number of "X" in placeholder names consistent along each lineage.
  # For the first level containing "_X", the consecutive lower levels that also
  # contain "_X" get one more X per level; the last level (species) gets the
  # same number of X as the level just above it.
  taxo_levels <- pr2.env$taxo_new_levels
  taxo_levels_number <- length(taxo_levels)

  # Rows with "_X" in at least one column: only these go through the loop
  pr2_taxo_updated_1 <- pr2_taxo_updated %>%
     filter_all(any_vars(str_detect(., "_X")))

  # All the other rows are left untouched
  pr2_taxo_updated_2 <- pr2_taxo_updated %>%
     filter(!(taxo_id %in%   pr2_taxo_updated_1$taxo_id))

  nrow(pr2_taxo_updated_1)

  # seq_len() so that nothing is executed when no row contains "_X"
  # (1:nrow() would iterate over c(1, 0) for an empty table)
  for (i in seq_len(nrow(pr2_taxo_updated_1))) {
    for (j in seq_len(taxo_levels_number - 1)) {
      get_out <- FALSE

      # Check first if level contains "_X".
      # isTRUE(): a missing (NA) name counts as "no _X" instead of stopping
      # the loop with an error.
      if (isTRUE(str_detect(pr2_taxo_updated_1[[taxo_levels[j]]][i], "_X"))) {

        # If true cycle the remaining columns and add the correct number of X
        pr2_taxo_updated_1[i, "X_flag"] <- 1
        for (k in (j + 1):taxo_levels_number) {
          current_name <- pr2_taxo_updated_1[[taxo_levels[k]]][i]

          # If the next level does not contain _X get out
          if (!isTRUE(str_detect(current_name, "_X"))) break

          # For the species level, same number of X than for genus
          n_X <- if (k < taxo_levels_number) k - j + 1 else k - j

          # Replace _XXX by the good number of X (n_X)
          pr2_taxo_updated_1[[taxo_levels[k]]][i] <- str_replace(
            current_name,
            "_[X]+",
            str_c("_", str_dup("X", n_X))
          )
          get_out <- TRUE
        }
      }
      # Once a run of "_X" levels has been rewritten, go to the next row
      if (get_out) break
    }
  }

  # Put corrected and untouched rows back together, sorted on the old taxonomy
  pr2_taxo_updated <- bind_rows(pr2_taxo_updated_1, pr2_taxo_updated_2)  %>%
    arrange(kingdom_old, supergroup_old, division_old, class_old, order_old, family_old, genus_old, species_old)

  # Export the updated taxonomy to the update directory
  openxlsx::write.xlsx(pr2_taxo_updated, full_path("pr2_new_taxonomy_species.xlsx"),
                       firstActiveRow = 2,  firstActiveCol = 9, zoom=85)

7 Check taxonomy

# Distinct combinations of the new taxonomy columns (domain to species)
pr2_taxo_9 <- distinct(select(pr2_taxo_updated, domain:species))

# Taxonomy check from the dvutils package, with the "taxo" sub-directory of
# the update directory passed as dir_taxo
dvutils::pr2_taxo_check(
  pr2_taxo_9,
  taxo_levels = pr2.env$taxo_new_levels,
  dir_taxo = full_path("taxo")
)

8 Check duplicated taxa

if (FALSE) {
  # Disabled block (never executed as written): lists the rows of the updated
  # taxonomy whose name at levels 5 to 9 appears in taxa_duplicated.xlsx
  td <- read_xlsx(full_path("taxa_duplicated.xlsx"))

  # One table of names per level; each is matched below against one taxonomy
  # column (level 5 with class, up to level 9 with species)
  td5 <- td %>% filter(level == 5)
  td6 <- td %>% filter(level == 6)
  td7 <- td %>% filter(level == 7)
  td8 <- td %>% filter(level == 8)
  td9 <- td %>% filter(level == 9)

  duplicates <- filter(
    pr2_taxo_updated,
    (species %in% td9$name) |
      (genus %in% td8$name) |
      (family %in% td7$name) |
      (order %in% td6$name) |
      (class %in% td5$name)
  )

  openxlsx::write.xlsx(duplicates, full_path("duplicated_species.xlsx"),
                       firstActiveRow = 2, firstActiveCol = 9, zoom = 85)
}

Daniel Vaulot

06 06 2020