PR2 version 5.0.0
EukRibo version 2.0

Author

Daniel Vaulot

Published

March 21, 2023

Comments

  • Add sequences from EukRibo not present in PR2
  • Add taxa (only bonafide species genus+species)
  • Annotated PR2_sequences which were missing species with annotation from EukRibo

Init

source(here::here("R",'PR2_init.R'), echo=FALSE)
dir_pr2_update <- "K - EukRibo"

full_path <- function(file_name){here::here("5.0",dir_pr2_update , file_name)}

Import the different tables

pr2_db <- db_info("pr2_google")

pr2_db_con <- db_connect(pr2_db)

pr2_main <- tbl(pr2_db_con, "pr2_main") %>%
  filter(is.na(removed_version)) %>% 
  collect() 

pr2_main_not_annotated <- pr2_main %>%
  filter(is.na(species)) 

pr2_main_annotated <- pr2_main %>%
  filter(!is.na(species)) 

pr2_taxo <- tbl(pr2_db_con, "pr2_taxonomy") %>%
  filter (is.na(taxo_removed_version)) %>%
  collect()

eukribo <- tbl(pr2_db_con, "eukribo_v2") %>% 
  collect() 
 
db_disconnect(pr2_db_con)

Find sequences not yet in PR2

Number of new sequences added: 510

  pr2_new <-  filter(eukribo, !(gb_accession %in% pr2_main$genbank_accession)) 

# Updated sequences that are not present in PR2
  pr2_new
  
  # Use this data frame to download new sequences and then restart from beginning
  pr2_new <- pr2_new %>% 
    pull(gb_accession)
  
 saveRDS(pr2_new, full_path("accessions_new_eukribo.rds"))

Add species info to PR2 sequences

Number of species added: 930

Find EukRibo that have species information and are already in PR2 database but without species information

eukribo_to_add_species <- eukribo_to_add %>% 
  filter(str_detect(eukribo_UniEuk_taxonomy_string, "\\w+[+]\\w+")) %>% 
  mutate(species = str_extract(eukribo_UniEuk_taxonomy_string, "\\w+[+]\\w+"),
         species = str_replace(species,"[+]", "_"), 
         genus =  str_extract(species, "[:alpha:]+")) %>% 
  relocate(genus, species, .after = "gb_accession")

Find species missing in PR2

pr2_genus <- select(pr2_taxo,domain:genus) %>% 
  distinct()

eukribo_missing_species <- eukribo_to_add_species %>% 
  filter(!(species %in% pr2_taxo$species)) %>% 
  select(genus,species, eukribo_UniEuk_taxonomy_string) %>% 
  distinct() %>% 
  left_join(pr2_genus, by = join_by(genus)) %>% 
  mutate(taxo_edited_version = str_c(pr2.env$version),
         taxo_edited_by = str_c("EukRibo"),
         taxo_remark = "Added for new EukRibo")

rio::export(eukribo_missing_species, full_path("pr2_taxo_new.xlsx"))

Check duplicates and taxonomy

pr2_taxo_to_add <- rio::import(full_path("pr2_taxo_new_edited.xlsx"))

duplicates <- pr2_taxo_to_add %>% 
  count(species) %>% 
  filter(n>1)

Check final taxonomy

pr2_db <- db_info("pr2_google")

pr2_db_con <- db_connect(pr2_db)

pr2_taxo <- tbl(pr2_db_con, "pr2_taxonomy") %>%
  filter (is.na(taxo_removed_version)) %>%
  collect()

db_disconnect(pr2_db_con)

pr2_taxo_check(pr2_taxo, pr2.env$taxo_levels[[pr2.env$taxo_levels_number]], full_path("taxo"))

Merge pr2 and EukRibo

Number of EukRibo sequences used to annotate PR2: 1256

eukribo_update_pr2 <- eukribo %>% 
  filter(gb_accession %in% pr2_main_not_annotated$genbank_accession) %>% 
  filter(str_detect(eukribo_UniEuk_taxonomy_string, "\\w+[+]\\w+")) %>% 
  mutate(species = str_extract(eukribo_UniEuk_taxonomy_string, "\\w+[+]\\w+"),
         species = str_replace(species,"[+]", "_")) %>% 
  relocate(species, .after = "gb_accession")

pr2_main_update <- pr2_main_not_annotated %>% 
  rename(species_old = species) %>% 
  left_join(eukribo_update_pr2, by = join_by(genbank_accession == gb_accession)) %>% 
  filter(!is.na(species)) %>% 
  mutate(edited_version = str_c(pr2.env$version,"; ",  replace_na(edited_version, "")),
         edited_by = str_c("EukRibo", "; ", replace_na(edited_by, ""))) %>% 
  select(pr2_accession, species, edited_version, edited_by)


rio::export(pr2_main_update, full_path("pr2_main_update.xlsx"))


glue::glue("Number of EukRibo sequences used to annotate PR2: {nrow(pr2_main_update)}")

Final result

Number of PR2 sequences annotated with Eukribo: 48,136

Number of EukRibo sequences not in PR2: 1573

glue::glue("Number of PR2 sequences annotated with Eukribo; {nrow(filter(pr2_main_annotated, genbank_accession %in% eukribo$gb_accession))}")
glue::glue("Number of EukRibo sequences not in PR2: {nrow(filter(eukribo, !gb_accession %in% pr2_main_annotated$genbank_accession))}")