source(here::here("R",'PR2_init.R'), echo=FALSE)
PR2 version 5.0.0
EukRibo version 2.0
Init
<- "K - EukRibo"
dir_pr2_update
<- function(file_name){here::here("5.0",dir_pr2_update , file_name)} full_path
Import the different tables
<- db_info("pr2_google")
pr2_db
<- db_connect(pr2_db)
pr2_db_con
<- tbl(pr2_db_con, "pr2_main") %>%
pr2_main filter(is.na(removed_version)) %>%
collect()
<- pr2_main %>%
pr2_main_not_annotated filter(is.na(species))
<- pr2_main %>%
pr2_main_annotated filter(!is.na(species))
<- tbl(pr2_db_con, "pr2_taxonomy") %>%
pr2_taxo filter (is.na(taxo_removed_version)) %>%
collect()
<- tbl(pr2_db_con, "eukribo_v2") %>%
eukribo collect()
db_disconnect(pr2_db_con)
Find sequences not yet in PR2
Number of new sequences added: 510
<- filter(eukribo, !(gb_accession %in% pr2_main$genbank_accession))
pr2_new
# Updated sequences that are not present in PR2
pr2_new
# Use this data frame to download new sequences and then restart from beginning
<- pr2_new %>%
pr2_new pull(gb_accession)
saveRDS(pr2_new, full_path("accessions_new_eukribo.rds"))
Add species info to PR2 sequences
Number of species added: 930
Find EukRibo that have species information and are already in PR2 database but without species information
<- eukribo_to_add %>%
eukribo_to_add_species filter(str_detect(eukribo_UniEuk_taxonomy_string, "\\w+[+]\\w+")) %>%
mutate(species = str_extract(eukribo_UniEuk_taxonomy_string, "\\w+[+]\\w+"),
species = str_replace(species,"[+]", "_"),
genus = str_extract(species, "[:alpha:]+")) %>%
relocate(genus, species, .after = "gb_accession")
Find species missing in PR2
<- select(pr2_taxo,domain:genus) %>%
pr2_genus distinct()
<- eukribo_to_add_species %>%
eukribo_missing_species filter(!(species %in% pr2_taxo$species)) %>%
select(genus,species, eukribo_UniEuk_taxonomy_string) %>%
distinct() %>%
left_join(pr2_genus, by = join_by(genus)) %>%
mutate(taxo_edited_version = str_c(pr2.env$version),
taxo_edited_by = str_c("EukRibo"),
taxo_remark = "Added for new EukRibo")
::export(eukribo_missing_species, full_path("pr2_taxo_new.xlsx")) rio
Check duplicates and taxonomy
<- rio::import(full_path("pr2_taxo_new_edited.xlsx"))
pr2_taxo_to_add
<- pr2_taxo_to_add %>%
duplicates count(species) %>%
filter(n>1)
Check final taxonomy
<- db_info("pr2_google")
pr2_db
<- db_connect(pr2_db)
pr2_db_con
<- tbl(pr2_db_con, "pr2_taxonomy") %>%
pr2_taxo filter (is.na(taxo_removed_version)) %>%
collect()
db_disconnect(pr2_db_con)
pr2_taxo_check(pr2_taxo, pr2.env$taxo_levels[[pr2.env$taxo_levels_number]], full_path("taxo"))
Merge pr2 and EukRibo
Number of EukRibo sequences used to annotate PR2: 1256
<- eukribo %>%
eukribo_update_pr2 filter(gb_accession %in% pr2_main_not_annotated$genbank_accession) %>%
filter(str_detect(eukribo_UniEuk_taxonomy_string, "\\w+[+]\\w+")) %>%
mutate(species = str_extract(eukribo_UniEuk_taxonomy_string, "\\w+[+]\\w+"),
species = str_replace(species,"[+]", "_")) %>%
relocate(species, .after = "gb_accession")
<- pr2_main_not_annotated %>%
pr2_main_update rename(species_old = species) %>%
left_join(eukribo_update_pr2, by = join_by(genbank_accession == gb_accession)) %>%
filter(!is.na(species)) %>%
mutate(edited_version = str_c(pr2.env$version,"; ", replace_na(edited_version, "")),
edited_by = str_c("EukRibo", "; ", replace_na(edited_by, ""))) %>%
select(pr2_accession, species, edited_version, edited_by)
::export(pr2_main_update, full_path("pr2_main_update.xlsx"))
rio
::glue("Number of EukRibo sequences used to annotate PR2: {nrow(pr2_main_update)}") glue
Final result
Number of PR2 sequences annotated with Eukribo: 48,136
Number of EukRibo sequences not in PR2: 1573
::glue("Number of PR2 sequences annotated with Eukribo; {nrow(filter(pr2_main_annotated, genbank_accession %in% eukribo$gb_accession))}")
glue::glue("Number of EukRibo sequences not in PR2: {nrow(filter(eukribo, !gb_accession %in% pr2_main_annotated$genbank_accession))}") glue
Comments
genus+species
)