source(here::here("R",'PR2_init.R'), echo=FALSE)
source(here::here("R",'PR2_read_google.R'), echo=FALSE)
PR2 version 5.0.0
Misc check-in and tests
Aim
- Testing refdb package
- Checking sequences that have been labelled as EBS (erroneous barcode sequences by Jin et al. 2020)
- Assignement of strains sequences using the Genbank annotation at species level:
- Update pr2_taxonomy with new taxa: 2279
- Update pr2_main with new species assigned: 19,405
- Chimeras in initial PR2 database (2012): 1258 not yet annotated in PR2 are annotated
Initialization
Set up files
$date = format(Sys.time(), "%Y-%m-%d")
pr2.env<- here::here("5.0", "Z - Misc")
dir_pr2_update
<- function(file_name){
full_path str_c(dir_pr2_update,"/", file_name)
}
# create the directory for taxonomy output
dir.create(full_path("taxo"), showWarnings = FALSE)
Testing the refdb package
library(refdb)
<- pr2_taxo %>%
lib filter(domain %in% c("Eukaryota"))
<- refdb_set_fields(lib,
lib taxonomy = c(family = "supergroup",
genus = "division",
species = "class"))
refdb_plot_tax_treemap(lib)
refdb_plot_tax_tree(lib, leaf_col = "class",
expand_plot = 1.5)
Check duplicates in tables
%>%
pr2_seq count(pr2_accession) %>%
filter(n>1)
Check for that have been labelled as EBS (erroneous barcode sequences by Jin et al. 2020)
Decide not to remove these sequences because a large number are annotated by EukRibo.
- Sequences # : 4285
- Number of EBS sequences that are annotated by EuKRibo: 1739
- Number of EBS sequences that are removed: 29
Reference
- Jin, Soyeong, Kwang Young Kim, Min Seok Kim, et Chungoo Park. 2020. « An assessment of the taxonomic reliability of dna barcode sequences in publicly available databases ». Algae 35 (3): 293‑301. https://doi.org/10.4490/algae.2020.35.9.4.
<- dvutils::fasta_read(full_path("PR2_EBS_Jin_2020_wrong_sequences.fasta")) %>%
ebs pull(seq_name)
<- pr2 %>%
pr2_ebs filter(pr2_accession %in% ebs)
print(glue::glue("Number of EBS sequences that are annotated by EuKRibo: {nrow(filter(pr2_ebs, !is.na(eukribo_UniEuk_taxonomy_string)))}"))
print(glue::glue("Number of EBS sequences that are inactive or removed: {nrow(filter(pr2_ebs, !is.na(removed_version)))}"))
::export(pr2_ebs, full_path("PR2_EBS_Jin_2020_wrong_sequences.xlsx")) rio
PR2 sequences from strains not assigned
These sequences originate from a strain but are not yet identified in pr2_main (is.na(species))
- Extract species
- If species is not found in pr2_taxo, then extract genus and merge with taxo from genus.
- Remove all “aff.”, “var.”, “nr.”
- Update pr2_taxonomy with new taxa: 2279
- Update pr2_main with new species assigned: 19,405
Add new species
<- pr2 %>%
pr2_strains filter(is.na(species),
!is.na(gb_strain),
is.na(removed_version),
!str_detect(gb_organism, " OTU")) %>%
mutate(species_new = str_replace(gb_organism, "aff[.][ ]|cf[.][ ]|nr[.][ ]", ""),
species_new = str_extract(species_new, "\\w+[ ]\\w+"),
species_new = str_replace(species_new, " ", "_"),
species_new = str_replace(species_new, "sp$", "sp.")) %>%
select(-sequence)
<- pr2_taxo %>%
pr2_taxo_genus select(domain:genus) %>%
distinct()
<- pr2_strains %>%
pr2_strains_species_missing select(species = species_new) %>%
distinct() %>%
arrange(species) %>%
left_join(filter(pr2_taxo)) %>%
filter(is.na(domain)) %>%
select(species) %>%
mutate(genus = str_extract(species, "[:alpha:]+")) %>%
left_join(pr2_taxo_genus) %>%
filter(!is.na(domain)) %>%
mutate(taxo_edited_version = str_c(pr2.env$version),
taxo_edited_by = "D. Vaulot")
::export(pr2_strains_species_missing, full_path("pr2_taxo_strains_to_add.xlsx")) rio
Annotate the species
# Reload the taxonomy which has been updated
<- db_info("pr2_google")
pr2_db
<- db_connect(pr2_db)
pr2_db_con
<- tbl(pr2_db_con, "pr2_taxonomy") %>%
pr2_taxo filter (is.na(taxo_removed_version)) %>%
collect()
db_disconnect(pr2_db_con)
# Select unannotated sequences that correspond to strains
<- pr2 %>%
pr2_strains filter(is.na(species),
!is.na(gb_strain),
is.na(removed_version),
!str_detect(gb_organism, " OTU")) %>%
mutate(species_new = str_replace(gb_organism, "aff[.][ ]|cf[.][ ]|nr[.][ ]", ""),
species_new = str_extract(species_new, "\\w+[ ]\\w+"),
species_new = str_replace(species_new, " ", "_"),
species_new = str_replace(species_new, "sp$", "sp.")) %>%
filter(species_new %in% pr2_taxo$species) %>%
mutate(edited_version = str_c(pr2.env$version,"; ", replace_na(edited_version, "")),
edited_by = str_c(pr2.env$editor, "; ", replace_na(edited_by, "")) ,
edited_remark = "New sequences from strains") %>%
select(pr2_accession, species = species_new, edited_version, edited_by, edited_remark )
::export(pr2_strains, full_path("pr2_main_strains_to_update.xlsx")) rio
Chimeras version 2012
Read pr2_main
# Reload the taxonomy which has been updated
<- db_info("pr2_google")
pr2_db
<- db_connect(pr2_db)
pr2_db_con
<- tbl(pr2_db_con, "pr2_main") %>%
pr2_main collect()
db_disconnect(pr2_db_con)
Read pr2_chimeras from original PR2 database
- 1394 sequences in PR2 original table
- 1258 not yet annotated in PR2 are annotated
<- rio::import(full_path("pr2_chimera_original.tsv")) %>%
pr2_chimera mutate(genbank_accession = str_extract(id, "[:alnum:]*"))
<- pr2_main %>%
pr2_main_chimera filter(genbank_accession %in% pr2_chimera$genbank_accession,
is.na(chimera)) %>%
mutate(chimera = 1,
chimera_remark = "chimera in original PR2 database",
removed_version = pr2.env$version) %>%
select(pr2_accession, chimera, chimera_remark,removed_version )
::export(pr2_main_chimera, full_path("pr2_main_chimera.xlsx")) rio