PR2 version 5.0.0
Misc check-in and tests

Author

Daniel Vaulot

Published

April 4, 2023

Aim

  • Testing refdb package
  • Checking sequences that have been labelled as EBS (erroneous barcode sequences by Jin et al. 2020)
  • Assignement of strains sequences using the Genbank annotation at species level:
    • Update pr2_taxonomy with new taxa: 2279
    • Update pr2_main with new species assigned: 19,405
  • Chimeras in initial PR2 database (2012): 1258 not yet annotated in PR2 are annotated

Initialization

source(here::here("R",'PR2_init.R'), echo=FALSE)

source(here::here("R",'PR2_read_google.R'), echo=FALSE)

Set up files

  pr2.env$date = format(Sys.time(), "%Y-%m-%d")
  dir_pr2_update <- here::here("5.0", "Z - Misc")
  
  full_path <- function(file_name){
    str_c(dir_pr2_update,"/", file_name)
    }

# create the directory for taxonomy output
  dir.create(full_path("taxo"), showWarnings = FALSE)

Testing the refdb package

library(refdb)
lib <- pr2_taxo %>% 
  filter(domain %in% c("Eukaryota"))

lib <- refdb_set_fields(lib,
                        taxonomy = c(family = "supergroup",
                                     genus = "division",
                                     species = "class"))

refdb_plot_tax_treemap(lib)
refdb_plot_tax_tree(lib, leaf_col = "class", 
  expand_plot = 1.5)

Check duplicates in tables

pr2_seq %>% 
  count(pr2_accession) %>% 
  filter(n>1)

Check for that have been labelled as EBS (erroneous barcode sequences by Jin et al. 2020)

Decide not to remove these sequences because a large number are annotated by EukRibo.

  • Sequences # : 4285
  • Number of EBS sequences that are annotated by EuKRibo: 1739
  • Number of EBS sequences that are removed: 29

Reference

  • Jin, Soyeong, Kwang Young Kim, Min Seok Kim, et Chungoo Park. 2020. « An assessment of the taxonomic reliability of dna barcode sequences in publicly available databases ». Algae 35 (3): 293‑301. https://doi.org/10.4490/algae.2020.35.9.4.
ebs <- dvutils::fasta_read(full_path("PR2_EBS_Jin_2020_wrong_sequences.fasta")) %>% 
  pull(seq_name)

pr2_ebs <- pr2 %>% 
  filter(pr2_accession %in% ebs) 



print(glue::glue("Number of EBS sequences that are annotated by EuKRibo: {nrow(filter(pr2_ebs, !is.na(eukribo_UniEuk_taxonomy_string)))}"))
print(glue::glue("Number of EBS sequences that are inactive or removed: {nrow(filter(pr2_ebs, !is.na(removed_version)))}"))

rio::export(pr2_ebs, full_path("PR2_EBS_Jin_2020_wrong_sequences.xlsx"))

PR2 sequences from strains not assigned

These sequences originate from a strain but are not yet identified in pr2_main (is.na(species))

  • Extract species
  • If species is not found in pr2_taxo, then extract genus and merge with taxo from genus.
    • Remove all “aff.”, “var.”, “nr.”
  • Update pr2_taxonomy with new taxa: 2279
  • Update pr2_main with new species assigned: 19,405

Add new species

pr2_strains <- pr2 %>% 
  filter(is.na(species),
         !is.na(gb_strain),
         is.na(removed_version),
         !str_detect(gb_organism, " OTU")) %>%
  mutate(species_new = str_replace(gb_organism, "aff[.][ ]|cf[.][ ]|nr[.][ ]", ""),
         species_new = str_extract(species_new, "\\w+[ ]\\w+"),
         species_new = str_replace(species_new, " ", "_"),
         species_new = str_replace(species_new, "sp$", "sp.")) %>% 
  select(-sequence)

pr2_taxo_genus <- pr2_taxo %>% 
  select(domain:genus) %>% 
  distinct()

pr2_strains_species_missing <- pr2_strains %>% 
  select(species = species_new) %>% 
  distinct() %>% 
  arrange(species) %>% 
  left_join(filter(pr2_taxo)) %>% 
  filter(is.na(domain)) %>% 
  select(species) %>% 
  mutate(genus =  str_extract(species, "[:alpha:]+")) %>% 
  left_join(pr2_taxo_genus) %>% 
  filter(!is.na(domain)) %>%
  mutate(taxo_edited_version = str_c(pr2.env$version),
         taxo_edited_by = "D. Vaulot")

rio::export(pr2_strains_species_missing, full_path("pr2_taxo_strains_to_add.xlsx"))

Annotate the species

# Reload the taxonomy which has been updated
pr2_db <- db_info("pr2_google")

pr2_db_con <- db_connect(pr2_db)

pr2_taxo <- tbl(pr2_db_con, "pr2_taxonomy") %>%
  filter (is.na(taxo_removed_version)) %>%
  collect()

db_disconnect(pr2_db_con)


# Select unannotated sequences that correspond to strains
pr2_strains <- pr2 %>% 
  filter(is.na(species),
         !is.na(gb_strain),
         is.na(removed_version),
         !str_detect(gb_organism, " OTU")) %>%
  mutate(species_new = str_replace(gb_organism, "aff[.][ ]|cf[.][ ]|nr[.][ ]", ""),
         species_new = str_extract(species_new, "\\w+[ ]\\w+"),
         species_new = str_replace(species_new, " ", "_"),
         species_new = str_replace(species_new, "sp$", "sp.")) %>% 
  filter(species_new %in% pr2_taxo$species) %>% 
  mutate(edited_version = str_c(pr2.env$version,"; ",  replace_na(edited_version, "")),
         edited_by = str_c(pr2.env$editor, "; ", replace_na(edited_by, "")) ,
         edited_remark = "New sequences from strains") %>% 
  select(pr2_accession, species = species_new, edited_version, edited_by, edited_remark ) 

  rio::export(pr2_strains, full_path("pr2_main_strains_to_update.xlsx"))

Chimeras version 2012

Read pr2_main

# Reload the taxonomy which has been updated
pr2_db <- db_info("pr2_google")

pr2_db_con <- db_connect(pr2_db)

pr2_main <- tbl(pr2_db_con, "pr2_main") %>%
  collect()

db_disconnect(pr2_db_con)

Read pr2_chimeras from original PR2 database

  • 1394 sequences in PR2 original table
  • 1258 not yet annotated in PR2 are annotated
pr2_chimera <- rio::import(full_path("pr2_chimera_original.tsv")) %>% 
  mutate(genbank_accession = str_extract(id, "[:alnum:]*"))

pr2_main_chimera <- pr2_main %>% 
  filter(genbank_accession %in% pr2_chimera$genbank_accession,
         is.na(chimera)) %>% 
  mutate(chimera = 1,
         chimera_remark = "chimera in original PR2 database",
         removed_version = pr2.env$version) %>% 
  select(pr2_accession, chimera, chimera_remark,removed_version )

rio::export(pr2_main_chimera, full_path("pr2_main_chimera.xlsx"))