PR2 version 5.0.0
Mitochondria/Nucleomorph sequences from original database

Author

Daniel Vaulot

Published

March 27, 2023

Comments

Strategy

  • Read original file from Richard

  • Dereplicate identical species

  • Add metadata that are not present

  • Add species that are not present

  • Add sequences only if species and metadata are present

  • mito-SSU 8661

  • nucleomorph-SSU 16

  • 6517 new sequences downloaded from GenBank (out of 8876 new)

Final

  • 1478 species added

  • Final number of sequences added: 1898

  • Final number of valid sequences added: 1830

    • mito: 1818 sequences
    • nucleomorph: 12 sequences
  • 6779 entries not added because of missing species in taxonomy table.

Init

source(here::here("R",'PR2_init.R'), echo=FALSE)
source(here::here("R",'PR2_read_google_quick.R'), echo=FALSE)

# In the case of Chrysophyceae both 18S and 16S genes are in the file so no need to filter

  # pr2_18S <- pr2 %>% 
  #   filter(gene == "18S_rRNA")
  # 
  # pr2_18S_active <- pr2_active %>% 
  #   filter(gene == "18S_rRNA")

Set up the files

dir_pr2_update <- "M - Mitochondria"

pr2.env$editor <- "D. Vaulot"

full_path <- function(file_name){here::here("5.0",dir_pr2_update , file_name)}

# create the directory for taxonomy output
dir.create(full_path("taxo"))

Read the original data and reformat

Read the data

  • mito-SSU 8661
  • nucleomorph-SSU 16
file_pr2_original = full_path("Euk-ssu-rrna_2012_05_25.txt.gz")

pr2_update <- read_delim(file_pr2_original, guess_max=200000, na=c("", "-")) %>% 
  filter (Domain == "Organelle",
          SuperGroup %in% c("mito-SSU", "nucleomorph-SSU"),
          !(`ID-Richard` %in% pr2$pr2_accession),
          str_length (Sequence) < 2000) %>% 
  rename(genus = Genus,
         species = Species,
         pr2_accession = `ID-Richard`,
         genbank_accession = EMBL,
         sequence = Sequence,
         label = TYPE) %>% 
  mutate(species = str_replace(species, "[+]", "_"),
         organelle = case_when( SuperGroup == "mito-SSU" ~ "mitochondrion",
                                SuperGroup == "nucleomorph-SSU" ~ "nucleomorph")) %>% 
  distinct(species, .keep_all = TRUE)
  
str_c("Number of sequences : ", nrow(pr2_update))

pr2_update %>% count(organelle)

pr2_update %>% count(species) %>% arrange(desc(n))

hist(str_length(pr2_update$sequence))

rio::export(pr2_update, full_path("pr2_original_mito_nucleomorph.xlsx"))

Add to PR2 missing sequences from Genbank

  • Run the script script_genbank_xml.R on server or locally

  • Run second part PR2-update-GenBank-template.qmd

  pr2_new <-  pr2_update %>% 
     filter(!(genbank_accession %in% pr2_metadata$genbank_accession)) 
  

# Updated sequences that are not present in PR2
  pr2_new
  
  # Use this data frame to download new sequences and then restart from beginning
  pr2_new <- pr2_new %>% 
    pull(genbank_accession)
  
 saveRDS(pr2_new, full_path("accessions_new_mito_2.rds"))
  • 6517 new sequences downloaded from GenBank (out of 8876 new)
  • Only metadata added

Update pr2_taxonomy

  • Add new species missing only if genus is already present
  • 1478 species added
pr2_taxo_genus <- pr2_taxo %>% 
  select(domain:genus) %>% 
  distinct()

pr2_species_missing <- pr2_update %>% 
  select(species) %>% 
  distinct() %>% 
  arrange(species) %>% 
  left_join(pr2_taxo) %>% 
  filter(is.na(domain)) %>% 
  select(species) %>% 
  mutate(genus =  str_extract(species, "[:alpha:]+")) %>% 
  left_join(pr2_taxo_genus) %>% 
  filter(!is.na(domain)) %>%
  mutate(taxo_edited_version = str_c(pr2.env$version),
         taxo_edited_by = "D. Vaulot")

rio::export(pr2_species_missing, full_path("pr2_taxo_to_add.xlsx"))

Update of table pr2_main

Update pr2

pr2_db <- db_info("pr2_google")

pr2_db_con <- db_connect(pr2_db)

pr2_main <- tbl(pr2_db_con, "pr2_main") %>%
  collect() 

pr2_taxo <- tbl(pr2_db_con, "pr2_taxonomy") %>%
  filter (is.na(taxo_removed_version)) %>%
  collect()


pr2_metadata <- tbl(pr2_db_con, "pr2_metadata") %>%
  collect() 

db_disconnect(pr2_db_con)

# Check duplicates

pr2_taxo %>% count(species) %>% filter(n> 1)

pr2_metadata %>% count(genbank_accession) %>% filter(n> 1)

Sequences that are uploaded

  • Only keep sequences for which:
    • species is in taxonomy table
    • genbank is in metadata table
  • New sequences: 1898
pr2_new <- pr2_update %>% 
  filter(species %in% pr2_taxo$species,
         genbank_accession %in% pr2_metadata$genbank_accession)

pr2_not_added <- pr2_update %>% 
  filter(!(genbank_accession %in% pr2_new$genbank_accession))


glue::glue("Number of updated sequences {nrow(pr2_new)}")

Add fields

pr2_new <- pr2_new %>%  
    select (pr2_accession, 
            genbank_accession,
            label,
            organelle,
            species,
            sequence) %>% 
    separate_wider_delim(pr2_accession, delim =".", names = c("junk1", "start", "end"), cols_remove = FALSE) %>% 
    separate_wider_delim(end, delim ="_", names = c("end", "junk2"), cols_remove = TRUE) %>% 
    mutate(added_version = pr2.env$version,
           edited_by = pr2.env$editor,
           remark = "Added from original PR2 database",
         gene = case_when( str_detect(organelle, "nucleomorph") ~ "18S_rRNA", 
                           str_detect(organelle, "chloroplast|mitochond") ~ "16S_rRNA",
                           TRUE ~ "18S_rRNA"),
         sequence_length = str_length(sequence),
         sequence_length = ifelse(sequence_length == 0, NA_integer_, sequence_length),
         ambiguities = str_count(sequence, pattern=pr2.env$ambig_regex),
         # Next time remove from here because it is now in the script to run on the server
         removed_version = case_when((sequence_length < pr2.env$sequence_length_min)|
                                     (sequence_length > pr2.env$sequence_length_max)|  
                                     (ambiguities > pr2.env$ambiguities_max)| 
                                     str_detect(sequence, pr2.env$sequence_N_repeat) ~ pr2.env$version),
         edited_remark = case_when((sequence_length < pr2.env$sequence_length_min) ~ "sequence too short",
                                   (ambiguities > pr2.env$ambiguities_max) ~ "too many ambiguities",
                                   str_detect(sequence, pr2.env$sequence_N_repeat) ~ "2 consecutive N")
         )

 pr2_new$sequence_hash = purrr::map_chr(pr2_new$sequence,digest::sha1)

Check duplicates

 pr2_new %>% 
  count(genbank_accession) %>% 
  filter(n > 1)

Get some statistics

 ggplot(pr2_new, aes(x=sequence_length)) +
  geom_histogram() +
  xlim(0,3000)

Final files for uploading

  • ” Final number of sequences added: 1898”
  • ” Final number of valid sequences added: 1830”
 sprintf(" Final number of sequences added:  %d", nrow(pr2_new))
 sprintf(" Final number of valid sequences added:  %d", nrow(filter(pr2_new, is.na(removed_version))))
 colnames(pr2_new)

 pr2_update_new <- pr2_new
 
  pr2_main_new <- pr2_update_new %>%  
    select (pr2_accession, genbank_accession, start,end, label, organelle, gene, species,
            added_version, edited_by, removed_version, edited_remark, remark)
  nrow(pr2_main_new)
  
  pr2_sequences_new <- pr2_update_new %>% 
    select(pr2_accession, sequence, sequence_length, ambiguities, sequence_hash)
  nrow(pr2_sequences_new)

Save everything to an Excel file

 file_pr2_import <-  full_path("pr2_import_final_mito.xlsx")
 onglets <- list("pr2_main_new" = pr2_main_new, 
                 "pr2_sequence_new" = pr2_sequences_new)
 rio::export(onglets, file_pr2_import, zoom = 90, firstRow = TRUE, firstCol = TRUE)