PR2 version 5.0.0
Cilates L. Santoferrara et al.

Author

Daniel Vaulot

Published

February 24, 2023

Comments

16S Plastid database from R. Dorrell

Init

source(here::here("R",'PR2_init.R'), echo=FALSE)
source(here::here("R",'PR2_read_google_quick.R'), echo=FALSE)

  pr2_16S <- pr2 %>% 
    filter(gene == "16S_rRNA",
           organelle == "plastid")
  pr2_no_species <- pr2 %>% 
    filter(is.na(species))

Set up the files

target_group =  c("Eukaryota")
target_level = "domain"

dir_pr2_update <- "G - 16S_plastid"

pr2.env$editor <- "R. Dorrell."

full_path <- function(file_name){here::here("5.0",dir_pr2_update , file_name)}

# create the directory for taxonomy output
dir.create(full_path("taxo"))

Save current state of PR2 for group

pr2_16S_target <- pr2_16S_active  %>% 
  filter(!!as.symbol(target_level) %in% target_group) %>% 
  arrange(across(any_of(pr2.env$taxo_levels[[9]])))

rio::export(pr2_16S_target, 
            full_path("pr2_16S_plastid_2023-02-23.xlsx"),
            firstActiveRow = 2, 
            firstActiveCol = 2,
            zoom = 80)

Read the original data and reformat

Read the data

  • Number of sequences = 646
file_pr2_update_excel = full_path("Dorell_16S_plastid_trimmed_alignment.xlsx")

pr2_update <- rio::import(file_pr2_update_excel, sheet = "sequences", guess_max=200000, na=c("", "-")) %>% 
  mutate(species = str_c(genus, species_epithet, sep="_")) %>% 
  select(genbank_accession, genus, species, sequence)
  
str_c("Number of sequences : ", nrow(pr2_update))

Add to PR2 missing sequences from Genbank

  • Number of new sequences = 163 These sequences are from MMETSP and will not be included

  • Run the script script_genbank_xml.R on server

  • Run second part PR2-update-GenBank.R

  pr2_new <-  filter(pr2_update, !(genbank_accession %in% pr2$genbank_accession)) 

# Updated sequences that are not present in PR2
  pr2_new
  
  # Use this data frame to download new sequences and then restart from beginning
  pr2_new <- pr2_new %>% 
    pull(genbank_accession)
  
 saveRDS(pr2_new, full_path("accessions_new_16S.rds"))

Compare with sequences in PR2

  • Only keep species that have accession in PR2 and do not have species : 254

  • Sequences duplicated (e.g. with and without introns): 0

  # Only keep sequences that are in pr2
  pr2_update_final <- pr2_update %>% 
    filter((genbank_accession %in% pr2_no_species$genbank_accession)) 
  
   pr2_update_final

  
  # Sequences updated with 2 entries in PR2  (e.g. with and without introns) 
  left_join(select(pr2_update_final, genbank_accession), 
            select(pr2, genbank_accession, pr2_accession)) %>% 
   count(genbank_accession) %>% 
   filter(n > 1) 

Cehck that all species are present in PR2

  • Add if necessary using an Excel file
pr2_species_missing <- pr2_update_final %>% 
  filter(!(species %in% pr2_taxo$species)) %>% 
  select(species) %>% 
  arrange(species) %>% 
  distinct()
  
# rio::export(pr2_species_missing, full_path("pr2_species_missing_02.xlsx"))

Update of table pr2_main and pr2_sequence

Sequences that need updating

pr2_update_final <- pr2_update_final %>% 
  select(genbank_accession, species, sequence) %>% 
  dplyr::rename(species_new = species) %>% 
  left_join(select(pr2, pr2_accession, genbank_accession, 
                   gene, organelle, species, 
                   edited_version, edited_by, removed_version))

Add fields

pr2_main_updated <- pr2_update_final %>%  
    select (pr2_accession, 
            species_8 = species_new, 
            species_9 = species_new, 
            species_old = species,
            gene, 
            organelle,
            edited_version,
            edited_by,
            removed_version) %>% 
    mutate(edited_version = str_c(pr2.env$version,"; ",  replace_na(edited_version, "")),
           edited_by = str_c("R. Dorrell; ", replace_na(edited_by, "")) ) 

pr2_sequence_updated  <- pr2_update_final %>%  
    select (pr2_accession, sequence) %>% 
    mutate(sequence_length = str_length(sequence))

pr2_sequence_updated$sequence_hash = purrr::map_chr(pr2_sequence_updated$sequence,digest::sha1)

Save everything to an Excel file

 file_pr2_imports <-  full_path("pr2_imports_final_16S.xlsx")
 onglets <- list("pr2_main_updated" = pr2_main_updated,
                 "pr2_sequences_updated" = pr2_sequence_updated
                 )
 rio::export(onglets, file_pr2_imports, 
            firstActiveRow = 2, 
            firstActiveCol = 2,
            zoom = 80 
             )