source(here::here("R",'PR2_init.R'), echo=FALSE)
source(here::here("R",'PR2_read_google_quick.R'), echo=FALSE)
<- pr2 %>%
pr2_16S filter(gene == "16S_rRNA",
== "plastid")
organelle <- pr2 %>%
pr2_no_species filter(is.na(species))
PR2 version 5.0.0
Cilates L. Santoferrara et al.
Init
Set up the files
= c("Eukaryota")
target_group = "domain"
target_level
<- "G - 16S_plastid"
dir_pr2_update
$editor <- "R. Dorrell."
pr2.env
<- function(file_name){here::here("5.0",dir_pr2_update , file_name)}
full_path
# create the directory for taxonomy output
dir.create(full_path("taxo"))
Save current state of PR2 for group
<- pr2_16S_active %>%
pr2_16S_target filter(!!as.symbol(target_level) %in% target_group) %>%
arrange(across(any_of(pr2.env$taxo_levels[[9]])))
::export(pr2_16S_target,
riofull_path("pr2_16S_plastid_2023-02-23.xlsx"),
firstActiveRow = 2,
firstActiveCol = 2,
zoom = 80)
Read the original data and reformat
Read the data
- Number of sequences = 646
= full_path("Dorell_16S_plastid_trimmed_alignment.xlsx")
file_pr2_update_excel
<- rio::import(file_pr2_update_excel, sheet = "sequences", guess_max=200000, na=c("", "-")) %>%
pr2_update mutate(species = str_c(genus, species_epithet, sep="_")) %>%
select(genbank_accession, genus, species, sequence)
str_c("Number of sequences : ", nrow(pr2_update))
Add to PR2 missing sequences from Genbank
Number of new sequences = 163 These sequences are from MMETSP and will not be included
Run the script
script_genbank_xml.R
on serverRun second part
PR2-update-GenBank.R
<- filter(pr2_update, !(genbank_accession %in% pr2$genbank_accession))
pr2_new
# Updated sequences that are not present in PR2
pr2_new
# Use this data frame to download new sequences and then restart from beginning
<- pr2_new %>%
pr2_new pull(genbank_accession)
saveRDS(pr2_new, full_path("accessions_new_16S.rds"))
Compare with sequences in PR2
Only keep species that have accession in PR2 and do not have species : 254
Sequences duplicated (e.g. with and without introns): 0
# Only keep sequences that are in pr2
<- pr2_update %>%
pr2_update_final filter((genbank_accession %in% pr2_no_species$genbank_accession))
pr2_update_final
# Sequences updated with 2 entries in PR2 (e.g. with and without introns)
left_join(select(pr2_update_final, genbank_accession),
select(pr2, genbank_accession, pr2_accession)) %>%
count(genbank_accession) %>%
filter(n > 1)
Cehck that all species are present in PR2
- Add if necessary using an Excel file
<- pr2_update_final %>%
pr2_species_missing filter(!(species %in% pr2_taxo$species)) %>%
select(species) %>%
arrange(species) %>%
distinct()
# rio::export(pr2_species_missing, full_path("pr2_species_missing_02.xlsx"))
Update of table pr2_main and pr2_sequence
Sequences that need updating
<- pr2_update_final %>%
pr2_update_final select(genbank_accession, species, sequence) %>%
::rename(species_new = species) %>%
dplyrleft_join(select(pr2, pr2_accession, genbank_accession,
gene, organelle, species, edited_version, edited_by, removed_version))
Add fields
<- pr2_update_final %>%
pr2_main_updated select (pr2_accession,
species_8 = species_new,
species_9 = species_new,
species_old = species,
gene,
organelle,
edited_version,
edited_by,%>%
removed_version) mutate(edited_version = str_c(pr2.env$version,"; ", replace_na(edited_version, "")),
edited_by = str_c("R. Dorrell; ", replace_na(edited_by, "")) )
<- pr2_update_final %>%
pr2_sequence_updated select (pr2_accession, sequence) %>%
mutate(sequence_length = str_length(sequence))
$sequence_hash = purrr::map_chr(pr2_sequence_updated$sequence,digest::sha1) pr2_sequence_updated
Save everything to an Excel file
<- full_path("pr2_imports_final_16S.xlsx")
file_pr2_imports <- list("pr2_main_updated" = pr2_main_updated,
onglets "pr2_sequences_updated" = pr2_sequence_updated
)::export(onglets, file_pr2_imports,
riofirstActiveRow = 2,
firstActiveCol = 2,
zoom = 80
)
Comments
16S Plastid database from R. Dorrell