source(here::here("R",'PR2_init.R'), echo=FALSE)
source(here::here("R",'PR2_read_google_quick.R'), echo=FALSE)
<- pr2 %>%
pr2_18S filter(gene == "18S_rRNA")
<- pr2_active %>%
pr2_18S_active filter(gene == "18S_rRNA")
PR2 version 5.0.0
Cilates L. Santoferrara et al.
Init
Set up the files
= c("Spirotrichea")
target_group = "class"
target_level
<- "D - Ciliates"
dir_pr2_update
$editor <- "L. Santoferrara et al."
pr2.env
<- function(file_name){here::here("5.0",dir_pr2_update , file_name)}
full_path
# create the directory for taxonomy output
dir.create(full_path("taxo"))
Save current state of PR2 for group
<- pr2_18S_active %>%
pr2_18S_target filter(!!as.symbol(target_level) %in% target_group) %>%
arrange(across(any_of(pr2.env$taxo_levels[[9]])))
::export(pr2_18S_target,
riofull_path("pr2_Spirotrichea_v_5.0_2023-02-22.xlsx"),
firstActiveRow = 2,
firstActiveCol = 2,
zoom = 80)
Read the original data and reformat
Read the data
- Number of sequences = 3918
# File has been updated by interactions with Luciana
= full_path("pr2_Spirotrichea_2022_09_11_LS-MG_2023_02_21.xlsx")
file_pr2_update_excel
<- import(file_pr2_update_excel, sheet = "sequences", guess_max=200000, na=c("", "-"))
pr2_update
str_c("Number of sequences : ", nrow(pr2_update))
Add to PR2 missing sequences from Genbank
Run the script
script_genbank_xml.R
on serverRun second part
PR2-update-GenBank.R
<- filter(pr2_update, !(genbank_accession %in% pr2_18S$genbank_accession))
pr2_new
# Updated sequences that are not present in PR2
pr2_new
# Use this data frame to download new sequences and then restart from beginning
<- pr2_new %>%
pr2_new pull(genbank_accession)
saveRDS(pr2_new, full_path("accessions_new_ciliates.rds"))
Compare with sequences in PR2
Sequences in target group in PR2 that are active: 3877
Sequences in target group in PR2 that need update: 3918
Sequences in update that are not active in PR2: 41 (this corresponds to sequence which no species_9 field)
Sequences in target group in PR2 that are not updated: 0
Sequences duplicated (e.g. with and without introns): 1
# Sequences of target group in pr2
pr2_18S_target
# Sequences of PR2 that need update
filter(pr2_18S, (genbank_accession %in% pr2_update$genbank_accession))
# Updated sequences that are not active in PR2
filter(pr2_update, !(genbank_accession %in% pr2_18S_active$genbank_accession))
# Updated sequences that are not present in PR2
filter(pr2_update, !(genbank_accession %in% pr2_18S$genbank_accession))
# Sequences from target group in PR2 that are not in update
<- filter(pr2_18S_active, (!!as.symbol(target_level) %in% target_group) &
pr2_target_not_updated !(genbank_accession %in% pr2_update$genbank_accession))
pr2_target_not_updated
# rio::export(pr2_target_not_updated, full_path("pr2_ciliates_not_updated.xlsx"))
# Sequences updated with 2 entries in PR2 (e.g. with and without introns)
left_join(select(pr2_update, genbank_accession),
select(pr2_18S, genbank_accession, pr2_accession)) %>%
count(genbank_accession) %>%
filter(n > 1)
Update pr2_taxonomy
Build and check
- Taxa to be updated: 428
- Taxa do be added: 46
- Taxa total: 474
<- pr2_update %>%
pr2_taxo_updated group_by_at(pr2.env$taxo_levels[[pr2.env$taxo_levels_number]]) %>%
count()
pr2_taxo_check(pr2_taxo_updated, pr2.env$taxo_levels[[pr2.env$taxo_levels_number]], full_path("taxo"))
<- pr2_taxo_raw %>%
pr2_taxo_raw_targeted filter(class_9 == target_group)
<- pr2_taxo_updated %>%
pr2_taxo_updated rename_all(~ str_c(.,"_9_new" )) %>%
::rename(species_9 = species_9_new) %>%
dplyrleft_join(pr2_taxo_raw_targeted) %>%
rename_at(pr2.env$taxo_levels_9, ~ str_c(.,"_old" )) %>%
rename_all( ~ str_replace(.,"_9_new", "_9" )) %>%
::rename(species_9 = species_9_old) %>%
dplyrmutate(species_8 = species_9,
genus_8 = genus_9,
family_8 = family_9,
order_8 = order_9,
taxo_edited_version = str_c(pr2.env$version,"; ", replace_na(taxo_edited_version, "")),
taxo_edited_by = str_c("L. Santoferrara; ", replace_na(taxo_edited_by, "")),
taxo_remark = str_c("5.0: Taxo updated to be compatible EukRef; ", replace_na(taxo_remark, ""))) %>%
relocate (contains("_9_old"), .before = domain_9)
::export(pr2_taxo_updated,
riofull_path("pr2_taxo_added_updated.xlsx"),
firstActiveRow = 2,
firstActiveCo = 9,
zoom = 80)
Find taxa in PR2 that are not included in the update
- These taxa should be removed
- Taxa to be removed: 68
<- pr2_taxo_raw_targeted %>%
pr2_taxo_not_updated filter(!(species_9 %in% pr2_taxo_updated$species_9)) %>%
select(taxo_id, species_9) %>%
mutate(taxo_removed_version = pr2.env$version)
export(pr2_taxo_not_updated , full_path("pr2_taxo_removed.xlsx"))
Update of table pr2_main
Sequences that need updating
<- pr2_update %>%
pr2_update_final select(genbank_accession, species) %>%
::rename(species_new = species) %>%
dplyrleft_join(select(pr2_main, pr2_accession, genbank_accession, species, edited_version, edited_by))
Sequences without species name or with different species
- Sequences added: 38
- Sequences updated: 1988
<- pr2_update_final %>%
pr2_main_updated filter((species != species_new)|is.na(species))
pr2_main_updated
::glue("Number of updated sequences {nrow(filter(pr2_main_updated, !is.na(species)))}")
glue::glue("Number of new sequences {nrow(filter(pr2_main_updated, is.na(species)))}") glue
Add fields
<- pr2_main_updated %>%
pr2_main_updated select (pr2_accession,
species_8 = species_new,
species_9 = species_new,
species_old = species,
edited_version,%>%
edited_by) mutate(edited_version = str_c(pr2.env$version,"; ", replace_na(edited_version, "")),
edited_by = str_c("L. Santoferrara; ", replace_na(edited_by, "")) )
Save everything to an Excel file
<- full_path("pr2_imports_final.xlsx")
file_pr2_imports <- list("pr2_main_updated" = pr2_main_updated
onglets # "pr2_sequences_updated" = pr2_sequence_updated
)::export(onglets, file_pr2_imports,
riofirstActiveRow = 2,
firstActiveCol = 2,
zoom = 80
)
Comments
We only made changes (in yellow) in the “sequences” tab of the attached file.