source(here::here("R",'PR2_init.R'), echo=FALSE)
source(here::here("R",'PR2_read_google_quick.R'), echo=FALSE)
# In the case of Chrysophyceae both 18S and 16S genes are in the file so no need to filter
# pr2_18S <- pr2 %>%
# filter(gene == "18S_rRNA")
#
# pr2_18S_active <- pr2_active %>%
# filter(gene == "18S_rRNA")
# Both 18S and 16S genes are kept for this group, so the "18S" objects are
# plain aliases of the unfiltered tables (see the commented-out filter above).
pr2_18S <- pr2
pr2_18S_active <- pr2_active
PR2 version 5.0.0
Chrysophyceae by Pavel Škaloud
Init
Set up the files
# Taxonomic group handled by this update and the taxonomy column it is
# matched against in the filters below
target_group <- c("Chrysophyceae")
target_level <- "class"

# Sub-directory (below "5.0") that holds the files of this update
dir_pr2_update <- "I - Chrysophyceae"

# Editor name, prepended later to the edited_by / taxo_edited_by fields
pr2.env$editor <- "P. Škaloud"

# Return the full path of `file_name` inside the update directory
full_path <- function(file_name) {
  here::here("5.0", dir_pr2_update, file_name)
}

# create the directory for taxonomy output
# (guarded so that re-running the script does not warn about an existing dir)
if (!dir.exists(full_path("taxo"))) {
  dir.create(full_path("taxo"))
}
Save current state of PR2 for group
# Active PR2 sequences whose `target_level` column matches `target_group`,
# sorted by the taxonomy columns listed in pr2.env$taxo_levels[[9]]
pr2_18S_target <- pr2_18S_active %>%
  filter(!!as.symbol(target_level) %in% target_group) %>%
  arrange(across(any_of(pr2.env$taxo_levels[[9]])))

# Snapshot of the group before the update is applied
rio::export(
  pr2_18S_target,
  full_path("pr2_Chrysophyceae_v_5.0_2023-03-15.xlsx"),
  firstActiveRow = 2,
  firstActiveCol = 2,
  zoom = 80
)
Read the original data and reformat
Read the data
- Number of sequences = 2258
# Excel file provided by the editor with the updated annotations
file_pr2_update_excel <- full_path("pr2_Chrysophyceae_Synurophyceae_2022-10-02_updated_edited.xlsx")

# Read the "sequences" sheet ("" and "-" are treated as NA) and keep only
# the rows where the `not_used` column is empty
pr2_update <- rio::import(
  file_pr2_update_excel,
  sheet = "sequences",
  guess_max = 200000,
  na = c("", "-")
) %>%
  filter(is.na(not_used))

str_c("Number of sequences : ", nrow(pr2_update))
Add to PR2 missing sequences from Genbank
Run the script
script_genbank_xml.R
on server or locally
Run second part
PR2-update-GenBank-template.qmd
# Updated sequences that are not present in PR2
pr2_new <- filter(pr2_update, !(genbank_accession %in% pr2_18S$genbank_accession))

pr2_new

# Use this data frame to download new sequences and then restart from beginning
# Only the accession numbers are saved (character vector, not a data frame)
pr2_new <- pr2_new %>%
  pull(genbank_accession)

saveRDS(pr2_new, full_path("accessions_new_chrysophytes.rds"))
- 18 new sequences downloaded from GenBank
Compare with sequences in PR2
Sequences in target group in PR2 that are active: 2096
Sequences in target group in PR2 that need update: 2258
Sequences in update that are not active in PR2: 164 (these correspond to sequences with no species_9 field)
Sequences in target group in PR2 that are not updated: 2 (New sequences from R. Dorrell)
Sequences duplicated (e.g. with and without introns): 0
# Sequences of target group in pr2
pr2_18S_target

# Sequences of PR2 that need update
filter(pr2_18S, (genbank_accession %in% pr2_update$genbank_accession))

# Updated sequences that are not active in PR2
filter(pr2_update, !(genbank_accession %in% pr2_18S_active$genbank_accession))

# Updated sequences that are not present in PR2
filter(pr2_update, !(genbank_accession %in% pr2_18S$genbank_accession))

# Sequences from target group in PR2 that are not in update
pr2_target_not_updated <- filter(
  pr2_18S_active,
  (!!as.symbol(target_level) %in% target_group) &
    !(genbank_accession %in% pr2_update$genbank_accession)
)

pr2_target_not_updated

# rio::export(pr2_target_not_updated, full_path("pr2_ciliates_not_updated.xlsx"))

# Sequences updated with 2 entries in PR2 (e.g. with and without introns)
# genbank_accession is the only column shared by the two selections, so it is
# given explicitly as the join key (same result, no "Joining with" message)
left_join(
  select(pr2_update, genbank_accession),
  select(pr2_18S, genbank_accession, pr2_accession),
  by = "genbank_accession"
) %>%
  count(genbank_accession) %>%
  filter(n > 1)
Update pr2_taxonomy
Build and check
- Taxa to be updated: 428
- Taxa to be added: 46
- Taxa total: 474
# One row per distinct taxonomy path found in the update, with its count (n)
pr2_taxo_updated <- pr2_update %>%
  group_by_at(pr2.env$taxo_levels[[pr2.env$taxo_levels_number]]) %>%
  count()

# Project helper defined outside this file; called here with the taxonomy
# table, the taxonomy level names and the "taxo" output directory
pr2_taxo_check(
  pr2_taxo_updated,
  pr2.env$taxo_levels[[pr2.env$taxo_levels_number]],
  full_path("taxo")
)

# Current PR2 taxonomy entries of the target group
# (%in% used as in the other target_group filters of this script)
pr2_taxo_raw_targeted <- pr2_taxo_raw %>%
  filter(class %in% target_group)

# Put new and old taxonomy side by side, matched on species:
#  1. suffix every updated column with "_new", keeping `species` as join key
#  2. join the existing taxonomy, then suffix its taxonomy levels with "_old"
#  3. strip "_new" so that the updated values carry the canonical names
#  4. prepend version / editor to the edit-tracking fields
pr2_taxo_updated <- pr2_taxo_updated %>%
  rename_all(~ str_c(., "_new")) %>%
  dplyr::rename(species = species_new) %>%
  left_join(pr2_taxo_raw_targeted) %>%
  rename_at(pr2.env$taxo_levels[[pr2.env$taxo_levels_number]], ~ str_c(., "_old")) %>%
  rename_all(~ str_replace(., "_new", "")) %>%
  dplyr::rename(species = species_old) %>%
  mutate(
    taxo_edited_version = str_c(pr2.env$version, "; ", replace_na(taxo_edited_version, "")),
    taxo_edited_by = str_c(pr2.env$editor, "; ", replace_na(taxo_edited_by, "")),
    taxo_remark = str_c("", replace_na(taxo_remark, ""))
  ) %>%
  relocate(contains("_old"), .before = domain)

# `firstActiveCol` is spelled out in full (it was written `firstActiveCo`),
# matching the argument name used by the other exports in this script
rio::export(
  pr2_taxo_updated,
  full_path("pr2_taxo_added_updated.xlsx"),
  firstActiveRow = 2,
  firstActiveCol = 9,
  zoom = 80
)
Find taxa in PR2 that are not included in the update
- All Chrysophyceae have been removed… (removed_version = 5.0)
# Existing taxa of the target group whose species is absent from the update;
# they are tagged with the current version as their removal version
pr2_taxo_not_updated <- pr2_taxo_raw_targeted %>%
  filter(!(species %in% pr2_taxo_updated$species)) %>%
  select(taxo_id, species) %>%
  mutate(taxo_removed_version = pr2.env$version)

rio::export(pr2_taxo_not_updated, full_path("pr2_taxo_removed.xlsx"))
Update of table pr2_main
Sequences that need updating
# Updated annotation (species renamed to species_new) joined to the current
# pr2_main fields. genbank_accession is the only column shared by the two
# sides, so it is given explicitly as the join key.
pr2_update_final <- pr2_update %>%
  select(any_of(c("genbank_accession", "species", "reference_sequence"))) %>%
  dplyr::rename(species_new = species) %>%
  left_join(
    select(pr2_main, pr2_accession, genbank_accession, species, edited_version, edited_by),
    by = "genbank_accession"
  )
Sequences without species name or with different species
- Sequences added: 160
- Sequences updated: 1577
- Total 1737
# Keep sequences whose species differs from the updated one, or that have no
# species in pr2_main
pr2_main_updated <- pr2_update_final %>%
  filter((species != species_new) | is.na(species))

pr2_main_updated

glue::glue("Number of updated sequences {nrow(filter(pr2_main_updated, !is.na(species)))}")
glue::glue("Number of new sequences {nrow(filter(pr2_main_updated, is.na(species)))}")
Add fields
# Keep the previous species as species_old, promote species_new to species and
# prepend the current version / editor to the edit-tracking fields.
# reference_sequence is selected with any_of() because it was already treated
# as optional when pr2_update_final was built.
pr2_main_updated <- pr2_main_updated %>%
  select(
    pr2_accession,
    species_old = species,
    species = species_new,
    edited_version,
    edited_by,
    any_of("reference_sequence")
  ) %>%
  mutate(
    edited_version = str_c(pr2.env$version, "; ", replace_na(edited_version, "")),
    edited_by = str_c(pr2.env$editor, "; ", replace_na(edited_by, ""))
  )
Save everything to an Excel file
file_pr2_imports <- full_path("pr2_imports_final.xlsx")

# Named list: each element is written by rio::export() as one sheet
# ("onglets" = tabs) of the workbook
onglets <- list(
  "pr2_main_updated" = pr2_main_updated
  # "pr2_sequences_updated" = pr2_sequence_updated
)

rio::export(
  onglets,
  file_pr2_imports,
  firstActiveRow = 2,
  firstActiveCol = 2,
  zoom = 80
)
Comments
2022-10-02 Pavel
Hi Daniel, OK, thank you for the clarification.
Ad 1. I updated the file to check plastid sequences - please find it attached. As there are a few sequences with known identity in most of them I am unable to propose even the order affiliation. So in the most cases I just updated the entries they belong to Chrysophyceae.
Ad 2. Yes, I know they should be kept in PR2, but with their taxonomy updated. I am sorry but I have no information concerning their assignment, I just know they belong to different classes. I proposed their assignment in the remark column, so could you please update their taxonomy accordingly? I am not sure I will write properly the names. The sequences should belong to classes Labyrinthulea, Bacillariophyceae, Synchromophyceae and Haptophyta.
Finally, I forgot to mention I colored the plastid updated sequences by green in the first column - just to help you to quickly find them.
2022-09-28 Pavel
Dear Daniel, please find attached the updated Chrysophyceae table. I formatted the table according to the PR2 manual, so I hope I got the formatting rules correctly. As you can see, I included several updates according to my phylogenetic analyses and several published papers. I included order and family names and in those lineages which do not fit any order level I tried to accommodate previously published clade names (in particular, from Scoble et Cavalier-Smith 2014 - http://dx.doi.org/10.1016/j.ejop.2014.08.001 and Charvet et al. 2012 - http://dx.doi.org/10.1007/s00300-011-1118-7 papers).
However, I am open to any other naming of these clades.