source(here::here("R",'PR2_init.R'), echo=FALSE)
source(here::here("R",'PR2_read_google_quick.R'), echo=FALSE)
PR2 version 5.0.0
Adding GenBank sequences
from Excel file with taxonomy
Aim
This script is only to get new sequences that are in Excel file with taxonomy.
Sequences are put in directory C - New GenBank sequences
.
Init
Set up the files
$editor <- "D. Vaulot"
pr2.env
<- function(file_name){here::here("5.0", "C - New GenBank sequences", file_name)}
full_path
# create the directory for taxonomy output
dir.create(full_path(""), showWarnings = FALSE)
Read the original data and reformat
Read the data
- Number of sequences = 903
= full_path("PR2 new sequences_5.0.xlsx")
file_pr2_update_excel
<- import(file_pr2_update_excel, guess_max=200000, na=c("", "-"))
pr2_update
str_c("Number of sequences : ", nrow(pr2_update))
Add to PR2 missing sequences from Genbank
Run the script
script_genbank_xml.R
on server or locallyRun second part
PR2-update-GenBank-template.qmd
46 new sequences added to PR2
<- filter(pr2_update, !(genbank_accession %in% pr2$genbank_accession))
pr2_new
# Updated sequences that are not present in PR2
pr2_new
# Use this data frame to download new sequences and then restart from beginning
<- pr2_new %>%
pr2_new pull(genbank_accession)
saveRDS(pr2_new, full_path("accessions_new_genbank.rds"))
Remove from update sequences that are not in PR2
This is done once addition of new GenBank has been done
- Number of sequences really updated: 893
<- filter(pr2_update, (genbank_accession %in% pr2$genbank_accession))
pr2_update
str_c("Number of sequences : ", nrow(pr2_update))
Add taxa that are not yet in pr2_taxonomy
New taxa: 71
<- pr2_update %>%
pr2_taxo_updated filter(!(species %in% pr2_taxo$species)) %>%
group_by_at(c(pr2.env$taxo_levels[[pr2.env$taxo_levels_number]], "edited_by")) %>%
count()
<- pr2_taxo_updated %>%
pr2_taxo_updated mutate(taxo_edited_version = pr2.env$version,
taxo_edited_by = edited_by)
::export(pr2_taxo_updated,
riofull_path("pr2_taxo_added.xlsx"),
firstActiveRow = 2,
firstActiveCo = 9,
zoom = 80)
Reload taxonomy and check
<- db_info("pr2_google")
pr2_db <- db_connect(pr2_db)
pr2_db_con
<- tbl(pr2_db_con, "pr2_taxonomy") %>%
pr2_taxo collect() %>%
filter(is.na(taxo_removed_version))
db_disconnect(pr2_db_con)
pr2_taxo_check(pr2_taxo, pr2.env$taxo_levels[[pr2.env$taxo_levels_number]], full_path("taxo"))
Update of table pr2_main
Number of sequences annotated: 796
Add fields
<- pr2_update %>%
pr2_main_updated select(genbank_accession, species_new = species, edited_by_new = edited_by, start_edited = start, end_edited = end, gene_edited = gene) %>%
left_join(select(pr2_main, pr2_accession, genbank_accession, gene, start, end, species, edited_version, edited_by)) %>%
select (pr2_accession,
species_old = species,
species = species_new,
gene, gene_edited,
start, end,
start_edited, end_edited,
edited_version,
edited_by,%>%
edited_by_new) mutate(edited_version = str_c(pr2.env$version,"; ", replace_na(edited_version, "")) ) %>%
filter((species != species_old) | is.na(species_old))
Save everything to an Excel file
<- full_path("pr2_imports_final_3.xlsx")
file_pr2_imports <- list("pr2_main_updated" = pr2_main_updated
onglets # "pr2_sequences_updated" = pr2_sequence_updated
)::export(onglets, file_pr2_imports,
riofirstActiveRow = 2,
firstActiveCol = 2,
zoom = 80
)