source(here::here("R",'PR2_init.R'), echo=FALSE)
source(here::here("R",'PR2_read_google_quick.R'), echo=FALSE)
# In the case of Chrysophyceae both 18S and 16S genes are in the file so no need to filter
# pr2_18S <- pr2 %>%
# filter(gene == "18S_rRNA")
#
# pr2_18S_active <- pr2_active %>%
# filter(gene == "18S_rRNA")
PR2 version 5.0.0
Mitochondria/Nucleomorph sequences from original database
Init
Set up the files
<- "M - Mitochondria"
dir_pr2_update
$editor <- "D. Vaulot"
pr2.env
<- function(file_name){here::here("5.0",dir_pr2_update , file_name)}
full_path
# create the directory for taxonomy output
dir.create(full_path("taxo"))
Read the original data and reformat
Read the data
- mito-SSU 8661
- nucleomorph-SSU 16
= full_path("Euk-ssu-rrna_2012_05_25.txt.gz")
file_pr2_original
<- read_delim(file_pr2_original, guess_max=200000, na=c("", "-")) %>%
pr2_update filter (Domain == "Organelle",
%in% c("mito-SSU", "nucleomorph-SSU"),
SuperGroup !(`ID-Richard` %in% pr2$pr2_accession),
str_length (Sequence) < 2000) %>%
rename(genus = Genus,
species = Species,
pr2_accession = `ID-Richard`,
genbank_accession = EMBL,
sequence = Sequence,
label = TYPE) %>%
mutate(species = str_replace(species, "[+]", "_"),
organelle = case_when( SuperGroup == "mito-SSU" ~ "mitochondrion",
== "nucleomorph-SSU" ~ "nucleomorph")) %>%
SuperGroup distinct(species, .keep_all = TRUE)
str_c("Number of sequences : ", nrow(pr2_update))
%>% count(organelle)
pr2_update
%>% count(species) %>% arrange(desc(n))
pr2_update
hist(str_length(pr2_update$sequence))
::export(pr2_update, full_path("pr2_original_mito_nucleomorph.xlsx")) rio
Add to PR2 missing sequences from Genbank
Run the script
script_genbank_xml.R
on server or locallyRun second part
PR2-update-GenBank-template.qmd
<- pr2_update %>%
pr2_new filter(!(genbank_accession %in% pr2_metadata$genbank_accession))
# Updated sequences that are not present in PR2
pr2_new
# Use this data frame to download new sequences and then restart from beginning
<- pr2_new %>%
pr2_new pull(genbank_accession)
saveRDS(pr2_new, full_path("accessions_new_mito_2.rds"))
- 6517 new sequences downloaded from GenBank (out of 8876 new)
- Only metadata added
Update pr2_taxonomy
- Add new species missing only if genus is already present
- 1478 species added
<- pr2_taxo %>%
pr2_taxo_genus select(domain:genus) %>%
distinct()
<- pr2_update %>%
pr2_species_missing select(species) %>%
distinct() %>%
arrange(species) %>%
left_join(pr2_taxo) %>%
filter(is.na(domain)) %>%
select(species) %>%
mutate(genus = str_extract(species, "[:alpha:]+")) %>%
left_join(pr2_taxo_genus) %>%
filter(!is.na(domain)) %>%
mutate(taxo_edited_version = str_c(pr2.env$version),
taxo_edited_by = "D. Vaulot")
::export(pr2_species_missing, full_path("pr2_taxo_to_add.xlsx")) rio
Update of table pr2_main
Update pr2
<- db_info("pr2_google")
pr2_db
<- db_connect(pr2_db)
pr2_db_con
<- tbl(pr2_db_con, "pr2_main") %>%
pr2_main collect()
<- tbl(pr2_db_con, "pr2_taxonomy") %>%
pr2_taxo filter (is.na(taxo_removed_version)) %>%
collect()
<- tbl(pr2_db_con, "pr2_metadata") %>%
pr2_metadata collect()
db_disconnect(pr2_db_con)
# Check duplicates
%>% count(species) %>% filter(n> 1)
pr2_taxo
%>% count(genbank_accession) %>% filter(n> 1) pr2_metadata
Sequences that are uploaded
- Only keep sequences for which:
- species is in taxonomy table
- genbank is in metadata table
- New sequences: 1898
<- pr2_update %>%
pr2_new filter(species %in% pr2_taxo$species,
%in% pr2_metadata$genbank_accession)
genbank_accession
<- pr2_update %>%
pr2_not_added filter(!(genbank_accession %in% pr2_new$genbank_accession))
::glue("Number of updated sequences {nrow(pr2_new)}") glue
Add fields
<- pr2_new %>%
pr2_new select (pr2_accession,
genbank_accession,
label,
organelle,
species,%>%
sequence) separate_wider_delim(pr2_accession, delim =".", names = c("junk1", "start", "end"), cols_remove = FALSE) %>%
separate_wider_delim(end, delim ="_", names = c("end", "junk2"), cols_remove = TRUE) %>%
mutate(added_version = pr2.env$version,
edited_by = pr2.env$editor,
remark = "Added from original PR2 database",
gene = case_when( str_detect(organelle, "nucleomorph") ~ "18S_rRNA",
str_detect(organelle, "chloroplast|mitochond") ~ "16S_rRNA",
TRUE ~ "18S_rRNA"),
sequence_length = str_length(sequence),
sequence_length = ifelse(sequence_length == 0, NA_integer_, sequence_length),
ambiguities = str_count(sequence, pattern=pr2.env$ambig_regex),
# Next time remove from here because it is now in the script to run on the server
removed_version = case_when((sequence_length < pr2.env$sequence_length_min)|
> pr2.env$sequence_length_max)|
(sequence_length > pr2.env$ambiguities_max)|
(ambiguities str_detect(sequence, pr2.env$sequence_N_repeat) ~ pr2.env$version),
edited_remark = case_when((sequence_length < pr2.env$sequence_length_min) ~ "sequence too short",
> pr2.env$ambiguities_max) ~ "too many ambiguities",
(ambiguities str_detect(sequence, pr2.env$sequence_N_repeat) ~ "2 consecutive N")
)
$sequence_hash = purrr::map_chr(pr2_new$sequence,digest::sha1) pr2_new
Check duplicates
%>%
pr2_new count(genbank_accession) %>%
filter(n > 1)
Get some statistics
ggplot(pr2_new, aes(x=sequence_length)) +
geom_histogram() +
xlim(0,3000)
Final files for uploading
- ” Final number of sequences added: 1898”
- ” Final number of valid sequences added: 1830”
sprintf(" Final number of sequences added: %d", nrow(pr2_new))
sprintf(" Final number of valid sequences added: %d", nrow(filter(pr2_new, is.na(removed_version))))
colnames(pr2_new)
<- pr2_new
pr2_update_new
<- pr2_update_new %>%
pr2_main_new select (pr2_accession, genbank_accession, start,end, label, organelle, gene, species,
added_version, edited_by, removed_version, edited_remark, remark)nrow(pr2_main_new)
<- pr2_update_new %>%
pr2_sequences_new select(pr2_accession, sequence, sequence_length, ambiguities, sequence_hash)
nrow(pr2_sequences_new)
Save everything to an Excel file
<- full_path("pr2_import_final_mito.xlsx")
file_pr2_import <- list("pr2_main_new" = pr2_main_new,
onglets "pr2_sequence_new" = pr2_sequences_new)
::export(onglets, file_pr2_import, zoom = 90, firstRow = TRUE, firstCol = TRUE) rio
Comments
Strategy
Read original file from Richard
Dereplicate identical species
Add metadata that are not present
Add species that are not present
Add sequences only if species and metadata are present
mito-SSU 8661
nucleomorph-SSU 16
6517 new sequences downloaded from GenBank (out of 8876 new)
Final
1478 species added
Final number of sequences added: 1898
Final number of valid sequences added: 1830
6779 entries not added because of missing species in taxonomy table.