library(knitr)
library(rmdformats)
library(kableExtra)
knitr::opts_chunk$set(fig.width=8,
fig.height=6,
eval=FALSE,
cache=TRUE,
echo=TRUE,
prompt=FALSE,
tidy=TRUE,
comment=NA,
message=FALSE,
warning=FALSE)
opts_knit$set(width=90)
options(max.print="300")
1 Goal
Integration of DinoRef into PR2
Ref : Mordret, S., Piredda, R., Vaulot, D., Montresor, M., Kooistra, W.H.C.F. & Sarno, D. 2018.
2 Initialize
2.1 Load libraries and functions
source("C:/Users/vaulot/Google Drive/Scripts/R library/dv_function_pr2.R")
library(dada2)
2.2 Initialize and read pr2
pr2_main <- get_query(db_pr2, "select * from pr2_main")
pr2_main <- pr2_main %>% filter(is.na(removed_version))
pr2_seq <- get_query(db_pr2, "select * from pr2_sequences")
pr2_taxo <- get_query(db_pr2, "select * from taxo")
pr2_taxo <- pr2_taxo %>% filter(is.na(taxo_removed_version))
pr2_metadata <- get_query(db_pr2, "select * from pr2_metadata")
# Join the tables and keep only sequences that are not removed
pr2 <- left_join(pr2_main, pr2_taxo, by = c(species = "species"))
pr2 <- left_join(pr2, pr2_seq)
pr2 <- left_join(pr2, pr2_metadata)
2.3 Define version and annotators
pr2_version = "4.9.0" # Note that this is now a string
version_directory <- paste0("C:/Data Biomol/RNA/_PR2/versions/", pr2_version,
"/")
pr2_editor = "Mordret S."
3 Read the data
Also define a number of file names to be used latter
setwd("C:/Data Biomol/RNA/_PR2/updates/2018 Dinoref")
file_pr2_update <- "C:/Data Biomol/RNA/_PR2/updates/2018 Dinoref/Dinoref version 1.0.xlsx"
dir_pr2_update <- dirname(file_pr2_update)
file_pr2_update_taxo <- paste0(dir_pr2_update, "/taxo.txt")
file_pr2_update_new_features <- paste0(dir_pr2_update, "/features_new_sequences.txt")
file_pr2_not_updated <- paste0(dir_pr2_update, "/pr2_sequences_not_updated.txt")
file_pr2_update_new <- paste0(dir_pr2_update, "/pr2_update_new.txt")
# Read the Excel file - Need to put guess_max at the max
pr2_update <- read_excel(file_pr2_update, sheet = "pr2_update", col_names = TRUE,
guess_max = 2e+05)
4 Construct taxonomy and compare to existing taxonomy
4.1 Import taxonomy into Excel (taxo.txt file)
- Check manually in the database what needs to be updated
# Construct the taxonomy file
pr2_update_taxo <- pr2_update %>% group_by_(.dots = taxo_levels) %>% summarise(sequence_number = n())
# Check it for duplicate names and duplicate parents
pr2_taxo_check(select(pr2_update_taxo, kingdom:species), dir_pr2_update)
# Compare to existing taxonomy
pr2_update_taxo_compared <- left_join(pr2_update_taxo, pr2_taxo, by = c(species = "species")) %>%
rename_(kingdom = "kingdom.x", supergroup = "supergroup.x", division = "division.x",
class = "class.x", order = "order.x", family = "family.x", genus = "genus.x") %>%
arrange(species)
# Write to a file to examine with Excel
write.table(pr2_update_taxo_compared, file = file_pr2_update_taxo, sep = "\t",
quote = FALSE, row.names = FALSE, na = "")
4.2 Update the entries that are already in the taxo table
- 315 species
- These entries have stuff in the taxo.y columns
- Note that the version and the editors have to be changed in the dv_function_pr2 file
- Note that the function update other species belonging to the same genera so that they have the same upstream taxonomy
pr2_taxo_update <- pr2_update_taxo_compared %>% filter(!(is.na(kingdom.y))) %>%
select(kingdom:species)
pr2_taxo_update <- pr2_taxo_update(pr2_taxo_update, method = "update")
# Check which lines did not work
filter(pr2_taxo_update, query_result == 0)
4.3 Append the new entries
- 141 species
- Other species from the same genera are also updated
pr2_taxo_add <- pr2_update_taxo_compared %>% filter(is.na(kingdom.y)) %>% select(kingdom:species)
pr2_taxo_add <- function(pr2_taxo_add, method = "add")
# Check which lines did not work
filter(pr2_taxo_add, query_result == 0)
4.4 Recheck the whole taxonomy table
pr2_taxo <- get_query(db_pr2, "select * from taxo")
pr2_taxo <- pr2_taxo %>% filter(is.na(taxo_removed_version))
pr2_taxo_check(select(pr2_taxo, kingdom:species), dir_pr2_update)
5 Old sequences
- 1444 sequences
# Extract lines with accession number already in PR2, then one need just to
# update the species, nothing more
pr2_update_old <- pr2_update %>% filter(genbank_accession %in% pr2$genbank_accession)
pr2_update_old <- pr2_reassign(pr2_update_old)
6 New sequences
6.1 Extract lines with new accession number
- 227 sequences
pr2_update_new <- pr2_update %>% filter(!(genbank_accession %in% pr2$genbank_accession))
dv_save(pr2_update_new, "pr2_update_new.txt")
6.2 Download the sequences and parse them
- 229 sequences Two sequences had 2 culture_collection fields that caused metadata line to be duplicated
pr2_update_new_metadata <- genbank_download_parse(pr2_update_new$genbank_accession,
genbank_directory)
dv_save(pr2_update_new_metadata, "pr2_update_new_metadata.txt")
knitr::kable(head(pr2_update_new_metadata, 3))
# Save the original metadata that contains 2 bad lines.. in case
pr2_update_new_metadata_bad <- pr2_update_new_metadata
# Suppress the two bad lines that have been repeated
pr2_update_new_metadata <- pr2_update_new_metadata[-c(63, 83), ]
# There are two bad columns in the metabadata table due to duplication of
# the lines
pr2_update_new_metadata <- pr2_update_new_metadata %>% select(-contains("structure"))
dv_save(pr2_update_new_metadata, "pr2_update_new_metadata.txt")
6.3 Extract features
Features that can be used to determine the start and end of the 18S - This is stored in a file.
pr2_update_new_features <- genbank_features(pr2_update_new$genbank_accession,
genbank_directory)
write.table(pr2_update_new_features, file = file_pr2_update_new_features, sep = "\t",
quote = FALSE, row.names = FALSE, na = "")
knitr::kable(head(pr2_update_new_features, 3))
6.4 Update sequence_start and sequence_end from feature table
Enter the sequence_start and sequence_end into the pr2_update_new Excel sheet and reload Note : 2 sequences have been removed because they correspond to 23S and not 18S - > 225 sequences
pr2_update_new <- read_excel(file_pr2_update, sheet = "pr2_update_new", col_names = TRUE,
guess_max = 2e+05)
6.5 Merge the accession, taxonomy, sequence and metadata
pr2_update_new <- left_join(pr2_update_new, pr2_update_new_metadata)
6.6 Extract sequences and get length and number of ambiguities
14 sequences do not meet the critaria and have too many ambiguities - 211 sequences remaining
# Extract the subsequence if the whole sequence is not to be used
index <- !is.na(pr2_update_new$sequence_start)
pr2_update_new$sequence[index] <- str_sub(pr2_update_new$sequence[index], pr2_update_new$sequence_start[index],
pr2_update_new$sequence_end[index])
pr2_update_new$sequence_length_old <- pr2_update_new$sequence_length
# Update sequence length and ambiguities for all sequences since some may
# have been shortened
pr2_update_new$sequence_length <- str_length(pr2_update_new$sequence)
pr2_update_new$ambiguities <- str_count(pr2_update_new$sequence, pattern = "[NRYSWKMBDHV]")
# Remove sequences that are shorter than minimum lenght (500) and those that
# have too many ambiguities
pr2_update_new_rejected <- pr2_update_new %>% filter((sequence_length < sequence_length_min) |
(ambiguities > ambiguities_max))
dv_save(pr2_update_new_rejected, "pr2_update_new_rejected.txt")
knitr::kable(pr2_update_new_rejected$genbank_accession)
pr2_update_new <- pr2_update_new %>% filter((sequence_length >= sequence_length_min) &
(ambiguities <= ambiguities_max))
6.7 Create PR2 fields
pr2_update_new <- pr2_update_new %>% dplyr::rename(start = sequence_start, end = sequence_end)
pr2_update_new$label <- "U"
pr2_update_new$added_version <- pr2_version
pr2_update_new <- pr2_update_new %>% mutate(pr2_accession = paste0(genbank_accession,
".", start, ".", end, "_", label))
pr2_update_new$reference_sequence = 1
# Unquote the following three lines if Chimera or better write an 'if
# statement' if species = Chimera_XXXX pr2_update_new$removed_version <-
# pr2_update_new$added_version pr2_update_new$chimera <- 1
# pr2_update_new$chimera_remark <- 'Detected by B. Edvardsen by UCHIME or
# long branch/manual'
dv_save(pr2_update_new, "pr2_update_new.txt")
6.8 Create the different tables
pr2_main_update <- select(pr2_update_new, pr2_accession, genbank_accession,
start, end, label, species, reference_sequence, starts_with("added_"), starts_with("removed_"),
starts_with("chimera"))
pr2_sequences_update <- select(pr2_update_new, pr2_accession, sequence, sequence_length,
ambiguities)
pr2_metadata_update <- select(pr2_update_new, genbank_accession, starts_with("gb_"),
starts_with("pr2_"), starts_with("eukref_"))
pr2_metadata_update <- select(pr2_metadata_update, -pr2_accession)
6.9 Write to database
Either the update can be done : * directly (but depends on Internet connection) - write_db * by exporting text file and then reimporting using an Excel sheet. The advantage of the lattest method is that the data can be edited.
dv_save(pr2_main_update, "pr2_main_update.txt")
dv_save(pr2_sequences_update, "pr2_sequences_update.txt")
dv_save(pr2_metadata_update, "pr2_metadata_update.txt")
write_db(db_pr2, table_name = "pr2_main", table_df = pr2_main_update)
write_db(db_pr2, table_name = "pr2_sequences", table_df = pr2_sequences_update)
write_db(db_pr2, table_name = "pr2_metadata", table_df = pr2_metadata_update)
6.10 Add the special metadata features annotated by DinoRef
metadata_dinoref <- read_excel(file_pr2_update, sheet = "Dinoref metadata",
col_names = TRUE, guess_max = 2e+05)
# The next line is due to the fact that the query crashed - they are not to
# be used metadata_dinoref_done <- read_excel(file_pr2_update,
# sheet='Dinoref metadata done', col_names = TRUE, guess_max=200000)
# metadata_dinoref <- left_join(metadata_dinoref,
# select(metadata_dinoref_done, genbank_accession, metadata_remark))
# metadata_dinoref <- metadata_dinoref %>% filter(is.na(metadata_remark))
# Only keep sequences that are in the PR2 database
metadata_dinoref <- metadata_dinoref %>% filter(genbank_accession %in% pr2$genbank_accession)
# Build a query for updating metadata
metadata_dinoref <- metadata_dinoref %>% mutate(query = paste0("UPDATE pr2_metadata",
" SET ", update_field_string("pr2_env_biome", pr2_env_biome, append_comma = TRUE),
update_field_string("pr2_biotic_relationship", pr2_biotic_relationship,
append_comma = TRUE), update_field_string("pr2_specific_host", pr2_specific_host,
append_comma = TRUE), update_field_string("pr2_sample_method", pr2_sample_method,
append_comma = TRUE), update_field_string("gb_date", gb_date, append_comma = TRUE),
update_field_string("gb_authors", gb_authors, append_comma = TRUE), update_field_string("metadata_remark",
"Metadata from DinoRef (Mordret S.)", append_comma = TRUE), update_field_string("gb_journal",
gb_journal, append_comma = FALSE), " WHERE genbank_accession = ", single_quote,
genbank_accession, single_quote))
# Update database
metadata_dinoref$query_result <- execute_query_vector(db_pr2, metadata_dinoref$query)
7 Find the PR2 cultures dinoflagellates sequences that are NOT updated
7.1 Create file to Solenn for re-annotation
pr2_cultures_not_updated <- pr2 %>% filter((class == "Dinophyceae") & (pr2_sample_type !=
"environmental")) %>% filter(!(genbank_accession %in% pr2_update$genbank_accession))
dv_save(pr2_cultures_not_updated, "pr2_cultures_not_updated.txt")
Solenn has resent a file called : Dinoref updates needed version 1.0 solenn.xlsx
7.2 Taxonomy correction
Done manually based on sheet : pr2 taxo not updated
7.3 Sequence correction
Done manually from the sheet
7.4 Add 2 sequences that were missing
# Read the Excel file
pr2_update_2 <- read_excel(file_pr2_update, sheet = "pr2_update_2", col_names = TRUE,
guess_max = 2e+05)
pr2_update_new <- pr2_update_2 %>% filter(!(genbank_accession %in% pr2$genbank_accession))
pr2_update_new_metadata <- genbank_download_parse(pr2_update_new$genbank_accession,
genbank_directory)
knitr::kable(head(pr2_update_new_metadata, 3))
pr2_update_new_features <- genbank_features(pr2_update_new$genbank_accession,
genbank_directory)
write.table(pr2_update_new_features, file = file_pr2_update_new_features, sep = "\t",
quote = FALSE, row.names = FALSE, na = "")
knitr::kable(head(pr2_update_new_features, 3))
pr2_update_new <- read_excel(file_pr2_update, sheet = "pr2_update_2", col_names = TRUE,
guess_max = 2e+05)
pr2_update_new <- left_join(pr2_update_new, pr2_update_new_metadata)
# Extract the subsequence if the whole sequence is not to be used
index <- !is.na(pr2_update_new$sequence_start)
pr2_update_new$sequence[index] <- str_sub(pr2_update_new$sequence[index], pr2_update_new$sequence_start[index],
pr2_update_new$sequence_end[index])
pr2_update_new$sequence_length_old <- pr2_update_new$sequence_length
# Update sequence length and ambiguities for all sequences since some may
# have been shortened
pr2_update_new$sequence_length <- str_length(pr2_update_new$sequence)
pr2_update_new$ambiguities <- str_count(pr2_update_new$sequence, pattern = "[^ATGC]")
# Remove sequences that are shorter than minimum lenght (500) and those that
# have too many ambiguities
pr2_update_new_rejected <- pr2_update_new %>% filter((sequence_length < sequence_length_min) |
(ambiguities > ambiguities_max))
dv_save(pr2_update_new_rejected, "pr2_update_new_rejected.txt")
knitr::kable(pr2_update_new_rejected$genbank_accession)
pr2_update_new <- pr2_update_new %>% filter((sequence_length >= sequence_length_min) &
(ambiguities <= ambiguities_max))
pr2_update_new <- pr2_update_new %>% dplyr::rename(start = sequence_start, end = sequence_end)
pr2_update_new$label <- "U"
pr2_update_new$added_version <- pr2_version
pr2_update_new <- pr2_update_new %>% mutate(pr2_accession = paste0(genbank_accession,
".", start, ".", end, "_", label))
pr2_update_new$reference_sequence = 1
pr2_main_update <- select(pr2_update_new, pr2_accession, genbank_accession,
start, end, label, species, reference_sequence, starts_with("added_"), starts_with("removed_"),
starts_with("chimera"))
pr2_sequences_update <- select(pr2_update_new, pr2_accession, sequence, sequence_length,
ambiguities)
pr2_metadata_update <- select(pr2_update_new, genbank_accession, starts_with("gb_"),
starts_with("pr2_"), starts_with("eukref_"))
pr2_metadata_update <- select(pr2_metadata_update, -pr2_accession)
dv_save(pr2_main_update, "pr2_main_update_2.txt")
dv_save(pr2_sequences_update, "pr2_sequences_update_2.txt")
dv_save(pr2_metadata_update, "pr2_metadata_update_2.txt")
write_db(db_pr2, table_name = "pr2_main", table_df = pr2_main_update)
write_db(db_pr2, table_name = "pr2_sequences", table_df = pr2_sequences_update)
write_db(db_pr2, table_name = "pr2_metadata", table_df = pr2_metadata_update)
8 Reannotate some PR2 environmental sequences dinoflagellates sequences using dada2
Note : Only sequences with no ambiguities have been reannotated. Also I only used the Species assingment which implies 100% similarity with the reference sequences. So only 31 sequences have been annotated this way.
8.1 Extract the sequences and create dataframe compatible with dada2
For dada2 - row.names = Accession - $sequence - $abundance
# Remove any sequence which has ambiguities
pr2_env_not_updated <- pr2 %>% filter((class == "Dinophyceae") & (pr2_sample_type ==
"environmental")) %>% filter(!(genbank_accession %in% pr2_update$genbank_accession)) %>%
filter(str_count(sequence, pattern = "[^ATGC]") == 0)
dv_save(pr2_env_not_updated, "pr2_env_not_updated.txt")
# Create the dada2 file
pr2_env_not_updated_dada2 <- pr2_env_not_updated %>% select(sequence) %>% mutate(abundance = 1)
row.names(pr2_env_not_updated_dada2) <- pr2_env_not_updated$genbank_accession
# pr2_export(pr2_env_not_updated, 'pr2_env_not_updated.fasta.gz',
# file_type='fasta', file_format='fasta_taxo_long')
8.2 Export the Dino reference sequences as Dada2
pr2_dinoref <- pr2 %>% filter((class == "Dinophyceae") & (reference_sequence ==
1) & (ambiguities == 0) & (str_count(species, "_X") == 0))
pr2_export(pr2_dinoref, "pr2_dinoref_dada2.fasta.gz", file_type = "fasta", file_format = "dada2")
pr2_export(pr2_dinoref, "pr2_dinoref_dada2_species.fasta.gz", file_type = "fasta",
file_format = "dada2_species")
8.3 Annotate using dada2 wang taxonomy
There are two ways to proceed : * AssignTaxonomy with a bootstrap value * AssignSpecies which looks for 100% similar sequences I chose the second option that is fail safe… The assignement of 31 sequences has been verified
taxa <- assignTaxonomy(pr2_env_not_updated_dada2, refFasta = "pr2_dinoref_dada2.fasta.gz",
taxLevels = taxo_levels, minBoot = 0, outputBootstraps = TRUE, verbose = TRUE)
dv_save(taxa$tax, "pr2_env_not_updated_taxa.txt")
dv_save(taxa$boot, "pr2_env_not_updated_boot.txt")
species <- assignSpecies(pr2_env_not_updated_dada2, refFasta = "pr2_dinoref_dada2_species.fasta.gz",
allowMultiple = TRUE, tryRC = FALSE, verbose = TRUE)
species_df <- as.data.frame(species)
pr2_env_not_updated <- bind_cols(pr2_env_not_updated, select(species_df, Genus,
Species))
pr2_env_not_updated_assigned <- pr2_env_not_updated %>% filter(!(is.na(Genus)))
dv_save(pr2_env_not_updated_assigned, "pr2_env_not_updated_assigned.txt")
8.4 Transfer the verified taxonomy in the database
pr2_editor <- "Vaulot D."
pr2_update_3 <- read_excel(file_pr2_update, sheet = "pr2_update_3", col_names = TRUE,
guess_max = 2e+05)
pr2_update_3 <- pr2_reassign(pr2_update_3)
9 Export PR2 dinoflagellate sequences that are not part of the Reference dataset.
These two files have been sent to Solenn (20/02/2018) for eventual reannotation based on the reference data set.
pr2_dino_not_updated <- pr2 %>% filter((class == "Dinophyceae") & is.na(reference_sequence))
dv_save(pr2_dino_not_updated, "pr2_dino_not_updated.txt")
pr2_export(pr2_env_not_updated, "pr2_dino_not_updated.fasta.gz", file_type = "fasta",
file_format = "fasta_taxo_long")