library(knitr)
  library(rmdformats)
  library(kableExtra) 

  knitr::opts_chunk$set(fig.width=8, 
                        fig.height=6, 
                        eval=FALSE, 
                        cache=TRUE,
                        echo=TRUE,
                        prompt=FALSE,
                        tidy=TRUE,
                        comment=NA,
                        message=FALSE,
                        warning=FALSE)
  opts_knit$set(width=90)
  options(max.print="300")  

1 Goal

Integration of DinoRef into PR2

Ref : Mordret, S., Piredda, R., Vaulot, D., Montresor, M., Kooistra, W.H.C.F. & Sarno, D. 2018. dinoref : A curated dinoflagellate (Dinophyceae) reference database for the 18S rRNA gene. Mol. Ecol. Resour. in press.

2 Initialize

2.1 Load libraries and functions

source("C:/Users/vaulot/Google Drive/Scripts/R library/dv_function_pr2.R")
library(dada2)

2.2 Initialize and read pr2

pr2_main <- get_query(db_pr2, "select * from pr2_main")
pr2_main <- pr2_main %>% filter(is.na(removed_version))

pr2_seq <- get_query(db_pr2, "select * from pr2_sequences")

pr2_taxo <- get_query(db_pr2, "select * from taxo")
pr2_taxo <- pr2_taxo %>% filter(is.na(taxo_removed_version))
pr2_metadata <- get_query(db_pr2, "select * from pr2_metadata")

# Join the tables and keep only sequences that are not removed

pr2 <- left_join(pr2_main, pr2_taxo, by = c(species = "species"))
pr2 <- left_join(pr2, pr2_seq)
pr2 <- left_join(pr2, pr2_metadata)

2.3 Define version and annotators

pr2_version = "4.9.0"  # Note that this is now a string
version_directory <- paste0("C:/Data Biomol/RNA/_PR2/versions/", pr2_version, 
    "/")
pr2_editor = "Mordret S."

3 Read the data

Also define a number of file names to be used latter

setwd("C:/Data Biomol/RNA/_PR2/updates/2018 Dinoref")
file_pr2_update <- "C:/Data Biomol/RNA/_PR2/updates/2018 Dinoref/Dinoref version 1.0.xlsx"


dir_pr2_update <- dirname(file_pr2_update)
file_pr2_update_taxo <- paste0(dir_pr2_update, "/taxo.txt")
file_pr2_update_new_features <- paste0(dir_pr2_update, "/features_new_sequences.txt")
file_pr2_not_updated <- paste0(dir_pr2_update, "/pr2_sequences_not_updated.txt")
file_pr2_update_new <- paste0(dir_pr2_update, "/pr2_update_new.txt")

# Read the Excel file - Need to put guess_max at the max
pr2_update <- read_excel(file_pr2_update, sheet = "pr2_update", col_names = TRUE, 
    guess_max = 2e+05)

4 Construct taxonomy and compare to existing taxonomy

4.1 Import taxonomy into Excel (taxo.txt file)

  • Check manually in the database what needs to be updated
# Construct the taxonomy file
pr2_update_taxo <- pr2_update %>% group_by_(.dots = taxo_levels) %>% summarise(sequence_number = n())

# Check it for duplicate names and duplicate parents
pr2_taxo_check(select(pr2_update_taxo, kingdom:species), dir_pr2_update)

# Compare to existing taxonomy
pr2_update_taxo_compared <- left_join(pr2_update_taxo, pr2_taxo, by = c(species = "species")) %>% 
    rename_(kingdom = "kingdom.x", supergroup = "supergroup.x", division = "division.x", 
        class = "class.x", order = "order.x", family = "family.x", genus = "genus.x") %>% 
    arrange(species)

# Write to a file to examine with Excel
write.table(pr2_update_taxo_compared, file = file_pr2_update_taxo, sep = "\t", 
    quote = FALSE, row.names = FALSE, na = "")

4.2 Update the entries that are already in the taxo table

  • 315 species
  • These entries have stuff in the taxo.y columns
  • Note that the version and the editors have to be changed in the dv_function_pr2 file
  • Note that the function update other species belonging to the same genera so that they have the same upstream taxonomy
pr2_taxo_update <- pr2_update_taxo_compared %>% filter(!(is.na(kingdom.y))) %>% 
    select(kingdom:species)
pr2_taxo_update <- pr2_taxo_update(pr2_taxo_update, method = "update")

# Check which lines did not work
filter(pr2_taxo_update, query_result == 0)

4.3 Append the new entries

  • 141 species
  • Other species from the same genera are also updated
pr2_taxo_add <- pr2_update_taxo_compared %>% filter(is.na(kingdom.y)) %>% select(kingdom:species)
pr2_taxo_add <- function(pr2_taxo_add, method = "add") 
# Check which lines did not work
filter(pr2_taxo_add, query_result == 0)

4.4 Recheck the whole taxonomy table

pr2_taxo <- get_query(db_pr2, "select * from taxo")
pr2_taxo <- pr2_taxo %>% filter(is.na(taxo_removed_version))
pr2_taxo_check(select(pr2_taxo, kingdom:species), dir_pr2_update)

5 Old sequences

  • 1444 sequences
# Extract lines with accession number already in PR2, then one need just to
# update the species, nothing more
pr2_update_old <- pr2_update %>% filter(genbank_accession %in% pr2$genbank_accession)
pr2_update_old <- pr2_reassign(pr2_update_old)

6 New sequences

6.1 Extract lines with new accession number

  • 227 sequences
pr2_update_new <- pr2_update %>% filter(!(genbank_accession %in% pr2$genbank_accession))
dv_save(pr2_update_new, "pr2_update_new.txt")

6.2 Download the sequences and parse them

  • 229 sequences Two sequences had 2 culture_collection fields that caused metadata line to be duplicated
pr2_update_new_metadata <- genbank_download_parse(pr2_update_new$genbank_accession, 
    genbank_directory)
dv_save(pr2_update_new_metadata, "pr2_update_new_metadata.txt")
knitr::kable(head(pr2_update_new_metadata, 3))

# Save the original metadata that contains 2 bad lines.. in case
pr2_update_new_metadata_bad <- pr2_update_new_metadata
# Suppress the two bad lines that have been repeated
pr2_update_new_metadata <- pr2_update_new_metadata[-c(63, 83), ]

# There are two bad columns in the metabadata table due to duplication of
# the lines
pr2_update_new_metadata <- pr2_update_new_metadata %>% select(-contains("structure"))
dv_save(pr2_update_new_metadata, "pr2_update_new_metadata.txt")

6.3 Extract features

Features that can be used to determine the start and end of the 18S - This is stored in a file.

pr2_update_new_features <- genbank_features(pr2_update_new$genbank_accession, 
    genbank_directory)
write.table(pr2_update_new_features, file = file_pr2_update_new_features, sep = "\t", 
    quote = FALSE, row.names = FALSE, na = "")
knitr::kable(head(pr2_update_new_features, 3))

6.4 Update sequence_start and sequence_end from feature table

Enter the sequence_start and sequence_end into the pr2_update_new Excel sheet and reload Note : 2 sequences have been removed because they correspond to 23S and not 18S - > 225 sequences

pr2_update_new <- read_excel(file_pr2_update, sheet = "pr2_update_new", col_names = TRUE, 
    guess_max = 2e+05)

6.5 Merge the accession, taxonomy, sequence and metadata

pr2_update_new <- left_join(pr2_update_new, pr2_update_new_metadata)

6.6 Extract sequences and get length and number of ambiguities

14 sequences do not meet the critaria and have too many ambiguities - 211 sequences remaining

# Extract the subsequence if the whole sequence is not to be used
index <- !is.na(pr2_update_new$sequence_start)
pr2_update_new$sequence[index] <- str_sub(pr2_update_new$sequence[index], pr2_update_new$sequence_start[index], 
    pr2_update_new$sequence_end[index])
pr2_update_new$sequence_length_old <- pr2_update_new$sequence_length

# Update sequence length and ambiguities for all sequences since some may
# have been shortened
pr2_update_new$sequence_length <- str_length(pr2_update_new$sequence)
pr2_update_new$ambiguities <- str_count(pr2_update_new$sequence, pattern = "[NRYSWKMBDHV]")

# Remove sequences that are shorter than minimum lenght (500) and those that
# have too many ambiguities
pr2_update_new_rejected <- pr2_update_new %>% filter((sequence_length < sequence_length_min) | 
    (ambiguities > ambiguities_max))
dv_save(pr2_update_new_rejected, "pr2_update_new_rejected.txt")
knitr::kable(pr2_update_new_rejected$genbank_accession)

pr2_update_new <- pr2_update_new %>% filter((sequence_length >= sequence_length_min) & 
    (ambiguities <= ambiguities_max))

6.7 Create PR2 fields

pr2_update_new <- pr2_update_new %>% dplyr::rename(start = sequence_start, end = sequence_end)
pr2_update_new$label <- "U"
pr2_update_new$added_version <- pr2_version
pr2_update_new <- pr2_update_new %>% mutate(pr2_accession = paste0(genbank_accession, 
    ".", start, ".", end, "_", label))
pr2_update_new$reference_sequence = 1

# Unquote the following three lines if Chimera or better write an 'if
# statement' if species = Chimera_XXXX pr2_update_new$removed_version <-
# pr2_update_new$added_version pr2_update_new$chimera <- 1
# pr2_update_new$chimera_remark <- 'Detected by B. Edvardsen by UCHIME or
# long branch/manual'

dv_save(pr2_update_new, "pr2_update_new.txt")

6.8 Create the different tables

pr2_main_update <- select(pr2_update_new, pr2_accession, genbank_accession, 
    start, end, label, species, reference_sequence, starts_with("added_"), starts_with("removed_"), 
    starts_with("chimera"))
pr2_sequences_update <- select(pr2_update_new, pr2_accession, sequence, sequence_length, 
    ambiguities)
pr2_metadata_update <- select(pr2_update_new, genbank_accession, starts_with("gb_"), 
    starts_with("pr2_"), starts_with("eukref_"))
pr2_metadata_update <- select(pr2_metadata_update, -pr2_accession)

6.9 Write to database

Either the update can be done : * directly (but depends on Internet connection) - write_db * by exporting text file and then reimporting using an Excel sheet. The advantage of the lattest method is that the data can be edited.

dv_save(pr2_main_update, "pr2_main_update.txt")
dv_save(pr2_sequences_update, "pr2_sequences_update.txt")
dv_save(pr2_metadata_update, "pr2_metadata_update.txt")

write_db(db_pr2, table_name = "pr2_main", table_df = pr2_main_update)
write_db(db_pr2, table_name = "pr2_sequences", table_df = pr2_sequences_update)
write_db(db_pr2, table_name = "pr2_metadata", table_df = pr2_metadata_update)

6.10 Add the special metadata features annotated by DinoRef

metadata_dinoref <- read_excel(file_pr2_update, sheet = "Dinoref metadata", 
    col_names = TRUE, guess_max = 2e+05)

# The next line is due to the fact that the query crashed - they are not to
# be used metadata_dinoref_done <- read_excel(file_pr2_update,
# sheet='Dinoref metadata done', col_names = TRUE, guess_max=200000)
# metadata_dinoref <- left_join(metadata_dinoref,
# select(metadata_dinoref_done, genbank_accession, metadata_remark))
# metadata_dinoref <- metadata_dinoref %>% filter(is.na(metadata_remark))


# Only keep sequences that are in the PR2 database
metadata_dinoref <- metadata_dinoref %>% filter(genbank_accession %in% pr2$genbank_accession)

# Build a query for updating metadata
metadata_dinoref <- metadata_dinoref %>% mutate(query = paste0("UPDATE pr2_metadata", 
    " SET ", update_field_string("pr2_env_biome", pr2_env_biome, append_comma = TRUE), 
    update_field_string("pr2_biotic_relationship", pr2_biotic_relationship, 
        append_comma = TRUE), update_field_string("pr2_specific_host", pr2_specific_host, 
        append_comma = TRUE), update_field_string("pr2_sample_method", pr2_sample_method, 
        append_comma = TRUE), update_field_string("gb_date", gb_date, append_comma = TRUE), 
    update_field_string("gb_authors", gb_authors, append_comma = TRUE), update_field_string("metadata_remark", 
        "Metadata from DinoRef (Mordret S.)", append_comma = TRUE), update_field_string("gb_journal", 
        gb_journal, append_comma = FALSE), " WHERE genbank_accession = ", single_quote, 
    genbank_accession, single_quote))

# Update database
metadata_dinoref$query_result <- execute_query_vector(db_pr2, metadata_dinoref$query)

7 Find the PR2 cultures dinoflagellates sequences that are NOT updated

7.1 Create file to Solenn for re-annotation

pr2_cultures_not_updated <- pr2 %>% filter((class == "Dinophyceae") & (pr2_sample_type != 
    "environmental")) %>% filter(!(genbank_accession %in% pr2_update$genbank_accession))
dv_save(pr2_cultures_not_updated, "pr2_cultures_not_updated.txt")

Solenn has resent a file called : Dinoref updates needed version 1.0 solenn.xlsx

7.2 Taxonomy correction

Done manually based on sheet : pr2 taxo not updated

7.3 Sequence correction

Done manually from the sheet

7.4 Add 2 sequences that were missing

# Read the Excel file
pr2_update_2 <- read_excel(file_pr2_update, sheet = "pr2_update_2", col_names = TRUE, 
    guess_max = 2e+05)
pr2_update_new <- pr2_update_2 %>% filter(!(genbank_accession %in% pr2$genbank_accession))

pr2_update_new_metadata <- genbank_download_parse(pr2_update_new$genbank_accession, 
    genbank_directory)
knitr::kable(head(pr2_update_new_metadata, 3))

pr2_update_new_features <- genbank_features(pr2_update_new$genbank_accession, 
    genbank_directory)
write.table(pr2_update_new_features, file = file_pr2_update_new_features, sep = "\t", 
    quote = FALSE, row.names = FALSE, na = "")
knitr::kable(head(pr2_update_new_features, 3))

pr2_update_new <- read_excel(file_pr2_update, sheet = "pr2_update_2", col_names = TRUE, 
    guess_max = 2e+05)
pr2_update_new <- left_join(pr2_update_new, pr2_update_new_metadata)

# Extract the subsequence if the whole sequence is not to be used
index <- !is.na(pr2_update_new$sequence_start)
pr2_update_new$sequence[index] <- str_sub(pr2_update_new$sequence[index], pr2_update_new$sequence_start[index], 
    pr2_update_new$sequence_end[index])
pr2_update_new$sequence_length_old <- pr2_update_new$sequence_length

# Update sequence length and ambiguities for all sequences since some may
# have been shortened
pr2_update_new$sequence_length <- str_length(pr2_update_new$sequence)
pr2_update_new$ambiguities <- str_count(pr2_update_new$sequence, pattern = "[^ATGC]")

# Remove sequences that are shorter than minimum lenght (500) and those that
# have too many ambiguities
pr2_update_new_rejected <- pr2_update_new %>% filter((sequence_length < sequence_length_min) | 
    (ambiguities > ambiguities_max))
dv_save(pr2_update_new_rejected, "pr2_update_new_rejected.txt")
knitr::kable(pr2_update_new_rejected$genbank_accession)

pr2_update_new <- pr2_update_new %>% filter((sequence_length >= sequence_length_min) & 
    (ambiguities <= ambiguities_max))

pr2_update_new <- pr2_update_new %>% dplyr::rename(start = sequence_start, end = sequence_end)
pr2_update_new$label <- "U"
pr2_update_new$added_version <- pr2_version
pr2_update_new <- pr2_update_new %>% mutate(pr2_accession = paste0(genbank_accession, 
    ".", start, ".", end, "_", label))
pr2_update_new$reference_sequence = 1

pr2_main_update <- select(pr2_update_new, pr2_accession, genbank_accession, 
    start, end, label, species, reference_sequence, starts_with("added_"), starts_with("removed_"), 
    starts_with("chimera"))
pr2_sequences_update <- select(pr2_update_new, pr2_accession, sequence, sequence_length, 
    ambiguities)
pr2_metadata_update <- select(pr2_update_new, genbank_accession, starts_with("gb_"), 
    starts_with("pr2_"), starts_with("eukref_"))
pr2_metadata_update <- select(pr2_metadata_update, -pr2_accession)

dv_save(pr2_main_update, "pr2_main_update_2.txt")
dv_save(pr2_sequences_update, "pr2_sequences_update_2.txt")
dv_save(pr2_metadata_update, "pr2_metadata_update_2.txt")

write_db(db_pr2, table_name = "pr2_main", table_df = pr2_main_update)
write_db(db_pr2, table_name = "pr2_sequences", table_df = pr2_sequences_update)
write_db(db_pr2, table_name = "pr2_metadata", table_df = pr2_metadata_update)

8 Reannotate some PR2 environmental sequences dinoflagellates sequences using dada2

Note : Only sequences with no ambiguities have been reannotated. Also I only used the Species assingment which implies 100% similarity with the reference sequences. So only 31 sequences have been annotated this way.

8.1 Extract the sequences and create dataframe compatible with dada2

For dada2 - row.names = Accession - $sequence - $abundance

# Remove any sequence which has ambiguities
pr2_env_not_updated <- pr2 %>% filter((class == "Dinophyceae") & (pr2_sample_type == 
    "environmental")) %>% filter(!(genbank_accession %in% pr2_update$genbank_accession)) %>% 
    filter(str_count(sequence, pattern = "[^ATGC]") == 0)

dv_save(pr2_env_not_updated, "pr2_env_not_updated.txt")

# Create the dada2 file
pr2_env_not_updated_dada2 <- pr2_env_not_updated %>% select(sequence) %>% mutate(abundance = 1)
row.names(pr2_env_not_updated_dada2) <- pr2_env_not_updated$genbank_accession

# pr2_export(pr2_env_not_updated, 'pr2_env_not_updated.fasta.gz',
# file_type='fasta', file_format='fasta_taxo_long')

8.2 Export the Dino reference sequences as Dada2

pr2_dinoref <- pr2 %>% filter((class == "Dinophyceae") & (reference_sequence == 
    1) & (ambiguities == 0) & (str_count(species, "_X") == 0))
pr2_export(pr2_dinoref, "pr2_dinoref_dada2.fasta.gz", file_type = "fasta", file_format = "dada2")
pr2_export(pr2_dinoref, "pr2_dinoref_dada2_species.fasta.gz", file_type = "fasta", 
    file_format = "dada2_species")

8.3 Annotate using dada2 wang taxonomy

There are two ways to proceed : * AssignTaxonomy with a bootstrap value * AssignSpecies which looks for 100% similar sequences I chose the second option that is fail safe… The assignement of 31 sequences has been verified

taxa <- assignTaxonomy(pr2_env_not_updated_dada2, refFasta = "pr2_dinoref_dada2.fasta.gz", 
    taxLevels = taxo_levels, minBoot = 0, outputBootstraps = TRUE, verbose = TRUE)
dv_save(taxa$tax, "pr2_env_not_updated_taxa.txt")
dv_save(taxa$boot, "pr2_env_not_updated_boot.txt")

species <- assignSpecies(pr2_env_not_updated_dada2, refFasta = "pr2_dinoref_dada2_species.fasta.gz", 
    allowMultiple = TRUE, tryRC = FALSE, verbose = TRUE)

species_df <- as.data.frame(species)

pr2_env_not_updated <- bind_cols(pr2_env_not_updated, select(species_df, Genus, 
    Species))
pr2_env_not_updated_assigned <- pr2_env_not_updated %>% filter(!(is.na(Genus)))
dv_save(pr2_env_not_updated_assigned, "pr2_env_not_updated_assigned.txt")

8.4 Transfer the verified taxonomy in the database

pr2_editor <- "Vaulot D."
pr2_update_3 <- read_excel(file_pr2_update, sheet = "pr2_update_3", col_names = TRUE, 
    guess_max = 2e+05)
pr2_update_3 <- pr2_reassign(pr2_update_3)

9 Export PR2 dinoflagellate sequences that are not part of the Reference dataset.

These two files have been sent to Solenn (20/02/2018) for eventual reannotation based on the reference data set.

pr2_dino_not_updated <- pr2 %>% filter((class == "Dinophyceae") & is.na(reference_sequence))

dv_save(pr2_dino_not_updated, "pr2_dino_not_updated.txt")

pr2_export(pr2_env_not_updated, "pr2_dino_not_updated.fasta.gz", file_type = "fasta", 
    file_format = "fasta_taxo_long")