--- title: "Metabarcoding data processing using dada2 - 16S" author: "CAM" date: "`r Sys.Date()`" output: "html_document" --- RStudio Version v4.4.4 Using the code in this file, we will process 16S metabarcoding data using the Dada2 pipeline. (DATA REDUCTION STAGE). Our goal is to use the DADA2 algorithm to assess read quality, denoise, and merge our sequences. Finally, we will assign our sequences to ASVs and assign these to taxonomy using DADA2 and a reference database. The first thing you must do is to set up your environment and load the necessary libraries and your data. desired outcome: https://compbiocore.github.io/metagenomics-workshop/assets/DADA2_tutorial.html ```{r setup, include=FALSE} #load wd path <- "/Users/callahanmcgovern/Desktop/Research/LHM/metabarcoding/dada2/input" setwd(path) list.files(path) knitr::opts_chunk$set(echo = TRUE) knitr::opts_knit$set(root.dir = path) getwd() #load libraries library(dada2) #version 4.2.1 library(knitr) #version 1.4.2 library(magrittr) #version 2.0.3 library(DECIPHER, quietly = T) #version 2.26.0 library(phangorn, quietly = T) #version 2.11.1 #BiocManager::install("phyloseq") library("phyloseq", quietly = T) #version 1.42.0 install.packages("phytools", quietly = T) #version 1.9-16 ``` In the next code chunk we will do quality control of our sample reads. This will begin by sorting your sample reads into forward and reverse reads by looking for the _R1.fastq extension. ** Make sure this is the extension your files use, if not, change the code to match your fwd/rev extension pattern. Then, we will extract the base of the sample names and visualize the quality of our reads. You do not need to see all forward and reverse reads for each sample. We pick the first two from both here to get an idea of quality. This will help determine where you trim your reads. Lastly, we filter and trim our reads based on the quality reading above. ```{r quality control} #INSPECT READ QUALITY Fs <- sort(list.files(path, pattern="_R1.fastq", full.names = TRUE)) Rs <- sort(list.files(path, pattern="_R2.fastq", full.names = TRUE)) sample.names <- sapply(strsplit(basename(Fs), "_"), `[`, 1) sample.names dir.create('fastqs_trimmed') list.files('.') list.files('./fastqs_trimmed') #plot quality profile for FWD/REV reads plotQualityProfile(Fs[1:2]) #plotQualityProfile(Rs[1:2]) #FILTER AND TRIM filtFs <- file.path('fastqs_trimmed', paste0(sample.names, "_F_filt.fastq")) filtRs <- file.path('fastqs_trimmed', paste0(sample.names, "_R_filt.fastq")) out <- filterAndTrim(fwd=Fs, filt=filtFs, rev=Rs, filt.rev=filtRs, maxEE=c(2,2), truncQ=2, rm.phix = TRUE, multithread=TRUE, compress = TRUE) truncLen=c(250, 230) list.files('fastqs_trimmed') saveRDS(out, file = "filtered.RDS") head(out) ``` Now its time to perform the core dada2 algorithm. Here we will instruct dada to learn the error rated of the samples and then it will use this rate to perform a denoising step. Inbetween we delete replicates for less noise as well. Red line shows error rates expected under the nominal definition of the Q-score You want the black line to track with the observed rates (points) and the error rate should drop with increased quality ``` {r dada_denoise} #REMOVE EXACT REPLICATES dereplicated1 <- derepFastq(filtFs, verbose=T) names(dereplicated1) <- sample.names dereplicated2 <- derepFastq(filtRs, verbose=T) names(dereplicated2) <- sample.names #LEARN ERROR RATES ##black line shows estimated error rates after convergence of the machine-learning algorithm. 
Now it's time to run the core DADA2 algorithm. First we dereplicate the filtered reads (removing exact replicates so there is less to compute), then we have DADA2 learn the error rates of the samples and use those rates to denoise the reads. In the error plots, the red line shows the error rates expected under the nominal definition of the Q-score; you want the black line (the estimated error rates) to track the observed rates (points), and error rates should drop as quality increases.

```{r dada_denoise}
#REMOVE EXACT REPLICATES
dereplicated1 <- derepFastq(filtFs, verbose=T)
names(dereplicated1) <- sample.names
dereplicated2 <- derepFastq(filtRs, verbose=T)
names(dereplicated2) <- sample.names

#LEARN ERROR RATES
##black line shows estimated error rates after convergence of the machine-learning algorithm
errF <- learnErrors(filtFs, multithread = TRUE)
errR <- learnErrors(filtRs, multithread = TRUE)
plotErrors(errF, nominalQ = T)

#DADA DENOISE
dadaFs <- dada(dereplicated1, err=errF, multithread = TRUE)
dadaRs <- dada(dereplicated2, err=errR, multithread = TRUE)
dadaFs
dadaRs
```

Here we merge our trimmed, filtered, denoised forward and reverse reads. Then we build a sequence table (seqtab) that holds all of our ASVs, ordered by abundance. Lastly, we remove chimeras and, if stopping after this point, save the results to a FASTA file.

```{r ASV}
#MERGE PAIRS - this run found 184 ASVs
merged <- mergePairs(dadaFs, dereplicated1, dadaRs, dereplicated2, verbose = T, trimOverhang = TRUE)
class(merged)  #list
length(merged) #one element for each of our samples
names(merged)  #name of each element

#tells you how many merged samples you have and how many ASVs you have
seqtab <- makeSequenceTable(merged, orderBy='abundance')
rownames(seqtab)
class(seqtab)
dim(seqtab)

#tells you the lengths of the sequences found and their abundances
table(nchar(getSequences(seqtab)))

#REMOVE CHIMERAS
seqtab.nochim <- removeBimeraDenovo(seqtab, method = "consensus", multithread=TRUE, verbose = TRUE)
dim(seqtab.nochim)
sum(seqtab.nochim)/sum(seqtab) #fraction of merged reads retained after chimera removal

#remove singleton ASVs (total abundance <= 1 across all samples), since these are most likely sequencing errors
is1 <- colSums(seqtab.nochim) <= 1
seqtab.no1 <- seqtab.nochim[,!is1]
seqtab <- seqtab.no1
saveRDS(seqtab, "~/dada2/outputs/seqtab_silvaCyano")
#if resuming a session, reload the saved table from backup:
#seqtab <- readRDS("~/dada2/outputs/backups/seqtab_silvaCyano")

#look at read lengths and drop sequences that are not the desired length
table(nchar(getSequences(seqtab)))
short <- nchar(getSequences(seqtab)) < 374
seqtab.sized <- seqtab[,!short]
seqtab <- seqtab.sized
table(nchar(getSequences(seqtab)))

#WRITE RESULTS TO A FASTA FILE
asv_seqs <- colnames(seqtab)
asv_headers <- vector(dim(seqtab)[2], mode="character")
for (i in 1:dim(seqtab)[2]) {
  asv_headers[i] <- paste(">ASV", i, sep="_")
}
asv_fasta <- c(rbind(asv_headers, asv_seqs))
write(asv_fasta, "~/dada2/outputs/ASVs.fa")
ape::read.FASTA("~/dada2/outputs/ASVs.fa") #summarizes the ASVs (requires the ape package)
```

Now let's track the reads through the pipeline so far, so we can see how many were discarded at each step along the way.

```{r track}
#Track reads through the pipeline
getN <- function(x) sum(getUniques(x))
track <- cbind(out, sapply(dadaFs, getN), sapply(dadaRs, getN), sapply(merged, getN), rowSums(seqtab.nochim))
colnames(track) <- c("input", "filtered", "denoisedF", "denoisedR", "merged", "nonchim")
rownames(track) <- sample.names
print(track)
```

Now we can assign taxonomy to our ASVs! We will do this with CyanoSeq, a SILVA dataset enriched with curated cyanobacterial taxonomy.

```{r assign taxonomy}
database <- "~/dada2/databases/CyanoSeq0.9.6_SILVA138.1_dada2.fastq"
taxa_silva_cyano <- assignTaxonomy(seqtab, database, verbose=2, minBoot=80, multithread=TRUE)
taxonomy <- taxa_silva_cyano
View(taxonomy)

#remove chloroplast reads, i.e. "junk" for our purposes
chloroplasts <- taxonomy[,"Order"] %in% "Chloroplast"
seqtab.nochloro <- seqtab[,!chloroplasts]
taxa.nochloro <- taxonomy[!chloroplasts,]
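
#optionally remove mitochondrial reads the same way -- a minimal sketch, assuming
#your reference database labels them under Family == "Mitochondria" (as SILVA does);
#check your taxonomy table before uncommenting
#mitochondria <- taxa.nochloro[,"Family"] %in% "Mitochondria"
#seqtab.nochloro <- seqtab.nochloro[,!mitochondria]
#taxa.nochloro <- taxa.nochloro[!mitochondria,]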

#rename tables for the ps object
taxa_table <- taxa.nochloro
asvs <- seqtab.nochloro

#manually check and adjust top ASVs as needed
taxa_table[1, "Genus"] <- "Microcoleus"   #could also be Phormidium
taxa_table[2, "Genus"] <- "Microcoleus"   #could also be Phormidium
taxa_table[3, "Genus"] <- "Microcoleus"   #could also be Phormidium
taxa_table[5, "Genus"] <- "Synechococcus" #could also be Cyanobium
taxa_table[15, "Genus"] <- "Microcoleus"  #could also be Phormidium
```

Now we can align our sequences and make a phylogenetic tree:

```{r creating a tree}
seqs <- getSequences(seqtab.nochloro)
names(seqs) <- seqs
#run multiple sequence alignment (MSA) using DECIPHER
alignment <- AlignSeqs(DNAStringSet(seqs), anchor = NA)
#convert the alignment output into a phyDat structure for phangorn
phang.align <- phyDat(as(alignment, "matrix"), type = "DNA")
dm <- dist.ml(phang.align)
treeNJ <- NJ(dm)
fit <- pml(treeNJ, data=phang.align)
fit
fitGTR <- update(fit, k=4, inv=0.2)
#this takes a long time to run
fitGTR_RTDB <- optim.pml(fitGTR, model="GTR", optInv=TRUE, optGamma=TRUE,
                         rearrangement = "stochastic", control = pml.control(trace = 0))
#save the fits so the optimization does not have to be rerun
save(fitGTR, file="fitGTR.RData")
save(fitGTR_RTDB, file="fitGTR_tree.RData")
#note: load() restores the saved objects into the environment by name;
#do not assign its return value, which is just a character vector of those names
#load("fitGTR.RData")      #restores fitGTR
#load("fitGTR_tree.RData") #restores fitGTR_RTDB

##add this to the ps object when complete
#set.seed(711) #root the tree for UniFrac
#phy_tree(ps) <- root(phy_tree(ps), sample(taxa_names(ps), 1), resolve.root = TRUE)
#is.rooted(phy_tree(ps))
```

Finally, we use all the data processed so far to create a phyloseq object, which bundles the ASV table, taxonomy table, sample metadata, and phylogenetic tree.

```{r phyloseq}
library(phyloseq)

#import or create the metadata data frame
#Depth here is randomly generated placeholder data -- replace it with real measurements
sampledata <- sample_data(data.frame(
  Name = c("FTn.bottom","Alcove.sinkhole","MIS.Core_c","MIS.Core_+N","MIS.Core_+O","E.C.bay","Ftn.top.p","Ftn.top.w","Ftn.mid"),
  Location = c("Fountain","Alcove","MIS","MIS","MIS","El Cajon Bay","Fountain","Fountain","Fountain"),
  Depth = sample(50:1000, size=length(sample.names), replace=TRUE),
  Treatment = c("-","-","control","Nitrogen_+","Oxygen_+","-","-","-","-"),
  row.names=sample.names,
  stringsAsFactors=FALSE
))
View(sampledata)

#make sure row names in the various tables/data frames/matrices match
#meta_data <- meta_data[c(5,6,7,8,9,1,2,3,4),]
#rownames(meta_data) <- c("Sample_10","Sample_11","Sample_12","Sample_13","Sample_14","Sample_5","Sample_7","Sample_8","Sample_9")
#meta_data1 <- meta_data[, -1]
#meta_data1$sample_name <- c("ftn_bottom", "alcove", "coreC", "coreN", "coreO","EC_", "ftn_topW", "ftn_topP", "ftn_mid")
#rownames(asvs) <- c("Sample_10","Sample_11","Sample_12","Sample_13","Sample_14","Sample_5","Sample_7","Sample_8","Sample_9")

#make the phyloseq object
ps <- phyloseq(otu_table(asvs, taxa_are_rows = FALSE),
               tax_table(taxa_table),
               sample_data(sampledata),
               phy_tree(fitGTR_RTDB$tree))

#store the full sequences and rename taxa from raw sequences to "ASV1", "ASV2", ...
dna <- Biostrings::DNAStringSet(taxa_names(ps))
names(dna) <- taxa_names(ps)
ps <- merge_phyloseq(ps, dna)
taxa_names(ps) <- paste0("ASV", seq(ntaxa(ps)))

#save to reuse later
saveRDS(ps, "ps.rds")
ps <- readRDS("/Users/callahanmcgovern/Desktop/Research/LHM/metabarcoding/dada2/input/ps.rds")
ps
sample_data(ps) <- sample_data(sampledata)
ps

#VIEW SUB-OBJECTS AS DESIRED
View(tax_table(ps))
View(otu_table(ps))
View(sample_data(ps))
View(phy_tree(ps))
```
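
As a quick first look at the finished object, the sketch below (not part of the pipeline above, but using standard phyloseq functions) converts the raw counts to per-sample relative abundances and plots community composition at the Phylum rank:

```{r quick look, eval=FALSE}
#convert raw counts to per-sample relative abundances
ps_rel <- transform_sample_counts(ps, function(x) x / sum(x))
#stacked bar plot of community composition, colored by Phylum
plot_bar(ps_rel, fill = "Phylum")
```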