---
title: "Metabarcoding data processing using dada2 - 16S"
author: "CAM"
date: "`r Sys.Date()`"
output: "html_document"
---
RStudio Version v4.4.4

Using the code in this file, we will process 16S metabarcoding data using the Dada2 pipeline. (DATA REDUCTION STAGE). Our goal is to use the DADA2 algorithm to assess read quality, denoise, and merge our sequences. Finally, we will assign our sequences to ASVs and assign these to taxonomy using DADA2 and a reference database.

The first thing you must do is to set up your environment and load the necessary libraries and your data. 

desired outcome: https://compbiocore.github.io/metagenomics-workshop/assets/DADA2_tutorial.html

```{r setup, include=FALSE}
#load wd
path <- "/Users/callahanmcgovern/Desktop/Research/LHM/metabarcoding/dada2/input"
setwd(path)
list.files(path)

knitr::opts_chunk$set(echo = TRUE)
knitr::opts_knit$set(root.dir = path)
getwd()

#load libraries
library(dada2)
#version 4.2.1
library(knitr)
#version 1.4.2
library(magrittr)
#version 2.0.3
library(DECIPHER, quietly = T) 
#version 2.26.0
library(phangorn, quietly = T)
#version 2.11.1
#BiocManager::install("phyloseq")
library("phyloseq", quietly = T) 
#version 1.42.0
install.packages("phytools", quietly = T)
#version 1.9-16
```

In the next code chunk we will do quality control of our sample reads. 
  This will begin by sorting your sample reads into forward and reverse reads by looking for the _R1.fastq extension. ** Make sure this is the extension your files use, if not, change the code to match your fwd/rev extension pattern. 
    Then, we will extract the base of the sample names and visualize the quality of our reads. You do not need to see all forward and reverse reads for each sample. We pick the first two from both here to get an idea of quality. This will help determine where you trim your reads. 
    Lastly, we filter and trim our reads based on the quality reading above. 
    
```{r quality control}
#INSPECT READ QUALITY
Fs <- sort(list.files(path, pattern="_R1.fastq", full.names = TRUE))
Rs <- sort(list.files(path, pattern="_R2.fastq", full.names = TRUE))
sample.names <- sapply(strsplit(basename(Fs), "_"), `[`, 1)
sample.names

dir.create('fastqs_trimmed')
list.files('.')
list.files('./fastqs_trimmed')

#plot quality profile for FWD/REV reads
plotQualityProfile(Fs[1:2])
#plotQualityProfile(Rs[1:2])

#FILTER AND TRIM 
filtFs <- file.path('fastqs_trimmed', 
                    paste0(sample.names, "_F_filt.fastq"))
filtRs <- file.path('fastqs_trimmed', 
                    paste0(sample.names, "_R_filt.fastq"))

out <- filterAndTrim(fwd=Fs, filt=filtFs, 
                     rev=Rs, filt.rev=filtRs, 
                     maxEE=c(2,2),
                     truncQ=2,
                     rm.phix = TRUE,
                     multithread=TRUE,
                     compress = TRUE) 
                     truncLen=c(250, 230)  
list.files('fastqs_trimmed')
saveRDS(out, file = "filtered.RDS")
head(out)
```

Now its time to perform the core dada2 algorithm. Here we will instruct dada to learn the error rated of the samples and then it will use this rate to perform a denoising step. Inbetween we delete replicates for less noise as well. 
Red line shows  error rates expected under the nominal definition of the Q-score
You want the black line to track with the observed rates (points) and the error rate should drop with increased quality

``` {r dada_denoise}
#REMOVE EXACT REPLICATES 
dereplicated1 <- derepFastq(filtFs, verbose=T)
names(dereplicated1) <- sample.names
dereplicated2 <- derepFastq(filtRs, verbose=T)
names(dereplicated2) <- sample.names

#LEARN ERROR RATES
##black line shows estimated error rates after convergence of the machine-learning algorithm. 
errF <- learnErrors(filtFs, multithread = TRUE)
errR <- learnErrors(filtRs, multithread = TRUE)
plotErrors(errF, nominalQ = T)

#DADA DENOISE
dadaFs <- dada(dereplicated1, err=errF, multithread = TRUE)
dadaRs <- dada(dereplicated2, err=errR, multithread = TRUE)
dadaFs
dadaRs
```

Here we will merge our trimmed, filtered, denoised forward and reverse sequences. Then you will create a table (seqtab) which will hold all of your ASVs and order them by abundance. Lastly you will remove chimeras and finally, save your results to a fasta file if you are stopping after this point.

``` {r ASV}
#MERGE PAIRS - found 184 ASVs
merged <- mergePairs(dadaFs, filtFs, dadaRs, filtRs, verbose = T, trimOverhang = TRUE)
class(merged) # list
length(merged) # elements-- one for each of our samples
names(merged) #name of each element

#tells you how many merged samples you have and how many ASVs you have
seqtab <- makeSequenceTable(merged, orderBy='abundance')
rownames(seqtab)
class(seqtab)
dim(seqtab)

#tells you length of various seqs found and their abundance 
table(nchar(getSequences(seqtab)))

#REMOVE CHIMERAS 
seqtab.nochim <- removeBimeraDenovo(seqtab, method = "consensus", multithread=TRUE, verbose = TRUE)
dim(seqtab.nochim)
sum(seqtab.nochim)/sum(seqtab) #tells you % of merged reads that were chimeras

#this looks for all reads with an abundance of 1 and removes them (we do this because reads with abundance = 1 are most likely sequencing errors)
is1 <- colSums(seqtab.nochim) <= 1
seqtab.no1 <- seqtab.nochim[,!is1]
seqtab <- seqtab.no1
saveRDS(seqtab, "~/dada2/outputs/seqtab_silvaCyano")
seqtab <- readRDS("~/dada2/outputs/backups/seqtab_silvaCyano")

#look at read lengths and trim seqs that are not the desired read
table(nchar(getSequences(seqtab)))
short <- nchar(getSequences(seqtab)) < 374
seqtab.sized <- seqtab[,!short]
seqtab <- seqtab.sized
table(nchar(getSequences(seqtab)))

#WRITE RESULTS TO FASTA FILE
#making asv fasta file 
asv_seqs <- colnames(seqtab)
asv_headers <- vector(dim(seqtab)[2], mode="character")

for (i in 1:dim(seqtab)[2]) {
  asv_headers[i] <- paste(">ASV", i, sep="_")
}
asv_fasta <- c(rbind(asv_headers, asv_seqs))
write(asv_fasta, "~/dada2/outputs/ASVs.fa")
read.FASTA("~/dada2/outputs/ASVs.fa") #gives you info about ASVs

```

Now lets track the reads through our pipeline so far so we can see how many have been discarded in each step along the way. 

``` {r track}

#Track reads through the pipeline 
getN <- function(x) sum(getUniques(x))
track <- cbind(out, sapply(dadaFs, getN), sapply(dadaRs, getN), sapply(merged, getN), rowSums(seqtab.nochim))
colnames(track) <- c("input", "filtered", "denoisedF", "denoisedR", "merged", "nonchim")
rownames(track) <- sample.names
print(track)

```

Now we can assign taxonomic identification for our ASVs! We will do this with an enriched SILVA dataset called CyanoSeq. 

``` {r assign taxonomy}
#install.packages("dada2")
database <- "~/dada2/databases/CyanoSeq0.9.6_SILVA138.1_dada2.fastq"
taxa_silva_cyano <- assignTaxonomy(seqtab, database, verbose=2, minBoot=80, multithread=TRUE)

taxonomy <- taxa_silva_cyano
View(taxonomy)

#remove chloroplast dna ID aka "junk" 
chloroplasts <- taxonomy[,"Order"] %in% "Chloroplast"
seqtab.nochloro <- seqtab[,!chloroplasts]
taxa.nochloro <- taxonomy[!chloroplasts,]

#rename tables for ps object
taxa_table <- taxa.nochloro
asvs <- seqtab.nochloro

#manually check and adjust top ASVs as needed
taxa_table[1, "Genus"] <- "Microcoleus" #is it microcoleus or phormidium
taxa_table[2, "Genus"] <- "Microcoleus" #is it microcoleus or phormidium ???
taxa_table[3, "Genus"] <- "Microcoleus" #is it microcoleus or phormidium ???
taxa_table[5, "Genus"] <- "Synechococcus" #maybe cynobium tho
taxa_table[15, "Genus"] <- "Microcoleus" #or phormidium

```

Now we can align our sequences and make a phylogenetic tree: 

```{r creating a tree}
seqs <- getSequences(seqtab.nochloro)
names(seqs) <- seqs 
alignment <- AlignSeqs(DNAStringSet(seqs), anchor = NA) #Run Sequence Alignment (MSA) using DECIPHER
phang.align <- phyDat(as(alignment, "matrix"), type = "DNA") #Change sequence alignment output into a phyDat structure
dm <- dist.ml(phang.align) 
treeNJ <- NJ(dm) 
fit = pml(treeNJ, data=phang.align)
View(fit)
fitGTR <- update(fit, k=4, inv=0.2)
fitGTR_RTDB <- optim.pml(fitGTR, model="GTR", optInv=TRUE, optGamma=TRUE,
                    rearrangement = "stochastic", control = pml.control(trace = 0))   #this takes forever 
save(fitGTR, file=("fitGTR.RData"))
fitGTR <- load("fitGTR.RData")
View(fitGTR)
save(fitGTR_RTDB, file=("fitGTR_tree.RData"))
fitGTR_tree <- load("fitGTR_tree.RData")
View(fitGTR_tree)


##add this to ps object when complete
#set.seed(711) #root tree for unifrac 
#phy_tree(ps) <- root(phy_tree(ps), sample(taxa_names(ps), 1), resolve.root = TRUE)
#is.rooted(phy_tree(ps))
```

Finally, use all data processed so far to create a phyloseq object which contains the ASV table, taxa table, sample meta data, phylogenetic tree. 
```{r phyloseq}
library(phyloseq)
#import or create meta data df
sampledata = sample_data(data.frame(
  Name = c("FTn.bottom","Alcove.sinkhole","MIS.Core_c","MIS.Core_+N","MIS.Core_+O","E.C.bay","Ftn.top.p","Ftn.top.w","Ftn.mid"),
  Location = c("Fountain","Alcove","MIS","MIS","MIS","El Cajon Bay","Fountain","Fountain","Fountain"),
  Depth = sample(50:1000, size=nsamples(ps), replace=TRUE),
  Treatment = c("-","-","control","Nitrogen_+","Oxygen_+","-","-","-","-"),
  row.names=sample_names(ps),
  stringsAsFactors=FALSE
))
View(sampledata)

#making sure row names in various tables/df/matrices match 
#meta_data <- meta_data[c(5,6,7,8,9,1,2,3,4),]
#rownames(meta_data) <- c("Sample_10","Sample_11","Sample_12","Sample_13","Sample_14","Sample_5","Sample_7","Sample_8","Sample_9")
#meta_data1 <- meta_data[, -1]
#meta_data1$sample_name <- c("ftn_bottom", "alcove", "coreC", "coreN", "coreO","EC_", "ftn_topW", "ftn_topP", "ftn_mid")
#rownames(asvs) <- c("Sample_10","Sample_11","Sample_12","Sample_13","Sample_14","Sample_5","Sample_7","Sample_8","Sample_9")

#making a phyloseq object
ps <- phyloseq(otu_table(asvs, taxa_are_rows = F), 
               tax_table(taxa_table),
               sample_data(sampledata),
               phy_tree(fitGTR_RTDB$tree))

#change from actual seq to "ASVs"
dna <- Biostrings::DNAStringSet(taxa_names(ps))
names(dna) <- taxa_names(ps)
ps <- merge_phyloseq(ps, dna)
taxa_names(ps) <- paste0("ASV", seq(ntaxa(ps)))

#save to reuse later
saveRDS(ps, "ps.rds")
ps <- readRDS("/Users/callahanmcgovern/Desktop/Research/LHM/metabarcoding/dada2/input/ps.rds")
ps

sample_data(ps) <- sample_data(sampledata)
ps

#VIEW SUB-OBJECTS AS DESIRED
View(tax_table(ps))
View(otu_table(ps))
View(sample_data(ps))
View(phy_tree(ps))
```