#time: ~ 5h
### Activate the QIIME 2 conda environment
# `conda activate` is a shell function that is only defined once conda's shell
# hook has been loaded. Interactive shells normally get it from `conda init`,
# but a non-interactive `bash script.sh` run does not, so load the hook
# explicitly before activating.
# shellcheck source=/dev/null
source "$(conda info --base)/etc/profile.d/conda.sh"
conda activate qiime2-2021.8
### Import the demultiplexed reads into a QIIME 2 artifact
# Reads the paired-end FASTQ files listed in the Phred33 V2 manifest and saves
# them as one SampleData[PairedEndSequencesWithQuality] artifact.
# Author's run-time note: about 2 h.
import_opts=(
  --input-format PairedEndFastqManifestPhred33V2
  --input-path FourStages_pe-33-manifest.txt
  --type 'SampleData[PairedEndSequencesWithQuality]'
  --output-path FourStages-Paired-end-demux.qza
)
time qiime tools import "${import_opts[@]}"

## Quality summary of the imported reads
# Writes a visualization of how sequence quality is distributed at each
# position of the sequence data.
demux_qza=FourStages-Paired-end-demux.qza
time qiime demux summarize \
  --o-visualization "${demux_qza%.qza}.qzv" \
  --i-data "$demux_qza"
 
### DADA2 pipeline
## Denoise
# Reads carry a 6-base barcode followed by a 20-base primer, so 26 bases are
# trimmed from the start of both the forward and the reverse reads.
# Primers: 338F ACTCCTACGGGAGGCAGCAG and 806R GGACTACHVGGGTWTCTAAT.
# --p-trunc-len-f and --p-trunc-len-r were set according to the demux quality
# summary. Author's note: overlap is 15.
# --p-n-threads 0 uses all available cores for multithreaded processing.
# Author's run-time note: about 2 h.
dada2_tag='FourStages-DAD2-F295+R205'
time qiime dada2 denoise-paired \
  --i-demultiplexed-seqs FourStages-Paired-end-demux.qza \
  --p-trim-left-f 26 \
  --p-trim-left-r 26 \
  --p-trunc-len-f 295 \
  --p-trunc-len-r 205 \
  --p-n-threads 0 \
  --o-table "table-${dada2_tag}.qza" \
  --o-representative-sequences "rep-seqs-${dada2_tag}.qza" \
  --o-denoising-stats "stats-${dada2_tag}.qza"

# Output visualizations for the three DADA2 artifacts
dada2_run='FourStages-DAD2-F295+R205'

# Denoising statistics, rendered as a table
qiime metadata tabulate \
  --o-visualization "stats-${dada2_run}.qzv" \
  --m-input-file "stats-${dada2_run}.qza"

# Feature table summary
qiime feature-table summarize \
  --o-visualization "table-${dada2_run}.qzv" \
  --i-table "table-${dada2_run}.qza"

# Representative sequences listing
qiime feature-table tabulate-seqs \
  --o-visualization "rep-seqs-${dada2_run}.qzv" \
  --i-data "rep-seqs-${dada2_run}.qza"

### Training feature classifiers with q2-feature-classifier
## Install RESCRIPt
# All installers are run non-interactively (-y) so an unattended run does not
# stall on a confirmation prompt. apt-get is used instead of apt because apt's
# command-line interface is not intended for use in scripts.
sudo apt-get update
sudo apt-get install -y git
# xmltodict is installed from conda before pip-installing RESCRIPt from GitHub.
conda install -y -c conda-forge -c bioconda -c qiime2 -c defaults xmltodict
pip install git+https://github.com/bokulich-lab/RESCRIPt.git

## Download SILVA 138 reference data
# Fetches the SSURef_NR99 sequences and the matching taxonomy (species labels
# included) through RESCRIPt.
silva_prefix='silva-138-ssu-nr99'
qiime rescript get-silva-data \
    --p-target SSURef_NR99 \
    --p-version 138 \
    --p-include-species-labels \
    --o-silva-taxonomy "${silva_prefix}-tax.qza" \
    --o-silva-sequences "${silva_prefix}-seqs.qza"

## "Culling" low-quality sequences with cull-seqs
# Author's note: this removes sequences containing 5 or more ambiguous bases
# (IUPAC-compliant ambiguity codes) and any homopolymers of 8 or more bases.
# No thresholds are passed here, so presumably these are the cull-seqs
# defaults -- confirm against the installed RESCRIPt version.
cull_opts=(
  --i-sequences silva-138-ssu-nr99-seqs.qza
  --o-clean-sequences silva-138-ssu-nr99-seqs-cleaned.qza
)
time qiime rescript cull-seqs "${cull_opts[@]}"

## Filter sequences by length, per taxon
# Labels and minimum lengths are paired by position:
# Archaea 900, Bacteria 1200, Eukaryota 1400.
length_filter_opts=(
  --i-taxonomy silva-138-ssu-nr99-tax.qza
  --i-sequences silva-138-ssu-nr99-seqs-cleaned.qza
  --p-labels Archaea Bacteria Eukaryota
  --p-min-lens 900 1200 1400
  --o-discarded-seqs silva-138-ssu-nr99-seqs-discard.qza
  --o-filtered-seqs silva-138-ssu-nr99-seqs-filt.qza
)
time qiime rescript filter-seqs-length-by-taxon "${length_filter_opts[@]}"

## Dereplicate the length-filtered sequences together with their taxonomy
# Uses the 'silva' rank handles and the 'uniq' dereplication mode.
silva_ref='silva-138-ssu-nr99'
time qiime rescript dereplicate \
    --p-mode uniq \
    --p-rank-handles silva \
    --i-taxa "${silva_ref}-tax.qza" \
    --i-sequences "${silva_ref}-seqs-filt.qza" \
    --o-dereplicated-taxa "${silva_ref}-tax-derep-uniq.qza" \
    --o-dereplicated-sequences "${silva_ref}-seqs-derep-uniq.qza"

# Build an amplicon-region-specific reference
# Extracts the region between the 338F and 806R primers from the dereplicated
# reference sequences, in forward orientation, using 5 jobs.
primer_338f=ACTCCTACGGGAGGCAGCAG
primer_806r=GGACTACHVGGGTWTCTAAT
time qiime feature-classifier extract-reads \
    --i-sequences silva-138-ssu-nr99-seqs-derep-uniq.qza \
    --p-read-orientation forward \
    --p-n-jobs 5 \
    --p-f-primer "$primer_338f" \
    --p-r-primer "$primer_806r" \
    --o-reads silva-138-ssu-nr99-seqs-338f-806r.qza

## Dereplicate the extracted amplicon region
# Same settings as the full-length dereplication ('silva' rank handles, 'uniq'
# mode), applied to the 338F-806R reads and the dereplicated taxonomy.
region_derep_opts=(
  --i-sequences silva-138-ssu-nr99-seqs-338f-806r.qza
  --i-taxa silva-138-ssu-nr99-tax-derep-uniq.qza
  --p-mode uniq
  --p-rank-handles silva
  --o-dereplicated-sequences silva-138-ssu-nr99-seqs-338f-806r-uniq.qza
  --o-dereplicated-taxa silva-138-ssu-nr99-tax-338f-806r-derep-uniq.qza
)
qiime rescript dereplicate "${region_derep_opts[@]}"


### Train a Naive Bayes classifier
# Fits the classifier on the dereplicated 338F-806R reference reads and their
# taxonomy, with a classify chunk size of 1200.
region_tax='silva-138-ssu-nr99-tax-338f-806r-derep-uniq'
time qiime feature-classifier fit-classifier-naive-bayes \
    --i-reference-taxonomy "${region_tax}.qza" \
    --i-reference-reads silva-138-ssu-nr99-seqs-338f-806r-uniq.qza \
    --p-classify--chunk-size 1200 \
    --o-classifier "${region_tax}-classifier.qza"


# Classify the representative sequences
# NOTE(review): the classifier is loaded from a "qiime2-2020.11" directory
# rather than from the classifier file produced by the training step in this
# script -- confirm it matches the active qiime2-2021.8 environment.
# NOTE(review): the DADA2 step writes rep-seqs into the working directory,
# while this step reads them from FourStages/...-Data/ -- confirm the files
# were moved there.
run='FourStages-DAD2-F295+R205'
run_dir="FourStages/${run}"
classifier_qza='training-feature-classifiers-qiime2-2020.11/silva-138-ssu-nr99-tax-338f-806r-derep-uniq-classifier-qiime2-2020.11.qza'
time qiime feature-classifier classify-sklearn \
  --i-reads "${run_dir}/${run}-Data/rep-seqs-${run}.qza" \
  --i-classifier "$classifier_qza" \
  --p-n-jobs 1 \
  --p-reads-per-batch 12000 \
  --o-classification "${run_dir}/Taxonomy-${run}/taxonomy-${run}.qza"

# Output visualization
# Tabulate the taxonomy assignments. The input is read from the Taxonomy-*
# directory, which is where classify-sklearn writes it; a bare file name only
# resolves if the shell happens to be inside that directory.
taxonomy_qza='FourStages/FourStages-DAD2-F295+R205/Taxonomy-FourStages-DAD2-F295+R205/taxonomy-FourStages-DAD2-F295+R205.qza'
time qiime metadata tabulate \
  --m-input-file "$taxonomy_qza" \
  --o-visualization taxonomy-FourStages-DAD2-F295+R205.qzv

### Filtering data
## Taxonomy-based filtering of the feature table
# Keeps features whose taxonomy contains "p__" and drops anything annotated as
# mitochondria or chloroplast.
run='FourStages-DAD2-F295+R205'
run_dir="FourStages/${run}"
qiime taxa filter-table \
  --i-taxonomy "${run_dir}/Taxonomy-${run}/taxonomy-${run}.qza" \
  --i-table "${run_dir}/${run}-Data/table-${run}.qza" \
  --p-exclude mitochondria,chloroplast \
  --p-include p__ \
  --o-filtered-table "${run_dir}/Filtered1/Filtered1-Data/table-FourStages-with-phyla-no-mitochondria-no-chloroplast.qza"
  
## Total-frequency-based filtering
# Removes every feature whose total abundance (summed across all samples) is
# below 10.
filt_dir='FourStages/FourStages-DAD2-F295+R205/Filtered1/Filtered1-Data'
time qiime feature-table filter-features \
  --p-min-frequency 10 \
  --i-table "${filt_dir}/table-FourStages-with-phyla-no-mitochondria-no-chloroplast.qza" \
  --o-filtered-table "${filt_dir}/table-FourStages-DAD2-F295+R205-frequency-Filtered1.qza"

# Output visualization
# Summarize the frequency-filtered table. The table is read from the
# Filtered1-Data directory, which is where the total-frequency filter writes
# it; a bare file name only resolves from inside that directory.
freq_filtered_table='FourStages/FourStages-DAD2-F295+R205/Filtered1/Filtered1-Data/table-FourStages-DAD2-F295+R205-frequency-Filtered1.qza'
qiime feature-table summarize \
  --i-table "$freq_filtered_table" \
  --o-visualization table-FourStages-DAD2-F295+R205-frequency-Filtered1.qzv

## Contingency-based filtering
# Keeps only features that are present in at least 6 samples
# (--p-min-samples 6).
# NOTE(review): the original note mentioned "eight (>10%) samples" while the
# command uses 6 -- confirm which threshold is intended.
# The last argument deliberately has no trailing backslash: a continuation
# there would splice whatever line follows into this command.
time qiime feature-table filter-features \
  --i-table FourStages/FourStages-DAD2-F295+R205/Filtered1/Filtered1-Data/table-FourStages-DAD2-F295+R205-frequency-Filtered1.qza \
  --p-min-samples 6 \
  --o-filtered-table FourStages/FourStages-DAD2-F295+R205/Filtered1/Filtered1-Data/table-FourStages-DAD2-F295+R205-filtered-table-sample-contingency-Filtered1.qza

# Output visualization
# Summarize the contingency-filtered table. The table is read from the
# Filtered1-Data directory, which is where the contingency filter writes it; a
# bare file name only resolves from inside that directory.
contingency_table='FourStages/FourStages-DAD2-F295+R205/Filtered1/Filtered1-Data/table-FourStages-DAD2-F295+R205-filtered-table-sample-contingency-Filtered1.qza'
qiime feature-table summarize \
  --i-table "$contingency_table" \
  --o-visualization table-FourStages-DAD2-F295+R205-filtered-table-sample-contingency-Filtered1.qzv

## Library size normalization by scaling with ranked subsampling (SRS)
# Cmin (the sampling depth) was determined with the SRS Shiny app;
# reference doi:10.7717/peerj.9593.
pip install git+https://github.com/vitorheidrich/q2-srs.git

filt_dir='FourStages/FourStages-DAD2-F295+R205/Filtered1/Filtered1-Data'
contingency_name='table-FourStages-DAD2-F295+R205-filtered-table-sample-contingency-Filtered1'
qiime srs SRS \
  --p-c-min 17063 \
  --i-table "${filt_dir}/${contingency_name}.qza" \
  --o-normalized-table "${filt_dir}/Size-Normalization/${contingency_name}-normal.qza" \
  --verbose
  

## Remove features left with zero abundance after normalization
# Input and output are the same path, so the normalized table is overwritten
# in place with its filtered version.
normalized_table='FourStages/FourStages-DAD2-F295+R205/Filtered1/Filtered1-Data/Size-Normalization/table-FourStages-DAD2-F295+R205-filtered-table-sample-contingency-Filtered1-normal.qza'
time qiime feature-table filter-features \
  --p-min-frequency 1 \
  --i-table "$normalized_table" \
  --o-filtered-table "$normalized_table"
  
## Output visualization
# Summarize the normalized table. The table is read from the
# Size-Normalization directory, which is where the normalization and
# zero-abundance filter steps write it; a bare file name only resolves from
# inside that directory.
normalized_table='FourStages/FourStages-DAD2-F295+R205/Filtered1/Filtered1-Data/Size-Normalization/table-FourStages-DAD2-F295+R205-filtered-table-sample-contingency-Filtered1-normal.qza'
qiime feature-table summarize \
  --i-table "$normalized_table" \
  --o-visualization table-FourStages-DAD2-F295+R205-filtered-table-sample-contingency-Filtered1-normal.qzv

## Build the representative-sequence file matching the filtered feature table
# Keeps only the sequences whose features are still present in the normalized,
# filtered table.
run='FourStages-DAD2-F295+R205'
run_dir="FourStages/${run}"
qiime feature-table filter-seqs \
   --i-table "${run_dir}/Filtered1/Filtered1-Data/Size-Normalization/table-${run}-filtered-table-sample-contingency-Filtered1-normal.qza" \
   --i-data "${run_dir}/${run}-Data/rep-seqs-${run}.qza" \
   --o-filtered-data "${run_dir}/Filtered1/rep-seqs-${run}-Filtered1/rep-seqs-${run}-Filtered1.qza"
   
## Apply FastTree to generate a phylogenetic tree from the masked alignment
# Runs the align-to-tree-mafft-fasttree pipeline on the filtered representative
# sequences; the thread count is left on "auto".
filtered1_dir='FourStages/FourStages-DAD2-F295+R205/Filtered1'
rep_seqs_name='rep-seqs-FourStages-DAD2-F295+R205-Filtered1'
time qiime phylogeny align-to-tree-mafft-fasttree \
  --p-n-threads auto \
  --i-sequences "${filtered1_dir}/${rep_seqs_name}/${rep_seqs_name}.qza" \
  --output-dir "${filtered1_dir}/mafft-fasttree-output-FourStages-DAD2-F295+R205-Filtered1"

### Alpha diversity
# Chao1 index, computed on the normalized feature table
filtered1_dir='FourStages/FourStages-DAD2-F295+R205/Filtered1'
time qiime diversity alpha \
  --p-metric chao1 \
  --i-table "${filtered1_dir}/Filtered1-Data/Size-Normalization/table-FourStages-DAD2-F295+R205-filtered-table-sample-contingency-Filtered1-normal.qza" \
  --output-dir "${filtered1_dir}/core-metrics-results/Chao1"
  
# Shannon diversity index, computed on the normalized feature table
filtered1_dir='FourStages/FourStages-DAD2-F295+R205/Filtered1'
time qiime diversity alpha \
  --p-metric shannon \
  --i-table "${filtered1_dir}/Filtered1-Data/Size-Normalization/table-FourStages-DAD2-F295+R205-filtered-table-sample-contingency-Filtered1-normal.qza" \
  --output-dir "${filtered1_dir}/core-metrics-results/shannon"
  
# Leave the conda environment now that the pipeline is finished
conda deactivate