######## Nerea 18S dataset - ASV generation, taxonomic assignment, swarm clustering  #########

## G Zampicinini 2022

### Main tasks:

# -Quality check
# -Primer trimming and read re-orientation; 
#   sequencing libraries where Illumina sequencing addpters are ligated, rather than added with a PCR step with tailed primers, result in fastq reads with mixed orientation.
#   This step is required to avoid complementary, but otherwise identical, read pairs to be resolved as 2 different ASVs in the denoising step.
# -Denoising (ASV generation)
# -Taxonomic assignment. 2 tables are provided: 
#  -one with top BLAST matches only (1 match/ASV)
#  -a full table with up to 20 matches/ASV.
#   This allows checking all matches for a given ASV if the top match is unsatisfactory 
#   (poor taxonomic definition, taxon known to be absent in the sample..)
# -Assignment table cleanup and top match selection
# -SWARM clustering: the method clusters ASVs based on the number of nucleotide differences; clustering results for ASVs differing at a single position are included in the swarm_d1 directory.
#   Mahé F, Czech L, Stamatakis A, Quince C, de Vargas C, Dunthorn M, Rognes T. (2021) Swarm v3: towards tera-scale amplicon clustering.  Bioinformatics ⟨https://doi.org/10.1093/bioinformatics/btab493⟩.

## Files in the current project:

# -AUGMENTED OBSERVATORY NEREA-18S DATA (2019-2020)	Table derived from Nerea_18S-V9_ASVs_filt90-GenSort.tsv by converting sample IDs using NEREA samples codes according to SampleID_table_conversion.tsv
# -SampleID_table_conversion.tsv		Table to convert 18S fastq sequences published in the ENA project PRJEB74658 into NEREA samples IDs
# -Nerea_18S-V9_ASVs-filt90.fasta		ASV sequences, length >=90 bp
# -Nerea_18S-V9_ASVs-blastout+taxa.tsv		Full BLAST assignment table with up to 20 hits for each ASV
# -Nerea_18S-V9_ASVs_filt90-GenSort.tsv		Top BLAST matches with taxonmies defined at least at genus level if present, top match otherwise - 1 hit for each ASV
# -QscoreDistSumm.log				Summary of vsearch --fastq_stats logs
# -DADA2-default2.R				DADA2 denoising R script
# -DADA2-default2.Rout				DADA2 denoising R script output 
# -Cutadapt_trim+unify.sh			bash script for primer trimming and read re-orientation 
# -ASV_Blast_xtr4.sh				bash script that combines the ASV abundance table from DADA2 with BLAST results and taxomnomic descriptions
# -swarm_d1/swarm_d1.log			swarm clustering run log
# -swarm_d1/swarm_d1.out			list of clusters, one cluster per line. A cluster is a list of amplicon headers separated by spaces.
# -swarm_d1/swarm_d1.stat			table with one cluster per row and seven columns of information; see swarm manual for details
# -swarm_d1/swarm_d1.str			cluster structure (five-columns tab-delimited format); see swarm manual for details
# -swarm_d1/swarm_d1.uc				tab-separated uclust-like format with 10 columns and 3 different type of entries (S, H or C). 
# (continued)					Each fasta sequence in the input file can be either a cluster centroid (S) or a hit (H) assigned to a cluster. 
# -18S_reads_DADA2_stats.csv			DADA2 statistics.

####### Procedures ######

# Quality check was performed with vsearch --fastq_stats command; quality was better than average, with ~95% of reads at least Q37 -a summary of the output logs is in the file QscoreDistSumm.log

# trim primers and unify mixed orientation reads; requires cutadapt and vsearch - see included script Cutadapt_trim+unify.sh
./Cutadapt_trim+unify.sh

# Denoising: standard DADA2 procedure adapted from that described in the web site https://benjjneb.github.io/dada2/tutorial.html - see included R script

### R code start ###

>source( "DADA2-default2.R" )

### R code end ###


# taxonomic assignment using PR2_4.14.3 reference DB; this reference is based on PR2_4.14 
# Guillou et al. 2013. The Protist Ribosomal Reference database (PR2): a catalog of unicellular eukaryote Small Sub-Unit rRNA sequences with curated taxonomy. Nucleic Acids Res. 41:D597–604
# The database was integrated with:
# -330 entries comprising Protist taxa from GoN contributed by SZN researchers, 
# -30 Fungi envirmental sequences from BioMarks (ISME J 6, 1823–1833 (2012). https://doi.org/10.1038/ismej.2012.36 - http://metagenomics.anl.gov/; data set number 4478907.3)

srun -c 32 blastn \
-db ~/BLAST-DB/pr2_4.14.3_SSU/pr2_4.14.3_SSU \
-query Nerea_18S-V9_ASVs.fasta \
-outfmt 6 \
-num_threads 32 \
-qcov_hsp_perc 80 \
-max_target_seqs 20 \
-out Nerea_18S-V9_ASVs-blastout.tsv

# Add taxonomies
./ASV_Blast_xtr4.sh Nerea-18S_ASVs.tsv Nerea_18S-V9_ASVs-blastout.tsv >Nerea_18S-V9_ASVs-blastout+taxa.tsv

# Filter out results of ASVs >90bp
vsearch --fastx_filter Nerea_18S-V9_ASVs.fasta --fastq_minlen 90 --quiet --fastaout_discarded - |grep ">" |sed 's/>//; s/$/\t/' |grep -v -f - Nerea_18S-V9_ASVs-GenSort.tsv >Nerea_18S-V9_ASVs_filt90-GenSort.tsv

# Filter ASVs >90bp
vsearch --fastx_filter Nerea_18S-V9_ASVs.fasta --fastq_minlen 90 --fastaout Nerea_18S-V9_ASVs-filt90.fasta

# Fix Parabodo nitrophilus in assignment table (caused by species name in the genus definition in PR2 .tax file - fixed in PR2_4.14.3 .tax file)
sed -Ei 's/Parabodo$/Parabodo\tParabodo_nitrophilus/' ../DADA2-default2/Nerea_18S-V9_ASVs-blastout+taxa.tsv

#### R code start ####

# full tax assignment matrix loaded in R
# > Nerea_18S_V9_raw <- read_tsv( "Nerea_18S-V9_ASVs-blastout+taxa.tsv", col_types="ciiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiicdiiiiiiiddcccccccc" )

# Select best match at Genus level when available
# > Nerea_18S_V9_GenSort <- Nerea_18S_V9_raw %>% Gen_Sort()

# write table with best matches at Genus level
# > write_tsv( Nerea_18S_V9_GenSort, "Nerea_18S-V9_ASVs-GenSort.tsv" )

#### R code end ####


# swarm clustering d=1
swarm -d 1 -f -c 12000 -t 8 -z -l swarm_d1.log -o swarm_d1.out -u swarm_d1.uc -i swarm_d1.str -s swarm_d1.stat -w Nerea_18S-V9_ASVs-filt90_swarm_d1-seeds.fasta Nerea_18S-V9_ASVs-filt90+ab.fasta

# swarm clustering d=2 
swarm -d 2 -t 8 -z -l swarm_d2.log -o swarm_d2.out -u swarm_d2.uc -i swarm_d2.str -s swarm_d2.stat -w Nerea_18S-V9_ASVs-filt90_swarm_d2-seeds.fasta Nerea_18S-V9_ASVs-filt90+ab.fasta

#### PR2_5.1 assignment - 03/2024 ####

# Following the release of PR2 v5, the database was integrated with the additional sequences previuosly used to integrate PR2 v4.14;
# PR2 V5 already contains a number of the sequences formerly added to integrate PR2 v4.14; 
# more information is available in the PR2_5.1 README and list of additional sequences

# BLAST assignment (falkor HPC)
srun -c 32 blastn \
-db ~/BLAST-DB/PR2_5.1_SSU/PR2_5.1_SSU \
-query Nerea_18S-V9_ASVs-90.fasta \
-outfmt 6 \
-num_threads 32 \
-perc_identity 90 \
-qcov_hsp_perc 80 \
-max_target_seqs 20 \
-out Nerea_18S-V9_ASVs-90-blastout.tsv

# Add taxonomies
time ./Add_Taxo_PR2_5.sh ../Nerea-18S_ASVs.tsv Nerea_18S-V9_ASVs-90-blastout.tsv >Nerea_18S-V9_ASVs-90-blastout+taxa.tsv                                                                                                           

#real    0m15,723s
#user    0m15,216s
#sys     0m2,584s

## Add_Taxo_PR2_5.sh script
#################################################################################################################################
##!/bin/bash
#
## Merges ASV abundance tables with BLAST assignment tables, adding taxonmies to assigned SeqID from the PR2 .tax file.
## V2.1 - core routines rewritten in gawk using associative arrays, resulting in ~100X speed improvement over grep based approach
## handles multiple BLAST hits per ASV/OTU, allowing to use database redundancy in following top hit selection
## unneeded extra columns introduced  by gawk now indexed based on n. of samples and removed from final output
## PR2_5+  version - 9 level taxonomies
## a rough thing that works - G. Zampicinini  06/2023
#
## Check where we are..
##if [  -e /media/ntfsdrive/ ]
##  then Prefix="/media/z"
##  else Prefix="/media/jpz"
##fi
#
#ASVTab=$1
#BlastTab=$2
#
## copy .tax file to ram for faster access?
##if [ ! -e /dev/shm/*.tax ]
##  then
##  TaxFile=$(zenity  --title="Select reference taxonomy" --file-selection  --filename="/media/z/_home/jpz/Documents/BLAST-DB/" --file-filter="*.tax.gz")
##  OutFile=$(basename -s ".gz" "$TaxFile")
##  zcat "$TaxFile" >"/dev/shm/$OutFile"
##  RefTax="/dev/shm/$OutFile"
##  else
##  RefTax=$(ls /dev/shm/*.tax)
##fi
#
#RefTax=$(ls /dev/shm/*.tax)
#
#THeads=$(sed -n '1p' "$ASVTab") # Store ASV table column names
#
## Additional ASV and SeqID cols temporary needed for indexing gawk array have to be rermoved from final table
#CutFieldN=$(echo "$THeads" |gawk -F "\t" '{print NF}') #Store column n.  BEFORE additional ASV names column
#let CutFieldN2="$CutFieldN"+2 #Skip 1 column (AFTER additional ASV names column)
#let CutFieldN3="$CutFieldN"+12 #Store column n.  BEFORE additional SeqIDs column
#let CutFieldN4="$CutFieldN"+14 #Skip 1 column (AFTER additional SeqIDs column)
#
#
#
##echo "$CutFieldN"
##echo "$CutFieldN2"
#
##echo "Taxfile=$TaxFile"
##echo "OutFile=$OutFile"
##echo "RefTax=$RefTax"
##echo "$THeads"
#
## ASVs=( $(sed '1d' "$ASVTab" |gawk '{print $1}') ) # ASV names array - unused for now
#
#tmpfile=`mktemp`
#echo "" >"$tmpfile"
#
##gawk 'FNR==NR {IDs[$1]=$0} {ID=$2; if (ID in IDs && ARGIND ==2) {print $0, IDs[ID]}}' "$RefTax" "$BlastTab" |tr -s " " "\t" |cut -f -26,28- >>"$tmpfile"
#gawk 'FNR==NR {IDs[$1]=$0} {ID=$2; if (ID in IDs && ARGIND ==2) {print $0, IDs[ID]}}' "$RefTax" "$BlastTab" |tr -s " " "\t" >>"$tmpfile"
#
#sed '1d' "$ASVTab" >/tmp/ASVTabNH
#ASVTabNH="/tmp/ASVTabNH"
#
#gawk 'NR==FNR {IDs[$1]=$0} {if (ARGIND ==2) print IDs[$1], $0}' "$ASVTabNH" "$tmpfile"  |tr -s " " "\t" |cut -f -"$CutFieldN","$CutFieldN2"-"$CutFieldN3","$CutFieldN4"-  \
#|gawk 'NR==FNR {IDs[$1]=$0} {if (ARGIND ==2 && !($1 in IDs)) print $0"\tUnassigned\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA"; else if (ARGIND ==1) print $0}' - "$ASVTabNH" 
#\
#|sed '1d' \
#|tr -s " " "\t" \
#|sort -k "$CutFieldN"nr \
#|tr ";" "\t" |sed -E 's/\s+$//; s/\t+$//' \
#|sed "1i$THeads\tSubj_ID\tMatch\tAl_len\tMismatches\tGaps\tQ_start\tQ_end\tS_start\tS_end\tEvalue\tBitScore\tDomain\tSupergroup\tDivision\tSubdivision\tClass\tOrder\tFamily\tGenus\tSpecies"
################################################################################################################################


#### R code start ####

# library( tidyverse (

# full tax assignment matrix loaded in R
# >Nerea_18S_V9_raw <- read_tsv( "Nerea_18S-V9_ASVs-90-blastout+taxa.tsv", col_types="ciiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiicdiiiiiiiddccccccccc" )

# Select best match at Genus level when available
# > Nerea_18S_V9_GenSort <- Nerea_18S_V9_raw %>% Gen_Sort()

## GenSort function
################################################################################################################################
#> Gen_Sort                                                                                                                                                                                              [0/205]
#function( RT )
#    group_by( RT, ASV ) %>%
#    arrange( grepl( "X_sp[.]$", Species ), desc( BitScore ), by_group = TRUE ) %>%
#    add_tally( n_distinct( Species, na.rm = TRUE ), name = "Sp_ab" ) %>%
#    # N_RefSeqs not used for ranking alignments, kept just for info
#    add_tally( n_distinct( Subj_ID, na.rm = TRUE ), name = "N_RefSeqs" ) %>%
#    add_tally(  wt = n_distinct( keep( Subj_ID, Match >=97 ) ),  name = "N_RefSeqs97" ) %>% 
#    slice_head() %>%
#    arrange( desc( Total ) )
################################################################################################################################

# write table with best matches at Genus level
# > write_tsv( Nerea_18S_V9_GenSort, "Nerea_18S-V9_ASVs-90-GenSort.tsv" )

#### R code end ####
