#################################################################################################################################### #################################################################################################################################### ################################### ################################### ################################### Antarctic fur seal skin microbiome OTU processing pipeline ################################### ################################### ################################### #################################################################################################################################### #################################################################################################################################### Supplementary Material "Fur seal microbiota are shaped by the social and physical environment, show mother-offspring similarities and are associated with host genetic quality" OTU processing pipeline scripts Stefanie Grosser, Jan Sauer, Anneke J. Paijmans, Barbara A. Caspers, Jaume Forcada, Jochen B. W. Wolf, Joseph I. Hoffman ############################################################################################################# ###### Analysis of 16S skin microbiome samples from Antarctic fur seals using USEARCH/UPARSE pipeline ###### ###### 48 Mother-offspring samples (96 total) ###### ###### 300 bp forward and reverse reads from Illumina MiSeq run ###### ###### 3.0 Gb of rawdata ###### ###### Stefanie Grosser, July 2017, LMU Munich ###### ###### Using USEARCH version 9.2.64 (latest version on UPPMAX) ###### ############################################################################################################# ## Rawdata location rawdata_path=/PATH/TO/MicrobiomeAnalysis/rawdata/ ################################################################################## ## 1. QC files with fastx_info ## (QC was already done with FastQC v0.7.2, so this part is optional) ################################################################################## module load bioinfo-tools module load usearch/9.2.64 ## Define input und output paths and files indir=/PATH/TO/MicrobiomeAnalysis/rawdata/ outdir=/PATH/TO/MicrobiomeAnalysis/fastq_info/ ## Run fastx_info for all files in the input directory for fq in ${indir}*.fastq do usearch -fastx_info $fq -output ${outdir}$fq done ## Inspect results as described in the UPARSE manual ################################################################################## ## 2. Merge forward and reverse reads ## Forward and reverse reads for each sample are merged and merged sequences from ## all samples are pooled into a single file for subsequent analyses. ## QC indicates that read quality is not very good, hence merging is done with ## relaxed parameters ## Shell script name: uparse_readMerging_AFSmicrobiome.sh ## The analysis is run on one 8GB RAM core ################################################################################## #!/bin/bash -l #SBATCH -t 1-00:00:00 #SBATCH -p core -n 1 module load bioinfo-tools module load usearch/9.2.64 ## Define input und output paths and files indir=/PATH/TO/MicrobiomeAnalysis/rawdata/ tmpdir=/scratch/$SLURM_JOB_ID/ outdir=/PATH/TO/MicrobiomeAnalysis/uparseAnalysis/ ## Files need to be uncompressed for input into usearch. ## To avoid saving uncompressed files in a data directory (takes up too much space), ## save them to the scratch folder (temporary directory only accessible in the current session). for fqs in ${indir}*.fastq.gz do filename=$(basename "$fqs" ".fastq.gz") gunzip -c $fqs > ${tmpdir}${filename}.fastq echo ${tmpdir}${filename}.fastq done ## Check if all files have been decompressed. Compare number of rawdata files with uncompressed files. ## Exit script if numbers are not equal. compressedFiles=$(ls ${indir}*.fastq.gz | wc -l) #192 uncompressedFiles=$(ls ${tmpdir}*.fastq | wc -l) if [ $uncompressedFile != $compressedFiles ] ; then exit 1 else echo "All files have been uncompressed." fi ## Run usearch read merging (parameters explained below) usearch -fastq_mergepairs ${tmpdir}*R1*.fastq -fastqout ${outdir}AFS_merged_all.fq -relabel @ -report ${outdir}AFS_mergingReport.txt -tabbedout ${outdir}AFS_mergingTable.txt -fastq_nostagger -fastq_trunctail 5 -fastq_maxdiffpct 25 -fastq_maxdiffs 30 -fastq_minmergelen 380 -fastq_maxmergelen 520 ## Paramters explained #-relabel @ ## adds sample identifiers to each read label (useful when pooling samples) #-fastq_nostagger ## will remove pairs for which the entire biological sequence (but not the adapter overhangs) aligns. For 300 bp paired-end some unaligned biological sequence is expected (only ~130-140bp overlap) #-fastq_trunctail 5 ## truncate 3' ends of each read at first Q score with <= this value (default 2) #-fastq_maxdiffpct 25 ## maximum percent difference of aligned regions (default 10) #-fastq_maxdiffs 30 ## maximum number of mismatches in alignment (default 5) #-fastq_minmergelen 380 ## minimum length of merged sequence; filters artefacts (sequences are expected to be around 460 bp long) #-fastq_maxmergelen 520 ## maximum length of merged sequence; filters artefacts ## Other options for troubleshooting #-tabbedout ## outputs a table with merging results for each read #-alnout ## outputs human readable alignment ## Extract merging information for each sample echo $(less AFS_mergingReport.txt | grep "Relabel" | awk '{print $3}') > sampleIDs.txt echo $(less AFS_mergingReport.txt | grep "pairs merged" | awk '{print $1 "\t" $3 "\t" $6}') > mergedInfo.txt ################################################################################## ## 3. Primer trimming ## Can be done in usearch using the -fastx_truncate command with -stripleft 17 and ## -stripright 21 options but this might leave some artefacts if primers are ## not exactly positioned at the read ends. ## Use cutadapt instead. Trim forward and reverse primers in separate steps. ## Shell script name: cutadapt_trimPrimers_AFSmicrobiome.sh ## The analysis is run on one 8GB RAM core ################################################################################## #!/bin/bash -l #SBATCH -t 10:00:00 #SBATCH -p core -n 1 module load bioinfo-tools module load cutadapt/1.9.1 module load Fastx/0.0.14 ## Define input und output paths and files tmpdir=/scratch/$SLURM_JOB_ID/ indir=/PATH/TO/MicrobiomeAnalysis/uparseAnalysis/ outdir=/PATH/TO/MicrobiomeAnalysis/uparseAnalysis/ seqs=AFS_merged_all.fq seq_name=$(basename "$seqs" ".fq") ## First, trim anchored 5' forward primer. FPf (forward primer forward) will be found in normal reads, ## RPf (reverse primer forward) will be found in reverse complemented forward reads. ## Untrimmed sequences will be saved to a file in each step, to check them afterwards. ## The first output file will be saved to the scratch directory as it is not needed later. ## The {name} option in the output file will produce separate files for each primer defined with FPf= and RPf=. cutadapt -g FPf=^CCTACGGGNGGCWGCAG -g RPf=^GACTACHVGGGTATCTAATCC -e 0.2 --overlap 10 --untrimmed-output ${tmpdir}${seq_name}_untrimmed_F.fastq -o ${tmpdir}${seq_name}_{name}.fastq ${indir}${seqs} ## reverse complement the detected rev comp reads with Fastx inseq=${tmpdir}${seq_name}_RPf.fastq seq_name=$(basename "$inseq" "_RPf.fastq") fastx_reverse_complement -v -i ${inseq} -o ${tmpdir}${seq_name}_RPf_rc.fastq ## Concatenate the files from the forward reads and the reverse complemented reads for step two of the trimming. ## If-statements check if both files are present or only one (if no rev comp reads were detected). if [ -f "${tmpdir}${seq_name}"_FPf.fastq ] && [ -f "${tmpdir}${seq_name}"_RPf_rc.fastq ] then cat "${tmpdir}${seq_name}"_FPf.fastq "${tmpdir}${seq_name}"_RPf_rc.fastq > "${tmpdir}${seq_name}"_forwardTrim1.fastq elif [ -f "${tmpdir}${seq_name}"_FPf.fastq ] && [ ! -f "${tmpdir}${seq_name}"_RPf_rc.fastq ] then cat "${tmpdir}${seq_name}"_FPf.fastq > "${tmpdir}${seq_name}"_forwardTrim2.fastq fi seq=${tmpdir}${seq_name}_forwardTrim*.fastq echo ${seq} seq_name=$(basename "$seq" "_forwardTrim*.fastq") ## Second, trim anchored 3' reverse primer. FPf will be found in rev comp reads from step above, RPrc (reverce primer reverse complement) ## will be found in all FPf reads from step above. ## Untrimmed sequences will be saved to a file in each step, to check them afterwards. ## This time only one output file for both primers is created (no need for rev comp in this step). cutadapt -g FPf=^CCTACGGGNGGCWGCAG -a RPrc=GGATTAGATACCCBDGTAGTC$ -e 0.2 --overlap 10 --untrimmed-output ${tmpdir}${seq_name}_untrimmed_R.fastq -o ${tmpdir}${seq_name}_primersTrimmed.fastq ${seq} # Copy the final files from scratch directory to the output directory . cp ${tmpdir}${seq_name}_primersTrimmed.fastq ${outdir} cp ${tmpdir}${seq_name}_untrimmed_F.fastq ${outdir} cp ${tmpdir}${seq_name}_untrimmed_R.fastq ${outdir} ################################################################################## ## 4. Filter low quality sequences ## Expected error filtering with expected error threshold recommended in UPARSE ## pipeline ee=1.0. A higher threshold (i.e. 2) can be used if too many reads get ## discarded, however, this may increase the number of wrong OTUs. Might be ok when ## using singleton filtering. ## Shell script name: uparse_qualityFiltering_AFSmicrobiome.sh ## The analysis is run on one 8GB RAM core ################################################################################## #!/bin/bash -l #SBATCH -t 5:00:00 #SBATCH -p core -n 1 module load bioinfo-tools module load usearch/9.2.64 ## Define input und output paths and files indir=/PATH/TO/MicrobiomeAnalysis/uparseAnalysis/ outdir=/PATH/TO/MicrobiomeAnalysis/uparseAnalysis/ inseqs=AFS_merged_all_primersTrimmed.fastq ## Run usearch filtering step usearch -fastq_filter ${indir}${inseqs} -fastq_maxee 1.0 -fastqout ${outdir}AFS_merged_all_qualFiltered.fastq ################################################################################## ## 5. Dereplication to retain only unique sequences ## Only exact matches over the full length of two sequences will be considered. ## Shell script name: uparse_dereplicateSeqs_AFSmicrobiome.sh ## The analysis is run on one 8GB RAM core ################################################################################## #!/bin/bash -l #SBATCH -t 5:00:00 #SBATCH -p core -n 1 module load bioinfo-tools module load usearch/9.2.64 ## Define input und output paths and files indir=/PATH/TO/MicrobiomeAnalysis/uparseAnalysis/ outdir=/PATH/TO/MicrobiomeAnalysis/uparseAnalysis/ inseqs=AFS_merged_all_qualFiltered.fastq ## Run uparse dereplication step usearch -fastx_uniques ${indir}${inseqs} -fastqout ${outdir}AFS_merged_all_uniques.fastq -sizeout -strand both -relabel Uniq #-sizeout ## size annotation will be added to the sequence header (must be specified for OTU clustering) #-strand both ## will match reverse complement sequences #-relabel ## relabel the unique sequences (default is name of the representative sequence) ################################################################################## ## 6. Generate 97% OTUs ## OTU: all pairs of OTUs are >97% identical to each other. ## Chimeras are filtered with the commands, thus no extra chimera filtering steps ## are necessary. Size labels must pe present in the sequence headers. Singletons ## are also discarded when using the default settings (-minsize 2) ## Shell script name: uparse_clusterOTUs_AFSmicrobiome.sh ## The analysis is run on one 8GB RAM core ################################################################################## #!/bin/bash -l #SBATCH -t 10:00:00 #SBATCH -p core -n 1 module load bioinfo-tools module load usearch/9.2.64 ## Define input und output paths and files indir=/PATH/TO/MicrobiomeAnalysis/uparseAnalysis/ outdir=/PATH/TO/MicrobiomeAnalysis/uparseAnalysis/ inseqs=AFS_merged_all_uniques.fastq otus=AFS_merged_all_otus.fasta ## Command for clustering 97% OTUs usearch -cluster_otus ${indir}${inseqs} -otus ${outdir}${otus} -uparseout ${outdir}AFS_merged_all_otus_table.txt -relabel Otu -minsize 2 #-relabel ## relabel OTUs (OTU identifiers in the labels are required for making an OTU table) #-uparseout ## file documenting how the input sequences were classified (Otu, match, or chimera) ################################################################################## ## 7.Classify OTUs ## Align sequences against a database to classify them and check for unspecific ## targets. Use only well annotated and currated databases, not the full databases ## as these are mostly based on predictions, which adds more uncertainty and error ## to the classification. ## Shell script name: uparse_classifyOTUs_AFSmicrobiome.sh ## The analysis is run on one 8GB RAM core ################################################################################## #!/bin/bash -l #SBATCH -t 10:00:00 #SBATCH -p core -n 1 module load bioinfo-tools module load usearch/9.2.64 ## Define input und output paths and files indir=/PATH/TO/MicrobiomeAnalysis/uparseAnalysis/ outdir=/PATH/TO/MicrobiomeAnalysis/uparseAnalysis/ dbdir=/PATH/TO/MicrobiomeAnalysis/Databases/ otus=AFS_merged_all_otus.fasta rdp=rdp_16s_v16_sp.fa otusrdp=AFS_merged_all_otus_RDPclassified.sintax ## Create a udb database format (indexed) from fasta which allows for faster loading usearch -makeudb_sintax ${dbdir}${rdp} -output ${dbdir}rdp_16s_v16_sp.udb ## Database search. In the tabbedout file the first three fields are (1) query sequence label, (2) prediction with boostrap values, ## and (3) strand. If the -sintax_cutoff option is given then predictions are written a second time after applying the confidence ## threshold, keeping only ranks with high enough confidence. usearch -sintax ${indir}${otus} -db ${dbdir}rdp_16s_v16_sp.udb -tabbedout ${outdir}${otusrdp} -strand both -sintax_cutoff 0.8 ## AFS_merged_all_otus_RDPclassified.sintax --> AFSmicrobiome_SI_otuRDPclassification_Rinput_DatasetS7.sintax (Additional Dataset S7) ## Look for OTUs with lower than 1.0 confidence at domain level in the sintax file to check for non 16S ## Copy OTU ids to a new file. Commands run from the terminal. cd /PATH/TO/MicrobiomeAnalysis/uparseAnalysis/ less AFS_merged_all_otus_RDPclassified.sintax | grep -v "d:Bacteria(1.0000)" | awk '{print $1}' > lowConfidenceDomainOTUlist.txt ## Look for OTUs with lower than 1.0 confidence at phylum level in the sintax file to check for non 16S ## Copy OTU ids to a new file. less AFS_merged_all_otus_RDPclassified.sintax | grep -Ev "p:.*?(1.0000)" | awk '{print $1}' > lowConfidencePhylumOTUlist.txt ## Copy OTU sequences to a new file. while read p; do less AFS_merged_all_otus.fasta | grep -A6 ">$p$" >> lowConfidenceDomainOTUs.fasta done $p$" >> lowConfidencePhylumOTUs.fasta done < lowConfidencePhylumOTUlist.txt ## Blast OTU sequences against NCBI nt and exclude all non-bacterial sequences from OTU list. ## Using blast+ tools with locally stored database. ## Shell script name: blast_blastLowConfidenceOTUs_AFSmicrobiome.sh ## The analysis is run on four 8GB RAM cores #!/bin/bash -l #SBATCH -t 1-00:00:00 #SBATCH -p core -n 4 module load bioinfo-tools module load blast/2.6.0+ ## Define input und output paths and files outdir=/PATH/TO/MicrobiomeAnalysis/uparseAnalysis/ indir=/PATH/TO/MicrobiomeAnalysis/uparseAnalysis/ dbdir=/PATH/TO/blast_databases/ otus=lowConfidencePhylumOTUs.fasta otu_name=$(basename "$otus" ".fasta") ## Run blast search blastn -db ${dbdir}nt -evalue 1e-20 -num_threads 4 -max_target_seqs 1 -dust yes -outfmt '6 qseqid sseqid evalue bitscore qstart qend sstart send length pident gapopen sgi sacc staxids sscinames stitle' -query ${indir}${otus} -out ${outdir}${otu_name}_Blast_nt_fmt6.txt ## Look for OTUs with Chloroplast classification in the sintax file. ## Copy OTU ids to a new file. Commands run from the terminal. cd /PATH/TO/MicrobiomeAnalysis/uparseAnalysis/ less AFS_merged_all_otus_RDPclassified.sintax | grep "c:Chloroplast" > ChloroplastOTUlist.txt ## Make a list of OTUs to be removed and use Usearch -fastx_getseqs command with -notmatched option to filter out ## sequences on the list. For matching the labels a newline character has to follow immediately after the label. ## Shell script name: uparse_filterNon16Sotus_AFSmicrobiome.sh ## The analysis is run on one 8GB RAM cores #!/bin/bash -l #SBATCH -t 1-00:00:00 #SBATCH -p core -n 1 module load bioinfo-tools module load usearch/9.2.64 ## Define input und output paths and files outdir=/PATH/TO/MicrobiomeAnalysis/uparseAnalysis/ indir=/PATH/TO/MicrobiomeAnalysis/uparseAnalysis/ oturemove=OTUsToBeRemovedList.txt otunotkeep=removedOTUs.fasta otukeep=AFS_merged_all_otus_filteredFinal.fasta usearch -fastx_getseqs ${indir}${otus} -labels ${indir}${oturemove} -notmatched ${outdir}${otukeep} -fastaout ${outdir}${otunotkeep} ################################################################################## ## 8. Generate OTU table ## Sequences are mapped to the final set of OTUs with 97% identity. Mapped are ## sequences from step after merging and trimming, and before quality filtering. ## Shell script name: uparse_generateOTUtable_AFSmicrobiome.sh ## The analysis is run on one 8GB RAM cores ################################################################################## #!/bin/bash -l #SBATCH -t 1-00:00:00 #SBATCH -p core -n 1 module load bioinfo-tools module load usearch/9.2.64 ## Define input und output paths and files outdir=/PATH/TO/MicrobiomeAnalysis/uparseAnalysis/ indir=/PATH/TO/MicrobiomeAnalysis/uparseAnalysis/ inseqs=AFS_merged_all_primersTrimmed.fastq otus=AFS_merged_all_otus_filteredFinal.fasta otutable=AFS_merged_all_OTUtable_final.txt otutabstats=AFS_merged_all_OTUtable_statsReport_final.txt usearch -usearch_global ${indir}${inseqs} -db ${indir}${otus} -strand plus -id 0.97 -otutabout ${outdir}${otutable} #-strand plus ## required for nucleotide sequences #-id 0.97 ## identity threshold #-otutabout ## QIIME tabbed format (other formats are available) ## The -otutab_stats command generates statistics for the OTU table (e.g. reads per sample) usearch -otutab_stats ${indir}${otutable} -output ${outdir}${otutabstats} ################################################################################## ## 9. Trimming OTU table ## Remove the individuals with very few reads. Use statistics output from ## -otutab_stats command to find a cutoff. Only two individuals have less than ## 10000 reads mapped to OTUs. These are removed. Trim OTU table so that low ## frequency OTUs are removed (likely sequencing errors, cross talk etc.). ## Shell script name: uparse_trimOTUtable_AFSmicrobiome.sh ## The analysis is run on one 8GB RAM cores ################################################################################## #!/bin/bash -l #SBATCH -t 1-00:00:00 #SBATCH -p core -n 1 module load bioinfo-tools module load usearch/9.2.64 ## Define input und output paths and files outdir=/PATH/TO/MicrobiomeAnalysis/uparseAnalysis/ indir=/PATH/TO/MicrobiomeAnalysis/uparseAnalysis/ otutrimtable=AFS_merged_all_OTUtable_final_trimmed.txt usearch -otutab_trim ${indir}${otutable} -min_sample_size 10000 -min_otu_freq 0.00005 -output ${outdir}${otutrimtable} #-min_sample_size 10000 ## trimming OTUs that have less than 10000 reads mapped to them. #-min_otu_freq 0.00005 ## trimming OTUs that have a frequency lower than 0.005% as suggested in Bokulich et al. 2015. ## Additionally create the OTU table only trimming of OTUs with frequency lower than 0.005% and keeping the low read individuals for some initial overall stats ## The same OTUs are retained as in the trimming procedures above otutrimtable_freqOnly=AFS_merged_all_OTUtable_final_trimmed_allSamples.txt usearch -otutab_trim ${indir}${otutable} -min_otu_freq 0.00005 -output ${outdir}${otutrimtable_freqOnly} ## Generate a new statistics file otutabtrimstats=AFS_merged_all_OTUtable_statsReport_final_trimmed.txt usearch -otutab_stats ${indir}${otutrimtable} -output ${outdir}${otutabtrimstats} ## Optional: ## Normalise OTU table to account for different sequencing depth for all samples. ## The adjusted count is calculated as new = (old * sample_size) / total where total is the total number of reads in the sample. otutabtrimnorm=AFS_merged_all_OTUtable_final_trimmed_normalised10000.txt usearch -otutab_norm ${indir}${otutrimtable} -sample_size 10000 -output ${outdir}${otutabtrimnorm} ## AFS_merged_all_OTUtable_final_trimmed_allSamples.txt --> AFSmicrobiome_SI_OTUtable_final_trimmed_allSamples_Rinput_DatasetS9.txt (Additional Dataset S9) ################################################################################## ## 10. Calculate diversity indeces ## Alpha diversity indeces can be calculated in UPARSE. Calculate Jost index ## of order q= 0, 1, and 2 (the effective number of species based on Hill numbers). ## Higher values of q give lower weights to low-abundance species. q=0 equals ## richness, q=1 equals Shannon entropy, and q=2 equals simpson index. ## See Chao et al. 2010. ## Shell script name: uparse_alphaDiversity_AFSmicrobiome.sh ## The analysis is run on one 8GB RAM cores ################################################################################## #!/bin/bash -l #SBATCH -t 1-00:00:00 #SBATCH -p core -n 1 module load bioinfo-tools module load usearch/9.2.64 ## Define input und output paths and files indir=/PATH/TO/MicrobiomeAnalysis/uparseAnalysis/ outdir=/PATH/TO/MicrobiomeAnalysis/uparseAnalysis/ ## Normalised trimmed tables with 94 individuals otutabtrimnorm=AFS_merged_all_OTUtable_final_trimmed_normalised10000.txt ## Non-normalised table with all samples otutrimtable_freqOnly=AFS_merged_all_OTUtable_final_trimmed_allSamples.txt OTUjost0norm=AFS_merged_all_OTUtable_alphaDivJost0_normalised.txt OTUjost1norm=AFS_merged_all_OTUtable_alphaDivJost1_normalised.txt OTUjost2norm=AFS_merged_all_OTUtable_alphaDivJost2_normalised.txt OTUjost0all=AFS_merged_all_OTUtable_alphaDivJost0_allSamples.txt OTUjost1all=AFS_merged_all_OTUtable_alphaDivJost1_allSamples.txt OTUjost2all=AFS_merged_all_OTUtable_alphaDivJost2_allSamples.txt ## Calculate diversity indeces usearch -alpha_div ${indir}${otutabtrimnorm} -output ${outdir}${OTUjost0norm} -metrics jost -jostq 0 usearch -alpha_div ${indir}${otutabtrimnorm} -output ${outdir}${OTUjost1norm} -metrics jost -jostq 1 usearch -alpha_div ${indir}${otutabtrimnorm} -output ${outdir}${OTUjost2norm} -metrics jost -jostq 2 usearch -alpha_div ${indir}${otutrimtable_freqOnly} -output ${outdir}${OTUjost0all} -metrics jost -jostq 0 usearch -alpha_div ${indir}${otutrimtable_freqOnly} -output ${outdir}${OTUjost1all} -metrics jost -jostq 1 usearch -alpha_div ${indir}${otutrimtable_freqOnly} -output ${outdir}${OTUjost2all} -metrics jost -jostq 2 ## Optional: ## Try calculating alpha diversity with rarefied OTU table from qiime single_rarefaction.py script. ## Shell script name: qiime_rarefy_alphaDiversity_AFSmicrobiome.sh ## The analysis is run on one 8GB RAM cores #!/bin/bash -l #SBATCH -t 1-00:00:00 #SBATCH -p core -n 1 module load bioinfo-tools module load Qiime/1.9.1 module load usearch/9.2.64 ## Define input und output paths and files indir=/PATH/TO/MicrobiomeAnalysis/uparseAnalysis/ outdir=/PATH/TO/MicrobiomeAnalysis/uparseAnalysis/ otutrimtable_freqOnly=AFS_merged_all_OTUtable_final_trimmed_allSamples.txt # Calculate rarefied table. Input classic format from usearch, output biom format. single_rarefaction.py -i ${indir}${otutrimtable_freqOnly} -o ${outdir}AFS_merged_all_OTUtable_final_trimmed_raref10000.biom -d 10000 # Convert biom format table back to classic format for usearch. biom convert -i ${indir}AFS_merged_all_OTUtable_final_trimmed_raref10000.biom -o ${outdir}AFS_merged_all_OTUtable_final_trimmed_raref10000.txt --to-tsv # Run alpha diversity calculation but manually remove header line from file first. usearch -alpha_div ${indir}AFS_merged_all_OTUtable_final_trimmed_raref10000.txt -output ${outdir}OTUjost1_raref.txt -metrics jost -jostq 1 ## AFS_merged_all_OTUtable_final_trimmed_raref10000.txt --> AFSmicrobiome_SI_OTUtable_final_trimmed_raref10000_Rinput_DatasetS10.txt (Additional Dataset S10) ## --> AFSmicrobiome_SI_alphaDiversity_Rinput_DatasetS12.txt (Additional Dataset S12) ################################################################################## ## 10a. Calculate alpha diversity (jost1) from multiple rarefied tables to be used ## to estimate uncertainty for the heterozygosity alpha correlations. ## Shell script name: qiime_multiRarefy_alphaDiversity_AFSmicrobiome.sh ## The analysis is run on one 8GB RAM cores ################################################################################## #!/bin/bash -l #SBATCH -t 1-00:00:00 #SBATCH -p core -n 1 module load bioinfo-tools module load Qiime/1.9.1 module load usearch/9.2.64 ## Define input und output paths indir=/PATH/TO/MicrobiomeAnalysis/uparseAnalysis/ outdir=/PATH/TO/MicrobiomeAnalysis/uparseAnalysis/ ## Original trimmed OTU table otutrimtable_freqOnly=AFS_merged_all_OTUtable_final_trimmed_allSamples.txt ## Calculate rarefied tables. Input classic format from usearch, output biom format. Rarefies table to 10000 reads per sample. Outputs 100 rarefied tables. multiple_rarefactions_even_depth.py -i ${indir}${otutrimtable_freqOnly} -o ${outdir} -d 10000 -n 100 ## Convert biom format tables back to classic format for usearch. for i in ${outdir}AFS_merged_all_OTUtable_final_trimmed_raref10000_*.biom ; do name=$(basename "$i" ".biom") biom convert -i ${i} -o ${outdir}${name}.txt --to-tsv done ## Run alpha diversity calculation for each file (remove Qiime header line from file). for i in ${outdir}AFS_merged_all_OTUtable_final_trimmed_raref10000_*.txt ; do number=$(echo $i | cut -d "." -f 1 | cut -d "_" -f 8) # get the file number sed -i 1d $i # removes Qiime header line sed -i 's/^#//' $i # removes the "#" character from the table header sed -i 's/\.0//g' $i # remove all ".0" after the OTU number (Qiime outputs 1 decimal point) usearch -alpha_div ${i} -output ${outdir}OTUjost1_raref_${number}.txt -metrics jost -jostq 1 done ## Clean up the alpha diversity output table. for i in ${outdir}OTUjost1_raref*; do number=$(echo $i | cut -d "." -f 1 | cut -d "_" -f 3) # get the file number sed -i 1,2d $i # remove the first two usearch header lines of the file sed -i "s/jost/jost_${number}/" $i # change "jost" to "jost_{number of the file}" done ## Combine diversity alpha estimates from all files into a single file paste ${outdir}OTUjost1_raref_* | awk '{ for (i=3;i<=NF;i+=2) $i="" } 1' > ${outdir}All_OTUjost1_MultiRaref100.txt ## All_OTUjost1_MultiRaref100.txt --> AFSmicrobiome_SI_alphaDiversityMultiRaref_Rinput_DatasetSX.txt (Additional Dataset SX) ################################################################################## ## 10b. Calculate alpha diversity (jost1) based only on OTUs belonging to the four ## dominant phyla (Proteobacteria, Bacteroidetes, Firmicutes, Actinobacteria = 613 OTUs) ## The document MainPhylaOtuList.txt was created in R. ## Shell script name: uparse_alphaDiversity_mainPhyla_AFSmicrobiome.sh ## The analysis is run on one 8GB RAM cores ################################################################################## #!/bin/bash -l #SBATCH -t 1-00:00:00 #SBATCH -p core -n 1 module load bioinfo-tools module load usearch/9.2.64 ## Define input und output paths indir=/PATH/TO/MicrobiomeAnalysis/uparseAnalysis/FinalFiles/ outdir=/PATH/TO/MicrobiomeAnalysis/uparseAnalysis/FinalFiles/DiversityCalulations/DiversityMainPhylaAnalysis/ ## Original trimmed OTU table otutrimtable_freqOnly=AFS_merged_all_OTUtable_final_trimmed_allSamples.txt ## Lists containing the core microbiome OTUs. mainphyla=MainPhylaOtuList.txt ## Define names of the new output OTU tables without the core microbiome mainphyla_otu=AFS_merged_all_OTUtable_final_trimmed_allSamples_mainPhyla.txt ## Remove the core microbiome OTUs on the list from the trimmed OTU table head -n 1 ${indir}${otutrimtable_freqOnly} > ${outdir}${mainphyla_otu} grep -wf ${outdir}${mainphyla} ${indir}${otutrimtable_freqOnly} >> ${outdir}${mainphyla_otu} ## Define names of the diversity calculation output files OTUjost1all_mainphyla=AFS_merged_all_OTUtable_alphaDivJost1_allSamples_mainphyla.txt ## Calculate diversity index usearch -alpha_div ${outdir}${mainphyla_otu} -output ${outdir}${OTUjost1all_mainphyla} -metrics jost -jostq 1 ## AFS_merged_all_OTUtable_alphaDivJost1_allSamples_mainphyla.txt --> AFSmicrobiome_SI_alphaDiversity_MainPhyla_Rinput_DatasetS13.txt (Additional Dataset S13) ################################################################################## ## 11. Curate fasta and sintax files to retain only the final OTUs from the trimmed ## table. ## Shell script name: uparse_FileCuration_AFSmicrobiome.sh ## The analysis is run on one 8GB RAM cores ################################################################################## ## Get all OTU ids in the fasta files. Commands are run in the terminal. cd /PATH/TO/MicrobiomeAnalysis/uparseAnalysis/ less AFS_merged_all_otus_filteredFinal.fasta | grep ">Otu" | sed s/\>// | awk '{print $1}' | sort > OTUids_all_otus_filteredFinal.txt ## Get all IDs left in the OTU tables after trimming less AFS_merged_all_OTUtable_final_trimmed.txt | awk -F "\t" '{print $1}' | tail -n +2 | sort > filteredOTUtab_names.txt ## Compare IDs from both files and save the differences to new file comm -23 OTUids_all_otus_filteredFinal.txt FinalFiles/filteredOTUtab_names.txt > differencesOTUs.txt #!/bin/bash -l #SBATCH -t 1-00:00:00 #SBATCH -p core -n 1 module load bioinfo-tools module load usearch/9.2.64 ## Define input und output paths and files indir=/PATH/TO/MicrobiomeAnalysis/uparseAnalysis/ outdir=/PATH/TO/MicrobiomeAnalysis/uparseAnalysis/ otus=AFS_merged_all_otus_filteredFinal.fasta trimmedotus=AFS_merged_all_otus_filteredFinal_trimmed.fasta oturemove=differencesOTUs.txt tmpdir=/scratch/$SLURM_JOB_ID/ ## Save only sequences in the trimmed OTU table to a new fasta file usearch -fastx_getseqs ${indir}${otus} -labels ${indir}${oturemove} -notmatched ${outdir}${trimmedotus} ################################################################################## ## 12. Align final OTUs and build phylogenic tree for UniFrac distance calculation ## Use QIIME PyNAST or MUSCLE to build alignment and build the tree with FastTree. ## PyNAST aligns sequences to an existing database, MUSCLE produces denovo alignment. ## Make alignments with and without an outgroup. AS outgroup sequence use an archaeal ## sequence. ## Shell script name: qiime_BuildAligments_AFSmicrobiome.sh ## Shell script name: qiime_BuildPhylogeny_AFSmicrobiome.sh ## The analysis is run on one 8GB RAM cores ################################################################################## ## Alignment with Pynast against the Greengenes DB ## Script qiime_BuildAligments_AFSmicrobiome.sh: #!/bin/bash -l #SBATCH -t 1-00:00:00 #SBATCH -p core -n 1 module load bioinfo-tools module load Qiime/1.9.1 ## Define input und output paths and files indir=/PATH/TO/MicrobiomeAnalysis/uparseAnalysis/ outdir=/PATH/TO/MicrobiomeAnalysis/uparseAnalysis/ trimmedotus=AFS_merged_all_otus_filteredFinal_trimmed_outgroup.fasta ## Align the sequences with PyNAST align_seqs.py -i ${indir}${trimmedotus} -m pynast -o ${outdir} ## Resulting alignment file is named pynastalign=AFS_merged_all_otus_filteredFinal_trimmed_outgroup_aligned.fasta ## Filter positions that are gaps in all sequences. filter_alignment.py -i ${outdir}${pynastalign} -s -o ${outdir} # -s removes the standard filter mask, which didn't seem to work properly pynastFiltered=AFS_merged_all_otus_filteredFinal_trimmed_outgroup_pynastAligned_filtered.fasta ## Denovo alignment with MUSCLE align_seqs.py -i ${indir}${trimmedotus} -m muscle -o ${outdir} ## Resulting alignment file is named muscle=AFS_merged_all_otus_filteredFinal_trimmed_MuscleAligned.fasta ## Build phylogenetic tree with FastTree implemented in QIIME. seqs=(${indir}${muscle} ${indir}${pynastFiltered}) for seq in "${seqs[@]}"; do sbatch ${indir} qiime_BuildPhylogeny_AFSmicrobiome.sh $seq done ## Script qiime_BuildPhylogeny_AFSmicrobiome.sh: #!/bin/bash -l #SBATCH -t 1-00:00:00 #SBATCH -p core -n 1 module load bioinfo-tools module load Qiime/1.9.1 ## Define input und output paths and files indir=/PATH/TO/MicrobiomeAnalysis/uparseAnalysis/ outdir=/PATH/TO/MicrobiomeAnalysis/uparseAnalysis/ alignin=$1 make_phylogeny.py -i ${indir}${alignin} -o ${outdir} --> AFSmicrobiome_SI_outgroup_pynastAligned_filtered_Rinput_DatasetS14.tre (Additional Dataset S14) ################################################################################## ## ** Extra analysis for PICRUSt. Metagenome analysis predicted from 16S. ## Predicts metagenomes from closed-reference 16S OTUs. OTUs have to be picked new ## against a reference database. Use QIIME to do this following the PICRUSt tutorial. ## Shell script name: qiime_referenceOTUs_AFSmicrobiome.sh ## The analysis is run on one 8GB RAM cores ################################################################################## ## Take as input the merged and primer trimmed sequences from USEARCH ## First, convert fastq to fasta. #!/bin/bash -l #SBATCH -t 1:00:00 #SBATCH -p core -n 1 module load bioinfo-tools module load Fastx/0.0.14 module load Qiime/1.9.1 indir=/PATH/TO/MicrobiomeAnalysis/uparseAnalysis/ outdir=/PATH/TO/MicrobiomeAnalysis/PicrustAnalysis/ inseqs=AFS_merged_all_primersTrimmed.fastq outseqs=AFS_merged_all_primersTrimmed.fasta fastq_to_fasta -n -i ${indir}${inseqs} -o ${outdir}${outseqs} # Change the name label from >M10.1 to >M10_1 to be compatible with QIIME requirements sed 's/\./\_/' ${outdir}AFS_merged_all_primersTrimmed.fasta > ${outdir}AFS_merged_all_primersTrimmed_QiimeIn.fasta ## Perform closed-reference OTU picking ## Pick OTUs at 97% identity against the Greengenes database version gg_13_8_otus. infile=AFS_merged_all_primersTrimmed_QiimeIn.fasta outfile=AFS_merged_all_primersTrimmed_closedRefOTUS.biom pick_closed_reference_otus.py -i ${outdir}${infile} -o ${outdir}${outfile} -f ## Filter the biom table by excluding all samples with no read mapped and OTUs that have less ## than 0.005% of all reads mapped to them. Finally, summarise the biom table biomtab=AFS_merged_all_primersTrimmed_closedRefOTUS.biom biomtab_filtered=AFS_merged_all_primersTrimmed_closedRefOTUS_filtered.biom biomtab_filtered2=AFS_merged_all_primersTrimmed_closedRefOTUS_filtered2.biom biomsum2=AFS_merged_all_primersTrimmed_closedRefOTUS_filtered2_summary.txt ## Filter samples with no reads mapped (should be none) filter_samples_from_otu_table.py -i ${indir}${biomtab} -o ${outdir}${biomtab_filtered} -n 1 ## Filter OTUs that have less than 0.005% of reads mapped filter_otus_from_otu_table.py -i ${indir}${biomtab_filtered} -o ${outdir}${biomtab_filtered2} --min_count_fraction 0.00005 ## Summarise the table biom summarize-table -i ${indir}${biomtab_filtered2} -o ${outdir}${biomsum2} ## Additionally create a file with rarefied counts to account for the uneven sequencing depth (recommended by PICRUSt) indir=/proj/b2014050/nobackup/private/MicrobiomeAnalysis/PicrustAnalysis/ outdir=/proj/b2014050/nobackup/private/MicrobiomeAnalysis/PicrustAnalysis/ infile=AFS_merged_all_primersTrimmed_closedRefOTUS_filtered2.biom # Calculate rarefied table. Rarefy to 3117, which is the lowest depth according to the summary file. single_rarefaction.py -i ${indir}${infile} -o ${outdir}AFS_merged_all_primersTrimmed_closedRefOTUS_filtered2_rarefied.biom -d 3117 ################################################################################## ## The actual PICRUSt analysis was run on the web-based Galaxy server ## http://galaxy.morganlangille.com/ ## Using GG version 13.5 (13.8 used for the OTU picking is only a minor update ## and compatible with 13.5) 1. Upload the .biom table AFS_merged_all_primersTrimmed_closedRefOTUS_filtered2_rarefied.biom. 2. Run Normalize by Copy Number. 3. Predict Metagenome using KEGG Orthologs. 4. Categorize by Function to hierarchy level 3. 5. Download the resulting .biom table ## Convert the biom file to a stamp .sfp profile format ## This python script from Microbiome Helper requires the biom package (this is loaded with QIIME) ## (https://github.com/LangilleLab/microbiome_helper/blob/master/biom_to_stamp.py) ## The resulting file can be analysed with STAMP #!/bin/bash -l #SBATCH -t 1:00:00 #SBATCH -p core -n 1 module load bioinfo-tools module load Qiime/1.9.1 indir=/PATH/TO/MicrobiomeAnalysis/PicrustAnalysis/ outdir=/PATH/TO/MicrobiomeAnalysis/PicrustAnalysis/ infile=Categorize_by_FunctionL3_FilteredTrimmed2_rarefied.biom outfile=Categorize_by_FunctionL3_FilteredTrimmed2_rarefied.spf ./biom_to_stamp.py -m KEGG_Pathways ${indir}${infile} > ${outdir}${outfile} ################################################################################## ## For additional analysis of the PICRUSt results (e.g. in R) the biom table was ## also converted into .txt format. #!/bin/bash -l #SBATCH -t 1:00:00 #SBATCH -p core -n 1 module load bioinfo-tools module load Qiime/1.9.1 biom convert -i Categorize_by_FunctionL3_FilteredTrimmed2_rarefied.biom -o Categorize_by_FunctionL3_FilteredTrimmed2_rarefied.txt --to-tsv --header-key KEGG_Pathways ## Categorize_by_FunctionL3_FilteredTrimmed2_rarefied.txt --> AFSmicrobiome_SI_Categorize_by_FunctionL3_FilteredTrimmed_rarefied_Rinput_DatasetS16.txt (Additional Dataset S16)