# Gardner et al., Repeated parallel losses of inflexed stamens in Moraceae: phylogenomics and generic revision of the tribe Moreae and the reinstatement of the tribe Olmedieae (Moraceae)
# Annotated workflow for analyses -- Elliot Gardner, August 18, 2020. Please feel free to contact me with any questions.
# If you find this workflow helpful, please consider citing our paper.

############################################
######### A. Phylogenetic analyses #########
############################################

### Required programs:
# GNU Parallel --- https://www.gnu.org/software/parallel/
# Trimmomatic --- http://www.usadellab.org/cms/?page=trimmomatic
# HybPiper (and dependencies) --- https://github.com/mossmatters/HybPiper
# MAFFT --- https://mafft.cbrc.jp/alignment/software/
# TrimAl --- https://github.com/scapella/trimal
# RAxML --- https://github.com/stamatak/standard-RAxML
# ASTRAL --- https://github.com/smirarab/ASTRAL
# fastaselect.tcl --- https://github.com/bioh4x/NeatFreq/tree/master/lib/mira_3.1.15_dev_linux-gnu_x86_64_static/scripts
# fastalength (distributed with Exonerate; required by trimbylength.sh below)
# SumTrees (part of DendroPy) --- https://github.com/jeetsukumaran/DendroPy

### Required files for this workflow:
# Reads in .fastq format (no need to decompress if gzipped)
# "nametable.txt" - a whitespace-separated table with two columns: the first containing the desired sample names and the second containing the read-file prefixes (a hypothetical example follows the script below; not included in this repository because the reads used here were not trimmed in a single batch)
# "targets.fna" - HybPiper target file (see the HybPiper documentation for formatting instructions)
# "genelist.txt" - a list of gene names matching the HybPiper target file (de-duplicated, without taxon names)
# "trimbylength.sh" - the following script for filtering sequences based on length (remove the first "#" on each line and paste into a file of that name):

# ### trimbylength.sh
# ### remove sequences from a multi-FASTA with two cutoffs: proportion of average length (subtracting Ns), and an absolute minimum in bp.
# ### Elliot Gardner - October 2, 2017
# ### usage: bash trimbylength.sh -i [infile.fasta] -o [outfile.fasta] -p [min fraction of avg length, 0-1] -m [min length in bp]
# ### sequence files must be in the current working directory
#
# while getopts ":i:o:p:m:" opt; do
#   case $opt in
#     i) in="$OPTARG"
#     ;;
#     o) out="$OPTARG"
#     ;;
#     p) per="$OPTARG"
#     ;;
#     m) min="$OPTARG"
#     ;;
#   esac
# done
#
# # calculate average length
# avg=$(fastalength $in | awk '{ total += $1 } END { print total/NR }')
#
# sed -e '/^[^>]/s/N//g' $in > $in.noN
#
# fastalength $in.noN | awk '$1 > '"$min"' {print ;}' | awk '{print $1/'"$avg"',$2}' | awk '$1 >= '"$per"' {print $2}' > $in.namestokeep
# fastaselect.tcl -infile $in -name $in.namestokeep -outfile $out
# rm $in.namestokeep
# rm $in.noN
# printf "input file: "$in"\n"
# printf "average length: "$avg"\n"
# printf "input sequences: "$(awk '/>/' $in | wc -l)"\n"
# printf "output sequences: "$(awk '/>/' $out | wc -l)"\n\n"
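# A hypothetical example of "nametable.txt" (the sample names and read-file prefixes below are made up;
# substitute your own). The first column ($a in the loops below) is the sample name used for all output
# files, and the second column ($b) is the prefix of the raw read files in ../reads/:
#
# SAMPLE001	LIB001
# SAMPLE002	LIB002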
### 1. TRIM READS ###

while read a b; do java -jar /home/egardner/bin/Trimmomatic-0.36/trimmomatic-0.36.jar PE -phred33 -threads 16 \
../reads/"$b"_1.fq.gz ../reads/"$b"_2.fq.gz \
"$a"_R1_paired.fastq "$a"_R1_unpaired.fastq "$a"_R2_paired.fastq "$a"_R2_unpaired.fastq \
ILLUMINACLIP:~/bin/Trimmomatic-0.36/adapters/TruSeq3-PE.fa:2:30:10 \
HEADCROP:3 LEADING:30 TRAILING:25 SLIDINGWINDOW:4:25 MINLEN:20 ; done < ../nametable.txt

### 2. ASSEMBLE READS ###

while read a b; do ~/bin/HybPiper/reads_first.py --bwa -b ../targets.fna --cpu 16 \
--prefix $a \
-r ../trimmed/"$a"_R1_paired.fastq ../trimmed/"$a"_R2_paired.fastq \
--unpaired ../trimmed/"$a"_R1_unpaired.fastq ; \
python ~/bin/HybPiper/cleanup.py $a ; \
python ~/bin/HybPiper/intronerate.py --prefix $a ; \
done < ../nametable.txt

# check number of genes recovered per sample
wc -l ./**/genes_with_seqs.txt

# if desired, re-run reads_first.py with "--cov_cutoff 4" to increase gene recovery for low-read samples, for example:
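# Hedged example for a hypothetical low-recovery sample "SAMPLE001" (all other flags simply mirror the
# assembly command above):
# ~/bin/HybPiper/reads_first.py --bwa -b ../targets.fna --cpu 16 --cov_cutoff 4 \
# --prefix SAMPLE001 \
# -r ../trimmed/SAMPLE001_R1_paired.fastq ../trimmed/SAMPLE001_R2_paired.fastq \
# --unpaired ../trimmed/SAMPLE001_R1_unpaired.fastq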
### 3. FILTER, ALIGN, AND TRIM (adjust paths as needed and adapt for different datasets according to comments) ###

# Gather and filter
mkdir filter; cd filter
parallel -j 16 cat ../assemblies/**/{}/**/sequences/FNA/{}.FNA '>' {}.raw.fna :::: ../genelist.txt

# Replace the above with the following to retrieve "supercontig" sequences instead, and trim the gene names from the fasta headers
#parallel -j 16 cat ../assemblies/**/{}/**/sequences/intron/{}_supercontig.fasta '>' {}.raw.fna :::: ../genelist.txt
#parallel ' sed -i "s/-{}//g" {}.raw.fna ' :::: ../genelist.txt

# for each gene, remove sequences less than 100 bp or less than 25% of the average length
parallel -j 16 bash ~/bin/scripts/trimbylength.sh -i {}.raw.fna -o {}.filtered.fna -p 0.25 -m 100 :::: ../genelist.txt

# write by-gene and by-sample counts to files and cut genes with fewer than 30 samples
awk '/>/' *.raw.fna | sort | uniq | sed 's/>//g' > ../samplelist.txt
parallel ' printf {}"\t";awk "/{}/" *.raw.fna | wc -l ' :::: ../samplelist.txt | sort -k2 -nr > raw_counts.txt
parallel ' printf {}"\t";awk "/{}/" *.filtered.fna | wc -l ' :::: ../samplelist.txt | sort -k2 -nr > filtered_counts.txt
parallel ' printf {}"\t";awk "/>/" {}.filtered.fna | wc -l ' :::: ../genelist.txt | sort -k2 -nr > filtered_counts_by_gene.txt
parallel ' printf {}"\t";awk "/>/" {}.raw.fna | wc -l ' :::: ../genelist.txt | sort -k2 -nr > raw_counts_by_gene.txt
awk '$2>29 {print $1}' filtered_counts_by_gene.txt > ../genelist_cut.txt
awk '$2<30 {print $1}' filtered_counts_by_gene.txt > exclude_list.txt
parallel mv {}.filtered.fna {}.filtered.fna.excluded :::: exclude_list.txt
parallel ' printf {}"\t";awk "/>/" {}.filtered.fna | wc -l ' :::: ../genelist_cut.txt | sort -k2 -nr > filtered_counts_by_gene_final.txt

# align and trim
mkdir ../align
cd ../align
parallel --eta mafft --maxiterate 1000 --globalpair ../filter/{}.filtered.fna '>' {}.aln :::: ../genelist_cut.txt

# trim columns with more than 75% gaps
parallel trimal -gt 0.25 -in {}.aln -out {}.trimmed :::: ../genelist_cut.txt

# remove sequences more than 50% divergent from all other sequences
while read a ; do trimal -sident -in $a.aln | awk '/Identity for most similar pair-wise sequences matrix/,EOF' | awk '!/#/' | awk '$2>0.5 {print $1}' > $a.names ; fastaselect.tcl -infile $a.aln -outfile $a.aln.cut -name $a.names ; rm $a.names ; done < ../genelist_cut.txt
parallel ' printf {}"\t";awk "/{}/" *.cut | wc -l ' :::: ../samplelist.txt | sort -k2 -nr > filtered_counts_final.txt

# re-trim
parallel trimal -gt 0.25 -in {}.aln.cut -out {}.trimmed :::: ../genelist_cut.txt

# merge two sets of alignments (e.g., Artocarpeae + the other Moraceae for the "exon" dataset)
parallel cat ../moreae_trimmed/{}.trimmed ../artocarpus_trimmed/{}.trimmed '>' {}.combined :::: ../genelist333.txt
parallel ruby ../makemergetable.rb ../moreae_trimmed/{}.trimmed ../artocarpus_trimmed/{}.trimmed '>' {}.mergetable :::: ../genelist333.txt
parallel mafft --merge {}.mergetable {}.combined '>' {}.trimmed :::: ../genelist333.txt

# build supermatrix and partition file
python ~/bin/HybPiper/fasta_merge.py --fastafiles *.trimmed --raxml DNA > supermatrix.exons.fasta
mv partition.raxml supermatrix.exons.part
sed -i 's/ //g' supermatrix.exons.fasta
cd ..

### 4. PHYLOGENETIC ANALYSES ###

# Estimate ML tree from supermatrix
mkdir supermatrix; cd supermatrix
mv ../align/supermatrix* .
raxmlHPC-PTHREADS-AVX -T 16 -n supermatrix.exon.out -s supermatrix.exons.fasta -q supermatrix.exons.part -p 12345 -x 12345 -N 200 -c 25 -f a -m GTRCAT -o Trema_orientale_SRR5674478

# Gene trees
# generate gene trees with RAxML
mkdir ../genetrees; cd ../genetrees
parallel -j 6 raxmlHPC-PTHREADS-AVX -T 2 -n {}.out -s ../align/{}.trimmed -p 12345 -x 12345 -N 200 -c 25 -f a -m GTRCAT :::: ../genelist_cut.txt

# sort files
mkdir info; mv RAxML_info.* ./info/
mkdir bipartitionsBranchLabels; mv RAxML_bipartitionsBranchLabels.* ./bipartitionsBranchLabels/
mkdir bipartitions; mv RAxML_bipartitions.* ./bipartitions/
mkdir bestTree; mv RAxML_bestTree.* ./bestTree/
mkdir bootstrap; mv RAxML_bootstrap.* ./bootstrap/
mkdir collapsed

# collapse nodes under 30%
parallel sumtrees.py --replace bootstrap/RAxML_bootstrap.{}.out -f 0.33 -F newick --suppress-annotations -o collapsed/RAxML_collapsed.{}.tre -i newick -t bestTree/RAxML_bestTree.{}.out :::: ../genelist_cut.txt
parallel "sed -i 's/\[&U\] //g' collapsed/RAxML_collapsed.{}.tre" :::: ../genelist_cut.txt
parallel "sed -i \"s/'//g\" collapsed/RAxML_collapsed.{}.tre" :::: ../genelist_cut.txt
cat ./collapsed/RAxML_collapsed.* > bestTrees_collapsed.tre
ls ./bootstrap/* > bootstraplist.txt
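# Optional sanity check (not part of the original workflow): confirm that the number of collapsed gene
# trees matches the number of retained genes before running ASTRAL (assumes each collapsed file holds
# one newick tree per line).
printf "gene trees: "; wc -l < bestTrees_collapsed.tre
printf "genes retained: "; wc -l < ../genelist_cut.txt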
# Estimate species tree with ASTRAL
# with LPP (quartet scores)
java -jar ~/bin/ASTRAL/astral.5.6.1.jar -i bestTrees_collapsed.tre -t 1 -o astral.collapsed30.lpp.tre

# with bootstrap
java -jar ~/bin/ASTRAL/astral.5.6.1.jar -i bestTrees_collapsed.tre -b bootstraplist.txt -g -r 150 -o astral.collapsed30.100bs.tre

# polytomy test
java -jar ~/bin/ASTRAL/astral.5.6.1.jar -i bestTrees_collapsed.tre -t 10 -o astral.collapsed30.polytomytest.tre

### 5. PHYLOGENETIC NETWORK ANALYSIS -- Paratrophis clade only ###

# extract the samples we want from the main supercontig dataset
parallel fastaselect.tcl -infile ~/supercontigs/filter/{}.raw.fna -outfile ./{}.raw.fna -name ../samplelist.txt :::: ../genelist.txt

# trim and exclude genes with fewer than 8 samples
parallel -j 30 bash ~/bin/scripts/trimbylength.sh -i {}.raw.fna -o {}.filtered.fna -p 0.25 -m 50 :::: ../genelist.txt
awk '/>/' *.raw.fna | sort | uniq | sed 's/>//g' > ../samplelist.txt
parallel ' printf {}"\t";awk "/{}/" *.filtered.fna | wc -l ' :::: ../samplelist.txt | sort -k2 -nr > filtered_counts.txt
parallel ' printf {}"\t";awk "/>/" {}.filtered.fna | wc -l ' :::: ../genelist.txt | sort -k2 -nr > filtered_counts_by_gene.txt
awk '$2>7 {print $1}' filtered_counts_by_gene.txt > ../genelist_cut.txt
awk '$2<8 {print $1}' filtered_counts_by_gene.txt > exclude_list.txt

# align and trim
mkdir ../align
cd ../align
parallel --eta mafft --maxiterate 1000 --globalpair ../filter/{}.filtered.fna '>' {}.aln :::: ../genelist_cut.txt

# remove samples more than 50% divergent from all other sequences
while read a ; do trimal -sident -in $a.aln | awk '/Identity for most similar pair-wise sequences matrix/,EOF' | awk '!/#/' | awk '$2>0.5 {print $1}' > $a.names ; fastaselect.tcl -infile $a.aln -outfile $a.aln.cut -name $a.names ; rm $a.names ; done < ../genelist_cut.txt
parallel ' printf {}"\t";awk "/{}/" *.cut | wc -l ' :::: ../samplelist.txt | sort -k2 -nr > filtered_counts_final.txt
parallel ' printf {}"\t";awk "/>/" {}.aln.cut | wc -l ' :::: ../genelist_cut.txt | sort -k2 -nr > filtered_counts_by_gene_final.txt
awk '$2>7 {print $1}' filtered_counts_by_gene_final.txt > ../genelist_cut_final.txt
parallel trimal -gt 0.25 -in {}.aln.cut -out {}.trimmed :::: ../genelist_cut_final.txt

# estimate gene trees
mkdir ../genetrees
cd ../genetrees
parallel -j 8 raxmlHPC-PTHREADS-AVX -T 2 -n {}.out -s ../align/{}.trimmed -p 12345 -x 12345 -N 200 -c 25 -f a -m GTRGAMMA -o Streblus_mauritianus_Rakotonirina489 :::: ../genelist_cut.txt
mkdir info; mv RAxML_info.* ./info/
mkdir bipartitionsBranchLabels; mv RAxML_bipartitionsBranchLabels.* ./bipartitionsBranchLabels/
mkdir bipartitions; mv RAxML_bipartitions.* ./bipartitions/
mkdir bestTree; mv RAxML_bestTree.* ./bestTree/
mkdir bootstrap; mv RAxML_bootstrap.* ./bootstrap/

# concatenate all gene trees into a single file
cat ./bipartitions/* > bipartitions_all.tre

# generate the required input file and run PhyloNet
printf "#NEXUS\n\nBEGIN TREES;\n\n" > input_supercontigs.nex
awk '{print "Tree geneTree"NR" = "$1}' bipartitions_all.tre >> input_supercontigs.nex
printf "END;\n\nBEGIN PHYLONET;\n\nInferNetwork_MPL (all) 2 -b 30 -pl 16;\n\nEND;" >> input_supercontigs.nex
java -jar ~/bin/phylonet/PhyloNet_3.8.0.jar -di input_supercontigs.nex > MPL_output_supercontigs.txt
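# Optional sanity check (not part of the original workflow): count the gene trees written into the
# PhyloNet input to confirm that every tree in bipartitions_all.tre made it into the TREES block.
grep -c "^Tree geneTree" input_supercontigs.nex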