#Pipeline for bait design and hybrid capture by Quek Zheng Bin Randolph ####

#Ortholog clustering ####

orthofinder -f fasta_file_dir -S diamond_more_sensitive

#Multiple sequence alignment amino acid ####

mafft-linsi input_faa.fa > output_faa.fa	#Note: all subsequent maaft alignments follows this command. Settings can be changed to --auto if necessary

#Pal2Nal to reconvert the sequences back to nucleotide alignments ####

pal2nal.pl pep.aln nuc.fasta -output fasta > output.fa

#Filter transcripts for coral orthologs only, from orthologs identified via OrthoFinder

python fastaMLtoSL.py input_file

#Do a blast of all transcripts against gene models 

makeblastdb -dbtype prot -in gene-models-amino-acid.fa -out gene-models-amino-acid

blastp -db gene-models-amino-acid -query input_file -out output_file -num_threads 24 -evalue 10e-6 -outfmt 6

#Filter for non-cnidarian hits ####

blastn -db nt -num_threads 24 -evalue 10e-6 -outfmt 6 -query input.fa -out output.txt	#Note: all subsequent blastn commands in the manuscript follows this command, only database (db) is changed

#With trimming bait design with BaitFisher####

##Example file:

#Sample name in alignment files:

#Alignment file transcript
#>EOG5...(orthogroup?)|Some_field_identifier|Nason_vitripennis(sample name)|NV 16710-RA(geneID)	#BAITFISHER: Fields 3(Seqeunce-name-taxon-field-number); 4 (sequence-name-geneID-field-number)
#ATCGXXXXXXXXXX

#Genome file
#>SCAFFOLD4
#ATCGXXXXXX

#GFF FILE
SCAFFOLDXX	OGSXX	EXON/CDS/MRNA	POSITION	STUFF	GenePrediction NV16710-RA

baifisher-v1.2.8 Parameter_file		

#Example of parameter file can be found in the BaitFisher Package, change "bait-number" according to window targeted.
#Note: barcoding baits were designed without alignment cutting, relevant parameter file can also be found in "Example" folder 

#Bait filter for final baits

baitfilter -i input_file -o output_file -m fs 

#Removal of duplicate baits ####

seqkit rmdup --by-seq --ignore-case input_file > output_file

#Identification of self-hybridising baits ####

blastn -db database_of_baits -num_threads 24 -evalue 10e-6 -outfmt 6 -query input.fa -out output.txt -strand minus

#Trimming of reads ####

java -jar trimmomatic-0.38.jar PE input_forward.fq.gz input_reverse.fq.gz output_forward_paired.fq.gz output_forward_unpaired.fq.gz output_reverse_paired.fq.gz output_reverse_unpaired.fq.gz ILLUMINACLIP:TruSeq3-PE-2.fa:2:30:10:2:keepBothReads LEADING:3 TRAILING:3 MINLEN:36

#HybPiper ####

.reads_first.py -b baitfile.fa -r reads*.fq --prefix file_dir_name --bwa

#Extract coding sequences from alternative contigs (Identifies all paralogous genes per sample)

while read i; do echo $i; python .paralog_investigator.py $i; done < name-list.txt &> paralogs.txt

#Retrieve paralogs

parallel "python .paralog_retriever.py name-list.txt {} > {}.paralogs.fasta" ::: OGXXXXX1 OGXXXX2 ... OGXXXXXN &> paralogs-list-per-sample.txt

#Plot gene trees to figure out what type of paralogy

python .paralog_retriever.py namelist.txt gene_name | mafft --auto - | fasttree-2.1.9 -nt -gtr > gene_name.tre

#Retrieve coding sequences; only for non-paralogous loci 

python .retrieve_sequences.py ~/baitfile.fasta . dna

#Extract introns; only for non-paralogous loci 

python .intronerate-develop.py --prefix FOLDER_FOR_SAMPLE

python .retrieve_sequences.py baitfile.fasta . supercontig

#Trimming of alignments ####

trimal -in infile.fa -out outfile.fa -automated1 

#RAxML inference ####

#Phylogenetic inferences ####

#RAxML

raxmlHPC-PTHREADS -T 24 -f d -s input.phy -q partitions.txt -w output_dir -p 123456 -N 100 -m GTRGAMMA -n name-of-output		#Tree search

raxmlHPC-PTHREADS -T 24 -b 12345 -s input.phy -q partitions.txt -w output_dir -p 123456 -N 500 -m GTRGAMMA -n name-of-output		#Bootstraps

raxmlHPC-PHTHREADS-AVX2 -f b -t best_tree.tre -z bootstraps.txt -o outgroup -n name-of-output -m GTRGAMMA -p 12345 -w output_dir	#Pasting bootstraps to final best tree

#ExaBayes; example config files can be found in the ExaBayes package

mpirun -np 120 exabayes -c config.nex -f input.phy -q partitions.txt -s 123456 -n output-name -w output-dir -R 4	#To run the tree

consense -f ExaBayes_topologies.myRun.0 -n myCons	#To obtain consensus tree

#ASTRAL-III

# Generate gene trees using RAxML v8.2.11 (exons only alignments)

raxmlHPC-PTHREADS-AVX2 -f d -s input-exons-only -T 4 -m GTRGAMMA -p 12345 -N 10 -w output_dir -n name_of_output

raxmlHPC-PTHREADS-AVX2 -s input-exons-only -T 4 -m GTRGAMMA -p 12345 -N 100 -w output_dir -n name_of_output -b 12345

raxmlHPC-PTHREADS-AVX2 -f b -t best_tree -z bootstraps -m GTRGAMMA -n unrooted-tree -w output_dir

# Removing low support branches as per recommendation of ASTRAL-III paper i.e. BS < 10

nw_ed all-gene-trees-for-astral.tre 'i & b<=10' o > ~/all-gene-trees-for-astral-BS10.tre	#Newick utilities

# Running ASTRAL v5.6.3 (Astral-III)

java -jar astral.5.6.3.jar -i infile.tre -o outfile.tre 2> astral-BS10.log

#DiscoVista

discoVista.py -m 1 -p '$path/gene-trees' -o 'output/results' -t 75 -c '~/clades-def.txt'

#SPAdes assembly for mitochondrial contigs ####

spades.py -1 R1_trimmed.fq -2 R2_trimmed.fq -o output_dir

#Checking for read depth per contig ####

bwa index contigs_assembled.fa

bwa mem <index> <R1_paired.fq.gz> <R2_paired.fq.gz> -t 24 | samtools view -S -b -@24 > output.bam

samtools view -b -F 4 -@24 bamfile.bam > mapped.bam		#Extract mapped reads only

samtools sort mapped.bam > mapped-sorted.bam			#Sort reads for qualimap




