#!/bin/bash
#the search for mitogenomes uses a "bait" COI barcode, in a two-tier system:
#tier 1 consists of species that belong to families with at least one COX1 barcode sequence publicly available; in this case, the COX1 bait is from the same family
#tier 2 are species from families with no COX1 barcode publicly available; in this case, six COX1 baits are used, each from a different family; searches with each COX1 bait are done sequentially

#prerequisite 1: a text file named "unique_barcoded_families.txt" listing (on each line) families with COX1 barcode sequence publicly available
#prerequisite 2: a fasta file containing all the COX1 bait sequences for each family (named mtCOI_Bait_Families.fas)
#prerequisite 3: a text file listing family and species ID for all 1446 species to be considered for the mitogenome search (named "target_families_species.txt")

#create the working folders used by the steps below
#(candidate_mitogenome_scaffolds is listed before its two subfolders so that they can be created in the same call)
mkdir \
	COX1_barcode_baits \
	blasting_tier1 \
	14_21kb_scaffolds \
	blasting_tier2 \
	600_bp_blast_results \
	candidate_mitogenome_scaffolds \
	candidate_mitogenome_scaffolds/singletons \
	candidate_mitogenome_scaffolds/doubles

#first isolate, for each species, scaffolds in the range of 14-21 kb
#the $species.nuclear_genome.fa files are the NCBI-derived nuclear genome assemblies
#species IDs are read from the second tab-separated column of target_families_species.txt
cut -f 2 target_families_species.txt > target_taxa.txt
while read -r species
do
	#skip blank lines in the species list
	[ -n "$species" ] || continue
	cp "../../nuclear_genomes/nuclear_accessions/$species.nuclear_genome.fa" ./
	samtools faidx "$species.nuclear_genome.fa"
	#keep only the coordinates for the scaffolds that are within the range of 14kb - 21kb (these are considered candidate mitogenomes)
	#the .fai index lists the scaffold name in column 1 and its length in column 2
	#BED intervals are 0-based and half-open, so start 0 / end = length selects the whole scaffold
	#(a start of 1 would silently drop the first base of every scaffold)
	#the BED file is written tab-separated directly by awk
	awk 'BEGIN {OFS = "\t"} $2 >= 14000 && $2 <= 21000 {print $1, 0, $2}' "$species.nuclear_genome.fa.fai" > "$species.scaffolds_to_extract.txt"
	#use bedtools to extract the candidate mitogenomes
	#cut keeps only the text before the first ':' on each line, i.e. it strips the coordinate suffix that getfasta appends to the record names
	bedtools getfasta -fi "$species.nuclear_genome.fa" -bed "$species.scaffolds_to_extract.txt" | cut -d ':' -f 1 > "14_21kb_scaffolds/$species.14_21kb_scaffolds.fa"
	#the local genome copy, its index and the BED file are no longer needed
	rm "$species.nuclear_genome.fa" "$species.nuclear_genome.fa.fai" "$species.scaffolds_to_extract.txt"
done < target_taxa.txt


#use COI baits for 14-21 kb scaffolds for tier 1 species
#for every family listed in unique_barcoded_families.txt, the bait of that family is blasted against the 14-21 kb scaffolds of each of its species
while read -r family
do
	#skip blank lines: an empty pattern would make grep treat the file name as the pattern and read the family list from stdin
	[ -n "$family" ] || continue
	bait="COX1_barcode_baits/${family}_bait_COX1.fa"
	#bait record of this family: the matching line of the bait fasta plus the line that follows it
	grep -A1 -- "$family" mtCOI_Bait_Families.fas > "$bait"
	#species of this family: column 2 of every line of the target list that mentions the family
	grep -- "$family" target_families_species.txt | cut -f 2 > "$family.species.txt"
	while read -r species
	do
		[ -n "$species" ] || continue
		mv "14_21kb_scaffolds/$species.14_21kb_scaffolds.fa" ./
		makeblastdb -in "$species.14_21kb_scaffolds.fa" -input_type fasta -dbtype nucl -title "$species" -parse_seqids -out "$species"
		#tabular output columns: subject accession, e-value, alignment length, percent identity
		blastn -db "$species" -query "$bait" -out "$species.blast_results" -perc_identity 60 -evalue 0.0001 -task blastn -outfmt "6 sacc evalue length pident"
		mv "$species.blast_results" blasting_tier1/
		#remove the temporary BLAST database files; -f keeps rm quiet when a given file was not written
		#NOTE(review): .njs is added for newer BLAST+ releases, which appear to write an extra metadata file - confirm against the installed version
		rm -f "$species".{ndb,nin,nhr,nsq,nog,not,nto,ntf,nos,njs}
		#keep only species that have a COX1 hit > 600 bp, in the scaffolds of 14kb - 21kb
		#(column 3 of the blast table is the alignment length)
		awk '$3 > 600' "blasting_tier1/$species.blast_results" > "600_bp_blast_results/$species.blast_results.600bp.txt"
		mv "$species.14_21kb_scaffolds.fa" 14_21kb_scaffolds/
	done < "$family.species.txt"
	rm "$family.species.txt"
done < unique_barcoded_families.txt

#remove empty files from the two blasting results folders (done once, after all families have been processed)
find blasting_tier1/ -type f -size 0 -delete
find 600_bp_blast_results/ -type f -size 0 -delete


#use COI baits for 14-21 kb scaffolds for tier 2 species
#make a list of tier 2 species
#start from the full family/species table and drop, family by family, every line that mentions a barcoded family
cp target_families_species.txt ./tier2.txt
while read -r family
do
	#skip blank lines: an empty pattern would make grep treat the file name as the pattern and read the family list from stdin
	[ -n "$family" ] || continue
	#remove from the data spreadsheet tier 1 species (i.e. those with a barcode available at the family level)
	#the family name is matched anywhere on the line, the same way the tier 1 step selects its species
	grep -v -- "$family" tier2.txt > tier2.temp
	mv tier2.temp tier2.txt
done < unique_barcoded_families.txt

#what is left are the tier 2 lines; keep only the species column
cut -f 2 tier2.txt > tier2_species.txt
rm tier2.txt

#tier 2 species have no bait from their own family, so each of them is searched with the bait of every family listed below, one bait after another
#NOTE(review): the header of this script mentions six baits, but five families are listed here - confirm which is intended
tier2_bait_families=(Aphididae Drosophilidae Curculionidae Geometridae Braconidae)

while read -r species
do
	#skip blank lines in the species list
	[ -n "$species" ] || continue
	mv "14_21kb_scaffolds/$species.14_21kb_scaffolds.fa" ./
	#the BLAST database depends only on the species, so it is built once and reused for every bait
	blast_db="$species.tier2_db"
	makeblastdb -in "$species.14_21kb_scaffolds.fa" -input_type fasta -dbtype nucl -title "$species" -parse_seqids -out "$blast_db"
	for family in "${tier2_bait_families[@]}"
	do
		#blast the COX1 bait of this (different) family against the candidate mitogenomes
		#tabular output columns: subject accession, e-value, alignment length, percent identity
		blastn -db "$blast_db" -query "COX1_barcode_baits/${family}_bait_COX1.fa" -out "$species.${family}_bait.blast_results" -perc_identity 60 -evalue 0.0001 -task blastn -outfmt "6 sacc evalue length pident"
		mv "$species.${family}_bait.blast_results" blasting_tier2/
		#keep only species that have a COX1 hit > 600 bp, in the scaffolds of 14kb - 21kb
		#hits from all baits are appended to one file per species, in the order of the family list above
		awk '$3 > 600' "blasting_tier2/$species.${family}_bait.blast_results" >> "600_bp_blast_results/$species.blast_results.600bp.txt"
	done
	#remove the temporary BLAST database files; -f keeps rm quiet when a given file was not written
	#NOTE(review): .njs is added for newer BLAST+ releases, which appear to write an extra metadata file - confirm against the installed version
	rm -f "$blast_db".{ndb,nin,nhr,nsq,nog,not,nto,ntf,nos,njs}
	mv "$species.14_21kb_scaffolds.fa" 14_21kb_scaffolds/
done < tier2_species.txt

#remove empty files from the two blasting results folders
find blasting_tier2/ -type f -size 0 -delete
find 600_bp_blast_results/ -type f -size 0 -delete

#collapse replicate blast hits for tier 2 species
#several baits can hit the same scaffold; only one hit line (the first one recorded) is kept per scaffold
while read -r species
do
	hits="600_bp_blast_results/$species.blast_results.600bp.txt"
	#species without a results file (or with an empty one) have nothing to collapse
	[ -s "$hits" ] || continue
	#column 1 of the blast table is the scaffold ID: keep the first line seen for each ID (exact match on the field, not a regex search),
	#then order the lines by scaffold ID
	awk -F '\t' '!seen[$1]++' "$hits" | sort -t $'\t' -k 1,1 > "600_bp_blast_results/$species.blast_results.600bp.temp"
	mv "600_bp_blast_results/$species.blast_results.600bp.temp" "$hits"
done < tier2_species.txt


#check the 600bp results and use only species for which only one COX1 hit with size above 600bp was found (these are considered "singleton" candidate mitogenomes)
#stop here if the results folder is missing, so that the file operations below never run in the wrong directory
cd 600_bp_blast_results || { echo "cannot enter 600_bp_blast_results" >&2; exit 1; }

#make sure that no duplicated records were included, and count the number of hit lines per species
#list.annotated.txt gets one line per species: species ID, tab, number of distinct hit lines
#NOTE(review): the count is per hit line, not per distinct scaffold; tier 1 results are not collapsed per scaffold before this step - confirm this is intended
for hits in *.blast_results.600bp.txt
do
	#when no results file exists the pattern is left unexpanded; skip it
	[ -e "$hits" ] || continue
	target=${hits%.blast_results.600bp.txt}
	sort "$hits" | uniq > "$target.blast_results.600bp.uniq.txt"
	mv "$target.blast_results.600bp.uniq.txt" "$hits"
	count=$(wc -l < "$hits")
	#the arithmetic expansion strips the padding some wc implementations put around the number
	printf '%s\t%s\n' "$target" "$((count))"
done > list.annotated.txt

#species with exactly one hit line
awk '$2 == 1 {print $1}' list.annotated.txt > singletons.txt

#for every singleton species, copy the scaffold named in its single hit line into candidate_mitogenome_scaffolds/singletons
while read -r species
do
	#skip blank lines in the species list
	[ -n "$species" ] || continue
	#column 1 of the hit line is the scaffold ID
	mitogenome=$(cut -f 1 "$species.blast_results.600bp.txt" | head -n 1)
	#nothing to extract if the results file is missing or empty
	[ -n "$mitogenome" ] || continue
	#-F -w: the ID is matched literally and as a whole word, so an ID that is a prefix of another scaffold name cannot select the wrong record
	#-A1 -m 1: the first matching header line plus the sequence line that follows it
	grep -F -w -A1 -m 1 -- "$mitogenome" "../14_21kb_scaffolds/$species.14_21kb_scaffolds.fa" > "../candidate_mitogenome_scaffolds/singletons/$species.candidate_mitogenome.fa"
done < singletons.txt


#check the 600bp results and use only species for which only two COX1 hits with size above 600bp were found (these are considered "doubles" candidate mitogenomes)
#both scaffolds, labelled "A" and "B", will be annotated
awk '$2 == 2 {print $1}' list.annotated.txt > doubles.txt

while read -r species
do
	#skip blank lines in the species list
	[ -n "$species" ] || continue
	hits="$species.blast_results.600bp.txt"
	for label in A B
	do
		#scaffold A is taken from the first hit line, scaffold B from the last one (column 1 is the scaffold ID)
		if [ "$label" = A ]
		then
			mitogenome=$(cut -f 1 "$hits" | head -n 1)
		else
			mitogenome=$(cut -f 1 "$hits" | tail -n 1)
		fi
		#nothing to extract if the results file is missing or empty
		[ -n "$mitogenome" ] || continue
		#-F -w: the ID is matched literally and as a whole word, so an ID that is a prefix of another scaffold name cannot select the wrong record
		#-A1 -m 1: the first matching header line plus the sequence line that follows it
		grep -F -w -A1 -m 1 -- "$mitogenome" "../14_21kb_scaffolds/$species.14_21kb_scaffolds.fa" > "../candidate_mitogenome_scaffolds/doubles/$species.mitogenome_scaffold_${label}.fa"
	done
done < doubles.txt
#the species lists are intermediate files and are not needed any more
rm doubles.txt singletons.txt list.annotated.txt
