#!/bin/bash
#use only species with filtered and annotated mitogenomes, with high-quality nuclear genomes
#use only the 13 protein-coding genes and rRNA genes

# list_target_species DIR
# Print one species name per line for every DIR/<species>.mitogenome_coding.txt,
# i.e. the file name with the directory and the ".mitogenome_coding.txt"
# suffix removed. Prints nothing when no such file exists.
list_target_species() {
	local dir=$1 path name
	for path in "$dir"/*.mitogenome_coding.txt
	do
		#an unmatched glob stays literal; skip it instead of listing a bogus species
		[[ -e $path ]] || continue
		name=${path##*/}
		printf '%s\n' "${name%.mitogenome_coding.txt}"
	done
}

#-p: do not complain when the directory is left over from a previous run
mkdir -p blast_results

#make a list of species to be included in the analysis
list_target_species filtered_mitogenomes > target_species.txt

# filter_numt_hits RAW_HITS FULL_LENGTH
# Print the rows of a tabular BLAST result (columns: qseqid sseqid evalue
# length pident) that are kept as NUMT hits: alignment length >= 100 bp and
# either identity < 99% or alignment length shorter than FULL_LENGTH.
filter_numt_hits() {
	awk -v full_length="$2" '$4 >= 100 && ($5 < 99 || $4 < full_length)' "$1"
}

#window length in bp: used as window size, step size and full-length cutoff
window_size=658
#species list to process; defaults to the list generated above,
#set SPECIES_LIST=<file> to re-run a hand-picked subset instead
species_list=${SPECIES_LIST:-target_species.txt}

#loop over the list of species
if [[ -r $species_list ]]
then
	#the lists are read on their own file descriptors so that nothing run
	#inside the loops can swallow the remaining lines from stdin
	while read -r target_species <&3
	do
		#an empty name would turn the cleanup glob below into ".n*"
		[[ -n $target_species ]] || continue
		genome=../remove_mtDNA_from_nuclear_genomes/$target_species.nuclear_genome.linearized.mtDNA_clean.fa
		windows_fasta=windows_${target_species}.mitogenome_coding.txt
		counts_file=$target_species.blast_counts.txt

		#without a database every window would silently be reported with 0 hits
		if ! makeblastdb -in "$genome" -input_type fasta -dbtype nucl -title "$target_species" -parse_seqids -out "$target_species"
		then
			echo "skipping $target_species: makeblastdb failed" >&2
			continue
		fi

		#partition the mitogenome in non-overlapping windows of $window_size bp
		bash fasta_windows_v1.1.sh "filtered_mitogenomes/$target_species.mitogenome_coding.txt" "$window_size" "$window_size"
		if [[ ! -s $windows_fasta ]]
		then
			echo "skipping $target_species: $windows_fasta was not produced" >&2
			rm -f -- "$target_species".n*
			continue
		fi
		grep ">" "$windows_fasta" | tr -d '>' > windows.txt

		#start from an empty counts file so leftovers cannot misalign the table
		: > "$counts_file"

		#loop over the list of windows
		while read -r window <&4
		do
			#header line plus the line after it (assumes one sequence line per window)
			grep -F -A 1 -- "$window" "$windows_fasta" > "$window.fa"
			#perform the BLASTn search for NUMTs
			blastn -db "$target_species" -query "$window.fa" -out "$window.blast_results" -perc_identity 60 -evalue 0.0001 -task blastn -outfmt "6 qseqid sseqid evalue length pident" \
				|| echo "warning: blastn failed for window $window of $target_species" >&2
			#filter the results on length (>= 100bp), %ID and full-window length
			filter_numt_hits "$window.blast_results" "$window_size" > "$window.blast_results.txt"
			rm -f -- "$window.blast_results"
			wc -l < "$window.blast_results.txt" >> "$counts_file"
		done 4< windows.txt

		#window name, its 2nd and 3rd "_"-separated fields (start, end) and the hit count
		{
			echo "window    bp_start        bp_end  NUMT_counts"
			paste windows.txt <(cut -d '_' -f 2 windows.txt) <(cut -d '_' -f 3 windows.txt) "$counts_file"
		} > "blast_results/$target_species.windowed_blast_results.txt"

		#remove per-species temporary files and the BLAST database files
		rm -f -- window* "$counts_file" "$target_species".n*
	done 3< "$species_list"
else
	echo "species list $species_list not found or not readable; no BLAST searches run" >&2
fi

#get average NUMT counts for each mitogenome

# numt_mean_sd FILE
# Print "<mean><TAB><SD>" of column 4 of FILE, ignoring any line that
# contains "bp_start" (the header row). SD is the population standard
# deviation, sqrt(E[x^2] - E[x]^2). Prints "NA<TAB>NA" when FILE has no data
# rows, so the summary table keeps one aligned row per species instead of
# awk aborting on a division by zero.
numt_mean_sd() {
	awk -v OFS='\t' '
		/bp_start/ { next }
		{ n++; sum += $4; sumsq += $4 * $4 }
		END {
			if (n == 0) { print "NA", "NA"; exit }
			mean = sum / n
			variance = sumsq / n - mean * mean
			# rounding can leave a tiny negative value, which sqrt() turns into nan
			if (variance < 0) variance = 0
			print mean, sqrt(variance)
		}
	' "$1"
}

#run in a subshell: the working directory of the rest of the script is left
#untouched, and a failed cd cannot make the summary land in the wrong place
(
	cd blast_results/ || exit 1
	{
		echo "Species   Average_NUMTs   SD_NUMTs"
		for results_file in *.windowed_blast_results.txt
		do
			#an unmatched glob stays literal; skip it
			[[ -e $results_file ]] || continue
			printf '%s\t%s\n' "${results_file%.windowed_blast_results.txt}" "$(numt_mean_sd "$results_file")"
		done
	} > results.txt
) || echo "could not summarise NUMT counts: blast_results/ is missing or not writable" >&2
