#!/bin/bash
#usage: bash COX1_harvest_from_annotated_public_mitogenomes.sh &> stderr.txt
#pre-requisite: a list of species that have annotated mitogenomes available (and that passed filtering), named annotated_mitogenomes.filtered.txt

#Convert the GenBank feature lines stored in file $1 (e.g. "     CDS             1474..3009")
#into the start and end columns of a BED file, printed tab-delimited on stdout.
#GenBank locations are 1-based and inclusive, whereas bedtools reads BED intervals with a
#0-based start and a 1-based end, so the start is lowered by one; without that shift the
#first base of the gene is missing from the extracted sequence.
#A location that is not of the form "start..end" with a numeric start is passed through
#with only ".." turned into a tab, so an unusable location is never silently altered.
genbank_location_to_bed() {
	awk '{
		n = split($2, part, /\.\./)
		if (n >= 2 && part[1] ~ /^[0-9]+$/) part[1] = part[1] - 1
		out = part[1]
		for (i = 2; i <= n; i++) out = out "\t" part[i]
		print out
	}' "$1"
}

#process every species of the list; "|| [[ -n ... ]]" keeps a last line that has no trailing newline
while read -r target || [[ -n "$target" ]]
do
	#skip blank lines so that no command runs on an empty species name
	[[ -n "$target" ]] || continue
	cp "../../mitogenomes/mitogenome_assemblies/$target.mitogenome.gbk" ./
	cp "../../mitogenomes/mitogenome_assemblies/$target.mitogenome.fa" ./
	#get coordinates of the COX1 gene from the downloaded mitogenome, and format those coordinates for the bedtools step below (tab-delimited file with 3 columns: contig name, start bp position, end bp position)
	#the CDS line is expected directly above the line that mentions COX1
	grep -B 1 COX1 "$target.mitogenome.gbk" | grep CDS > "$target.coords.raw.temp"
	#strip the strand wrapper "complement(...)" and the partial-start marker "<" from the location
	gsed -i -e 's/complement//g' -e 's/[()<]//g' "$target.coords.raw.temp"
	genbank_location_to_bed "$target.coords.raw.temp" > "$target.coords.temp"
	#get contig name from the fasta-formated genome (first line, which starts with ">")
	grep ">" "$target.mitogenome.fa" | cut -d ' ' -f 1 > "$target.contig_name.txt"
	gsed -i 's/>//g' "$target.contig_name.txt"
	paste "$target.contig_name.txt" "$target.coords.temp" > "$target.coords.txt"
	#use the bedtools "getfasta" utility to subset the fasta-formatted mitogenome assembly, and obtain the COX1 sequence
	#when no coordinates were found, bedtools complains on stderr; the script later searches stderr.txt for those messages, so the file name passed to -bed must stay "<species>.coords.txt"
	bedtools getfasta -fo "$target.COX1.fa" -tab -fi "$target.mitogenome.fa" -bed "$target.coords.txt"
	#remove the default fasta header (will be replaced later with species ID)
	#two separate passes on purpose: the first puts the sequence on its own line, the second drops the header line
	gsed -i 's/\t/\n/g' "$target.COX1.fa"
	gsed -i -e "1d" "$target.COX1.fa"
done < annotated_mitogenomes.filtered.txt

#get from the stderr file the accessions that had errors
#(word 16 of the matching message is "<species>.coords.txt.", and the species ID is everything before the first dot)
grep "3 columns" stderr.txt | cut -d ' ' -f 16 | cut -d '.' -f 1 > accessions_failed.txt

#move files for which the COX1 fasta harvest failed to new folder (these are processed using species-specific search criteria since COX1 is labelled differently for those species)
mkdir -p failed_first_try

#Set aside one failed accession ($1): move its mitogenome files to failed_first_try/,
#delete its other intermediate files from the working directory, and write the raw
#coordinate line found with the search rule of that species to
#failed_first_try/<species>.coords.raw.temp.
#An empty species ID is ignored, because "rm <empty>.*" would target the hidden files
#of the working directory.
set_aside_failed_accession() {
	local target=$1
	[[ -n "$target" ]] || return 0
	local gbk="failed_first_try/$target.mitogenome.gbk"
	local raw="failed_first_try/$target.coords.raw.temp"
	mv "$target.mitogenome.gbk" failed_first_try/
	mv "$target.mitogenome.fa" failed_first_try/
	rm -f -- "$target".*
	#each rule runs once, for its own species only, after the GenBank file has been moved
	case "$target" in
		Frieseomelitta_varia|Mamestra_configurata)
			grep -B 1 cox1 "$gbk" | grep gene | head -n 1 > "$raw"
			;;
		Ooceraea_biroi)
			grep -B 5 cox1 "$gbk" | grep gene | head -n 1 > "$raw"
			;;
		Papilio_protenor)
			grep -B 5 COX1 "$gbk" | grep gene | head -n 1 > "$raw"
			;;
		Schlechtendalia_chinensis|Tagiades_litigiosa)
			grep -B 8 cytochrome "$gbk" | grep CDS | head -n 1 > "$raw"
			;;
		*)
			echo "warning: no species-specific COX1 search rule for $target" >&2
			;;
	esac
}

while read -r target || [[ -n "$target" ]]
do
	set_aside_failed_accession "$target"
done < accessions_failed.txt

#remove the intermediate files of the harvest from the working directory
rm *.fai *.temp *.mitogenome.gbk *.mitogenome.fa *contig_name.txt *.coords.txt

#use species ID in the fasta header for each COX1 sequence
#list.txt receives one species ID per line: the name of each COX1 fasta file without its ".COX1.fa" suffix
for fasta in *.COX1.fa
do
	#when nothing matches, the pattern itself is returned: skip it so that list.txt stays empty
	[[ -e "$fasta" ]] || continue
	printf '%s\n' "${fasta%.COX1.fa}"
done > list.txt

while read -r accession || [[ -n "$accession" ]]
do
	#write the ">species" header line, then put it in front of the bare sequence
	printf '>%s\n' "$accession" > "$accession.header.txt"
	cat "$accession.header.txt" "$accession.COX1.fa" > "$accession.COX1.new.fa"
	mv "$accession.COX1.new.fa" "$accession.COX1.fa"
done < list.txt

#complete the COX1 mining for accessions that failed on the first try
mv accessions_failed.txt failed_first_try/

#Convert the GenBank feature lines stored in file $1 (e.g. "     CDS             1474..3009")
#into the start and end columns of a BED file, printed tab-delimited on stdout.
#GenBank locations are 1-based and inclusive, whereas bedtools reads BED intervals with a
#0-based start and a 1-based end, so the start is lowered by one; without that shift the
#first base of the gene is missing from the extracted sequence.
#A location that is not of the form "start..end" with a numeric start is passed through
#with only ".." turned into a tab, so an unusable location is never silently altered.
genbank_location_to_bed() {
	awk '{
		n = split($2, part, /\.\./)
		if (n >= 2 && part[1] ~ /^[0-9]+$/) part[1] = part[1] - 1
		out = part[1]
		for (i = 2; i <= n; i++) out = out "\t" part[i]
		print out
	}' "$1"
}

#work inside failed_first_try in a subshell: if the folder cannot be entered, none of the
#clean-up below runs in the wrong place, and the rest of the script keeps its working directory
(
	cd failed_first_try || exit 1
	while read -r target || [[ -n "$target" ]]
	do
		#skip blank lines so that no command runs on an empty species name
		[[ -n "$target" ]] || continue
		#get coordinates of the COX1 gene from the downloaded mitogenome, and format those coordinates for the bedtools step below (tab-delimited file with 3 columns: contig name, start bp position, end bp position)
		#only the partial-start marker "<" is stripped from the raw coordinate line here
		gsed -i 's/<//g' "$target.coords.raw.temp"
		genbank_location_to_bed "$target.coords.raw.temp" > "$target.coords.temp"
		#get contig name from the fasta-formated genome (first line, which starts with ">")
		grep ">" "$target.mitogenome.fa" | cut -d ' ' -f 1 > "$target.contig_name.txt"
		gsed -i 's/>//g' "$target.contig_name.txt"
		paste "$target.contig_name.txt" "$target.coords.temp" > "$target.coords.txt"
		#use the bedtools "getfasta" utility to subset the fasta-formatted mitogenome assembly, and obtain the COX1 sequence
		bedtools getfasta -fo "$target.COX1.fa" -tab -fi "$target.mitogenome.fa" -bed "$target.coords.txt"
		#remove the default fasta header (will be replaced later with species ID)
		#two separate passes on purpose: the first puts the sequence on its own line, the second drops the header line
		gsed -i 's/\t/\n/g' "$target.COX1.fa"
		gsed -i -e "1d" "$target.COX1.fa"
	done < accessions_failed.txt
	rm *.fai *.temp *.mitogenome.gbk *.mitogenome.fa *contig_name.txt *.coords.txt

	#use species ID in the fasta header for each COX1 sequence
	#list.txt receives one species ID per line: the name of each COX1 fasta file without its ".COX1.fa" suffix
	for fasta in *.COX1.fa
	do
		#when nothing matches, the pattern itself is returned: skip it so that list.txt stays empty
		[[ -e "$fasta" ]] || continue
		printf '%s\n' "${fasta%.COX1.fa}"
	done > list.txt

	while read -r accession || [[ -n "$accession" ]]
	do
		#write the ">species" header line, then put it in front of the bare sequence
		printf '>%s\n' "$accession" > "$accession.header.txt"
		cat "$accession.header.txt" "$accession.COX1.fa" > "$accession.COX1.new.fa"
		mv "$accession.COX1.new.fa" "$accession.COX1.fa"
	done < list.txt
	rm *.header.txt list.txt
)

#gather the COX1 fasta files of the working directory in the COX1_sequences folder,
#then delete the species list and the header files that were used to build the fasta headers
cox1_dir="COX1_sequences"
mkdir "$cox1_dir"
mv *.COX1.fa "$cox1_dir"/
rm list.txt *.header.txt
