#!/bin/bash
# Build one "coding-only" sequence per species: the CDS and rRNA regions
# annotated in <species>.mitogenome.gbk are cut out of
# <species>.mitogenome.fa with bedtools and concatenated into
# <output dir>/<species>.mitogenome_coding.txt (FASTA header + one line).
#
# Requires: bedtools, plus standard grep/sed/cut/awk/tr.
# Environment overrides (defaults reproduce the historical behaviour):
#   MITO_SPECIES_LIST  file with one species name per line
#   MITO_ASSEMBLY_DIR  directory holding the .gbk/.fa assemblies
#   MITO_OUT_DIR       directory receiving the *.mitogenome_coding.txt files
#   MITO_MAX_FEATURES  maximum number of features kept per species
#                      (15 = 13 protein-coding genes + 2 rRNA genes)
MITO_SPECIES_LIST="${MITO_SPECIES_LIST:-NCBI_annotated_mitogenomes_with_nuclear_genome.txt}"
MITO_ASSEMBLY_DIR="${MITO_ASSEMBLY_DIR:-../../mitogenomes/mitogenome_assemblies}"
MITO_OUT_DIR="${MITO_OUT_DIR:-NCBI_annotated_mitogenomes_coding_only}"
MITO_MAX_FEATURES="${MITO_MAX_FEATURES:-15}"

#######################################
# Convert the CDS and rRNA locations of a GenBank file to 3-column BED.
# All CDS lines are emitted first, then the rRNA lines (not genomic order).
# For split locations (join(...)) only the first segment is kept.
# Lines that do not reduce to a numeric "start..end" pair are dropped, and
# start/end are validated together so pairs can never get misaligned.
# GenBank locations are 1-based inclusive while BED is 0-based half-open,
# so the BED start is written as start - 1.
# The BED has no strand column: complement() features are reported on the
# forward strand.
# Arguments: $1 - GenBank file
#            $2 - contig name to put in BED column 1
#            $3 - maximum number of features to emit (default 15)
# Outputs:   BED lines (contig<TAB>start<TAB>end) on stdout
#######################################
gbk_features_to_bed() {
  local gbk=$1 contig=$2 limit=${3:-15}
  {
    grep 'CDS' "$gbk"
    grep -w 'rRNA ' "$gbk"
  } \
    | sed -e 's/CDS//g' -e 's/rRNA//g' -e 's/ //g' \
      -e 's/complement(//g' -e 's/(//g' -e 's/)//g' \
      -e 's/join//g' -e 's/<//g' -e 's/>//g' \
    | cut -d ',' -f 1 \
    | awk -v contig="$contig" -v limit="$limit" '
        {
          # "100..200" splits into "100", "", "200"
          n = split($0, piece, "[.]")
          first = piece[1]
          last = (n >= 3) ? piece[3] : ""
          if (first !~ /^[0-9]+$/ || last !~ /^[0-9]+$/) next
          if (first + 0 < 1 || kept >= limit + 0) next
          kept++
          printf "%s\t%d\t%s\n", contig, first - 1, last
        }'
}

#######################################
# Produce <MITO_OUT_DIR>/<species>.mitogenome_coding.txt for one species.
# Runs in a subshell so its variables and its EXIT trap stay private; all
# intermediates (copies of the inputs, BED file, bedtools output and the
# .fai index bedtools creates) live in a temporary directory that is
# removed on every exit path.
# Arguments: $1 - species name (file name prefix)
# Returns:   0 on success, non-zero if inputs are missing or a step fails
#######################################
process_species() (
  species=$1
  gbk_src="${MITO_ASSEMBLY_DIR}/${species}.mitogenome.gbk"
  fa_src="${MITO_ASSEMBLY_DIR}/${species}.mitogenome.fa"

  for input in "$gbk_src" "$fa_src"; do
    if [[ ! -r "$input" ]]; then
      printf 'error: cannot read %s\n' "$input" >&2
      exit 1
    fi
  done

  work=$(mktemp -d) || exit 1
  trap 'rm -rf -- "$work"' EXIT

  # Work on copies so the index written by bedtools does not end up in the
  # assembly directory.
  cp -- "$gbk_src" "$fa_src" "$work"/ || exit 1
  gbk="${work}/${species}.mitogenome.gbk"
  fa="${work}/${species}.mitogenome.fa"

  # First FASTA header, cut at the first space (">" kept for the output).
  header=$(grep '>' "$fa" | head -n 1 | cut -d ' ' -f 1)
  if [[ -z "$header" ]]; then
    printf 'error: no FASTA header found in %s\n' "$fa_src" >&2
    exit 1
  fi
  contig=${header//>/}

  bed="${work}/coordinates.bed"
  gbk_features_to_bed "$gbk" "$contig" "$MITO_MAX_FEATURES" > "$bed"

  sequence=''
  if [[ -s "$bed" ]]; then
    # -tab gives "name<TAB>sequence" per feature; column 2 is the sequence.
    bedtools getfasta -tab -fi "$fa" -bed "$bed" -fo "${work}/coding.tab" \
      || exit 1
    sequence=$(cut -f 2 "${work}/coding.tab" | tr -d '\n')
  else
    printf 'warning: no CDS/rRNA coordinates found for %s\n' "$species" >&2
  fi

  # Header line followed by the concatenated coding regions on one line.
  {
    printf '%s\n' "$header"
    [[ -z "$sequence" ]] || printf '%s\n' "$sequence"
  } > "${MITO_OUT_DIR}/${species}.mitogenome_coding.txt"
)

#######################################
# Process every species named in MITO_SPECIES_LIST.
# Returns: 0 if all species succeeded, non-zero otherwise
#######################################
main() {
  local list=$MITO_SPECIES_LIST
  local species
  local failed=0

  if [[ ! -r "$list" ]]; then
    printf 'error: cannot read species list %s\n' "$list" >&2
    return 1
  fi
  if ! command -v bedtools > /dev/null; then
    printf 'error: bedtools not found in PATH\n' >&2
    return 1
  fi
  mkdir -p -- "$MITO_OUT_DIR" || return 1

  # The list is read on fd 3 so nothing inside the loop can consume it
  # through stdin; the extra test keeps a last line without a newline.
  while read -r species <&3 || [[ -n "$species" ]]; do
    [[ -n "$species" ]] || continue
    if ! process_species "$species"; then
      printf 'warning: skipped %s\n' "$species" >&2
      failed=$((failed + 1))
    fi
  done 3< "$list"

  ((failed == 0))
}

main "$@"
