#!/bin/bash
# Build <species>.mitogenome_coding.txt for one species in the current directory.
#
# Steps:
#   1. keep the BED lines that do not contain "trn" (i.e. drop tRNA features,
#      leaving the CDS and rRNA coordinates);
#   2. rewrite the FASTA header(s) to their first space-delimited field with
#      every "." removed, and write a local copy of the assembly with that header;
#   3. extract the remaining BED intervals with `bedtools getfasta -tab`;
#   4. join the extracted sequences into a single line and write
#      header line(s) + joined sequence to <species>.mitogenome_coding.txt.
# All intermediate files are removed whether or not the steps succeed; on
# failure the (possibly partial) output file is removed as well, so an empty
# result can never be mistaken for a real one.
#
# Arguments: $1 - species name
#            $2 - directory holding <species>.mitogenome.fa
#            $3 - directory holding <species>.bed
# Outputs:   a warning on stderr when the species is skipped
# Returns:   0 on success, 1 if any step failed
_build_coding_file() {
  local species=$1
  local src_fa="$2/${species}.mitogenome.fa"
  local src_bed="$3/${species}.bed"
  local fa="${species}.mitogenome.fa"
  local bed="${species}.coordinates.bed"
  local header="${species}.header.txt"
  local tab="${species}.mitogenome_coding.fa"
  local out="${species}.mitogenome_coding.txt"
  local err=''

  if [[ ! -r "$src_fa" ]]; then
    err="cannot read assembly ${src_fa}"
  elif [[ ! -r "$src_bed" ]]; then
    err="cannot read annotation ${src_bed}"
  elif ! grep -v trn -- "$src_bed" > "$bed" || [[ ! -s "$bed" ]]; then
    err="no non-tRNA features in ${src_bed}"
  elif ! grep '>' -- "$src_fa" | cut -d ' ' -f 1 | sed 's/\.//g' > "$header" \
    || [[ ! -s "$header" ]]; then
    err="no FASTA header in ${src_fa}"
  elif ! { cat -- "$header"; grep -v '>' -- "$src_fa"; } > "$fa"; then
    err="no sequence lines in ${src_fa}"
  # stdin is pointed at /dev/null so bedtools can never consume the species
  # list that the calling loop is reading from.
  elif ! bedtools getfasta -fo "$tab" -tab -fi "$fa" -bed "$bed" < /dev/null \
    || [[ ! -s "$tab" ]]; then
    err="bedtools getfasta produced no sequences"
  # Column 2 of the -tab output is the sequence; tr joins all features into
  # one line and printf restores the single trailing newline.
  elif ! { cat -- "$header"; cut -f 2 -- "$tab" | tr -d '\n'; printf '\n'; } > "$out"; then
    err="could not write ${out}"
  fi

  # -f: some intermediates may not exist if an early step failed.
  rm -f -- "$bed" "$header" "$fa" "${fa}.fai" "$tab"

  if [[ -n "$err" ]]; then
    rm -f -- "$out"
    printf 'warning: skipping %s: %s\n' "$species" "$err" >&2
    return 1
  fi
}

# Build a coding-only sequence file for every species named in a list file.
#
# The list holds one species name per line; surrounding whitespace is trimmed,
# blank lines are ignored, and a final line without a trailing newline is still
# processed. A species that fails is reported and skipped; the remaining
# species are still processed.
#
# Arguments: $1 - species list
#                 (default: NCBI_nonannotated_mitogenomes_with_nuclear_genome.txt)
# Returns:   0 if every species was processed, 1 if the list is unreadable or
#            at least one species was skipped
extract_coding_regions() {
  local list=${1:-NCBI_nonannotated_mitogenomes_with_nuclear_genome.txt}
  local fa_dir=../../mitogenomes/mitogenome_assemblies
  local bed_dir=../mitogenome_annotation_source/NCBI_nonannotated
  local species
  local failed=0

  if [[ ! -r "$list" ]]; then
    printf 'error: cannot read species list: %s\n' "$list" >&2
    return 1
  fi

  while read -r species || [[ -n "$species" ]]; do
    [[ -n "$species" ]] || continue
    _build_coding_file "$species" "$fa_dir" "$bed_dir" || failed=1
  done < "$list"

  return "$failed"
}

extract_coding_regions

# Move every *.mitogenome_coding.txt file from the current directory into
# NCBI_non_annotated_mitogenomes_coding_only/.
#
# The directory is created if needed (and reused if it already exists, so the
# script can be re-run). The glob is expanded into an array first so that an
# unmatched pattern is reported instead of being handed to mv literally.
#
# Outputs: a warning on stderr when there is nothing to move
# Returns: 0 if the files were moved, non-zero if the directory could not be
#          created, there were no files, or mv failed
collect_coding_files() {
  local out_dir=NCBI_non_annotated_mitogenomes_coding_only
  local -a outputs=()
  local f

  mkdir -p -- "$out_dir" || return

  for f in *.mitogenome_coding.txt; do
    if [[ -e "$f" ]]; then
      outputs+=("$f")
    fi
  done

  if (( ${#outputs[@]} == 0 )); then
    printf 'warning: no *.mitogenome_coding.txt files to move into %s\n' "$out_dir" >&2
    return 1
  fi

  mv -- "${outputs[@]}" "$out_dir"/
}

collect_coding_files
