#!/bin/bash
# -p: do not complain when the directory is left over from an earlier run
mkdir -p check_annotated_mitogenomes/

#four species (Frankliniella_occidentalis, Ooceraea_biroi, Papilio_protenor, Schlechtendalia_chinensis) are listed in the file "mitogenomes.to_check.txt"; these taxa have their genes listed in a different format and are excluded at this step (the checks are done manually for those species)

#pre-requisite: a text file (named annotated_mitogenomes.txt) that lists the species with annotated mitogenomes that will be used for the quality check
#pre-requisite: a text file (named mitogenome_genes.txt) listing each of the 13 PCGs expected for the mitogenome

#######################################
# Copy a species list, dropping the species that are checked manually.
# A species is dropped when its line contains one of the skip-list entries
# as a literal substring.
# Arguments: $1 - file listing all species, one per line
#            $2 - file listing the entries to drop, one per line
#            $3 - output file (overwritten)
# Returns:   non-zero if the species list cannot be copied
#######################################
filter_manual_check_species() {
  local all_list=$1 skip_list=$2 out_list=$3
  local prefix
  local scratch="${out_list%.txt}.temp"

  cp -- "$all_list" "$out_list" || return

  if [[ ! -r $skip_list ]]; then
    printf 'warning: %s not readable; no species filtered out\n' "$skip_list" >&2
    return 0
  fi

  # plain "read" (no IFS=) trims surrounding blanks from each entry;
  # the "|| [[ -n ... ]]" keeps a last line that has no trailing newline
  while read -r prefix || [[ -n $prefix ]]; do
    # a blank entry would make grep take the file name as its pattern and
    # read the rest of the skip list from the loop's stdin instead
    [[ -n $prefix ]] || continue
    # -F: the entry is a literal name, not a regular expression
    grep -v -F -- "$prefix" "$out_list" > "$scratch"
    mv -- "$scratch" "$out_list"
  done < "$skip_list"
}

filter_manual_check_species annotated_mitogenomes.txt mitogenomes.to_check.txt annotated_mitogenomes.filtered.txt

#first, check that none of the 13 expected PCGs are missing
#then, for the second check (gene order), group the annotations by the first gene listed in the annotation and record the last gene of each of them

#######################################
# Print the gene names annotated in a GenBank file, one per line, in file
# order. tRNA/rRNA entries (trn, LSU, SSU, rrn, rRNA) are skipped and the
# alternative spellings (ND*, ATP*, CO*, COX*, CYTB/cytB) are rewritten to
# the lower-case nad*/atp*/cox*/cob form.
# Arguments: $1 - path to a .gbk file
# Outputs:   gene names on stdout; nothing when the file has no gene lines
#######################################
list_pcg_names() {
  # sed runs as a filter (no -i), so nothing here needs GNU sed.
  # Longer spellings are rewritten before their prefixes (ND4L before ND4,
  # COIII before COII before COI).
  grep -w 'gene=' -- "$1" \
    | grep -v -e trn -e LSU -e SSU -e rrn -e rRNA \
    | sed \
      -e 's/\/gene="//g' \
      -e 's/"//g' \
      -e 's/ND4L/nad4l/g' \
      -e 's/nad4L/nad4l/g' \
      -e 's/ND1/nad1/g' \
      -e 's/ND2/nad2/g' \
      -e 's/ND3/nad3/g' \
      -e 's/ND4/nad4/g' \
      -e 's/ND5/nad5/g' \
      -e 's/ND6/nad6/g' \
      -e 's/ATP6/atp6/g' \
      -e 's/ATP8/atp8/g' \
      -e 's/COIII/cox3/g' \
      -e 's/COII/cox2/g' \
      -e 's/COI/cox1/g' \
      -e 's/COX1/cox1/g' \
      -e 's/COX2/cox2/g' \
      -e 's/COX3/cox3/g' \
      -e 's/CYTB/cob/g' \
      -e 's/cytB/cob/g' \
      -e 's/^[[:space:]]*//' \
      -e 's/[[:space:]]*$//'
}

#######################################
# Copy the annotation of every listed species into the working directory.
# Arguments: $1 - species list, one per line
#            $2 - directory holding <species>.mitogenome.gbk
#            $3 - destination directory
#######################################
stage_annotations() {
  local species_list=$1 src_dir=$2 dest_dir=$3
  local target

  if [[ ! -r $species_list ]]; then
    printf 'error: cannot read %s\n' "$species_list" >&2
    return 1
  fi
  while read -r target || [[ -n $target ]]; do
    [[ -n $target ]] || continue
    cp -- "$src_dir/$target.mitogenome.gbk" "$dest_dir/"
  done < "$species_list"
}

#######################################
# Print every species whose annotation lacks at least one expected gene.
# Arguments: $1 - species list, one per line
#            $2 - list of expected gene names, one per line
#            $3 - directory holding <species>.mitogenome.gbk
# Outputs:   species names on stdout, one per line
#######################################
find_annotations_missing_pcgs() {
  local species_list=$1 genes_list=$2 gbk_dir=$3
  local target gene names
  local -a expected=()

  if [[ ! -r $species_list || ! -r $genes_list ]]; then
    printf 'error: cannot read %s or %s\n' "$species_list" "$genes_list" >&2
    return 1
  fi
  while read -r gene || [[ -n $gene ]]; do
    [[ -n $gene ]] && expected+=("$gene")
  done < "$genes_list"

  while read -r target || [[ -n $target ]]; do
    [[ -n $target ]] || continue
    names=$(list_pcg_names "$gbk_dir/$target.mitogenome.gbk")
    for gene in "${expected[@]}"; do
      # -w: whole-word match, so an annotated nad4l does not stand in for
      # a missing nad4
      if ! grep -q -w -F -- "$gene" <<< "$names"; then
        printf '%s\n' "$target"
        break
      fi
    done
  done < "$species_list"
}

#######################################
# Group the annotations by their first gene. For every expected gene G two
# row-aligned files are written to the current directory:
#   G_first.txt                           species whose first gene is G
#   last_gene_for_G_first_annotations.txt last gene of those species
# Both files are created (possibly empty) for every expected gene.
# Annotations without any gene line are left out of all groups.
# Arguments: $1 - species list, one per line
#            $2 - list of expected gene names, one per line
#            $3 - directory holding <species>.mitogenome.gbk
#######################################
split_annotations_by_first_gene() {
  local species_list=$1 genes_list=$2 gbk_dir=$3
  local target gene names first last
  local -a expected=()

  if [[ ! -r $species_list || ! -r $genes_list ]]; then
    printf 'error: cannot read %s or %s\n' "$species_list" "$genes_list" >&2
    return 1
  fi
  while read -r gene || [[ -n $gene ]]; do
    [[ -n $gene ]] || continue
    expected+=("$gene")
    # start empty so nothing is appended to leftovers of an earlier run
    : > "${gene}_first.txt"
    : > "last_gene_for_${gene}_first_annotations.txt"
  done < "$genes_list"

  while read -r target || [[ -n $target ]]; do
    [[ -n $target ]] || continue
    names=$(list_pcg_names "$gbk_dir/$target.mitogenome.gbk")
    # species and gene are written together below, so an annotation with
    # no gene line cannot shift the rows of the species that follow it
    [[ -n $names ]] || continue
    first=$(head -n 1 <<< "$names")
    last=$(tail -n 1 <<< "$names")
    for gene in "${expected[@]}"; do
      # whole-word match on the gene name only: nad4l-first annotations do
      # not land in the nad4 group, and the species name is not searched
      if grep -q -w -F -- "$gene" <<< "$first"; then
        printf '%s\n' "$target" >> "${gene}_first.txt"
        printf '%s\n' "$last" >> "last_gene_for_${gene}_first_annotations.txt"
      fi
    done
  done < "$species_list"
}

stage_annotations annotated_mitogenomes.filtered.txt ../../../mitogenomes/mitogenome_assemblies ./check_annotated_mitogenomes

#remove annotations for which at least one of the expected genes was not found
find_annotations_missing_pcgs annotated_mitogenomes.filtered.txt mitogenome_genes.txt ./check_annotated_mitogenomes > to_exclude.txt

#the second check: keep only annotations that have the expected gene order (as estimated based on the ID of the first and the last gene in the annotation)
#find which gene is the first listed in the annotation; for all annotations that have the same starting gene, get the ID of the last gene
split_annotations_by_first_gene annotated_mitogenomes.filtered.txt mitogenome_genes.txt ./check_annotated_mitogenomes

#for annotations that start with a given gene, the last gene in the annotation should be one of two expected genes; annotations that do not match this expectation are excluded

#######################################
# Print the species of one first-gene group whose last gene is neither of
# the two allowed genes.
# Reads <first>_first.txt (species) and
# last_gene_for_<first>_first_annotations.txt (their last genes, same row
# order) from the current directory.
# Arguments: $1 - first gene of the group
#            $2 - first allowed last gene
#            $3 - second allowed last gene
# Outputs:   species to exclude on stdout, one per line
#######################################
flag_unexpected_gene_order() {
  local first=$1 ok_a=$2 ok_b=$3
  local species_file="${first}_first.txt"
  local last_file="last_gene_for_${first}_first_annotations.txt"

  # no annotation starts with this gene: nothing to check
  [[ -s $species_file ]] || return 0
  if [[ ! -r $last_file ]]; then
    printf 'warning: %s missing; %s group not checked\n' "$last_file" "$first" >&2
    return 0
  fi

  # Only the last-gene column is examined, and gene names are compared as
  # whole words: a species name that happens to contain a gene name, or
  # nad4l where nad4 is expected, no longer counts as a match.
  paste "$species_file" "$last_file" \
    | awk -F '\t' -v a="$ok_a" -v b="$ok_b" '
        $1 == "" { next }
        {
          n = split($2, word, /[^A-Za-z0-9_]+/)
          for (i = 1; i <= n; i++)
            if (word[i] == a || word[i] == b) next
          print $1
        }'
}

# columns: first gene, then the two genes accepted as the last gene.
# The accepted genes are the two neighbours of the first gene when the order
# nad2 cox1 cox2 atp8 atp6 cox3 nad3 nad5 nad4 nad4l nad6 cob nad1 is read as
# a circle (e.g. atp8 sits between cox2 and atp6).
while read -r first_gene allowed_a allowed_b; do
  flag_unexpected_gene_order "$first_gene" "$allowed_a" "$allowed_b"
done >> to_exclude.txt <<'EOF'
atp6 atp8 cox3
atp8 atp6 cox2
cob nad1 nad6
cox1 cox2 nad2
cox2 atp8 cox1
cox3 atp6 nad3
nad1 cob nad2
nad2 cox1 nad1
nad3 cox3 nad5
nad4l nad4 nad6
nad4 nad4l nad5
nad5 nad3 nad4
nad6 cob nad4l
EOF
rm -f -- *_first.txt *first_annotations.txt

sort to_exclude.txt | uniq > to_exclude.uniq.txt
mv to_exclude.uniq.txt to_exclude.txt

