#!/bin/bash
#make a list of accessions with annotations: one accession per line, i.e. the name of
#every singletons/*.bed file with the directory prefix and the .bed extension removed
ls singletons/*.bed | gsed -e 's/\.bed//g' -e 's/singletons\///g' > list.txt

#copy the raw annotations into the working directory, where they are renamed and filtered
cp singletons/*.bed .

#make a tab-separated table matching species IDs (column 1) with scaffold IDs (column 2)
#the species ID is the name of a *.candidate_mitogenome.fa file without directory and extension;
#the scaffold ID is a header line of that file with every '>', '_' and '.' removed

#print the header line(s) of the FASTA file $1 with every '>', '_' and '.' removed
clean_scaffold_ids() {
	grep '^>' "$1" | tr -d '>_.'
}

#start from an empty table so that leftovers of an interrupted run cannot leak into it
: > species_and_scaffold_IDs.txt

for fasta in ../candidate_mitogenome_scaffolds/singletons/*.candidate_mitogenome.fa
do
	[ -e "$fasta" ] || continue	#the glob matched nothing
	species=${fasta##*/}
	species=${species%.candidate_mitogenome.fa}
	#each row is written together with its species ID, so a file with no header or with
	#several headers cannot shift the rows of the other species; only the header text
	#is cleaned, the species ID is kept exactly as it appears in the file name
	clean_scaffold_ids "$fasta" | while IFS= read -r scaffold_id
	do
		printf '%s\t%s\n' "$species" "$scaffold_id"
	done >> species_and_scaffold_IDs.txt
done

#rename the annotated scaffolds: <accession>.bed becomes <species>.bed, the species being
#column 1 of every species_and_scaffold_IDs.txt row that contains the accession as a whole word
while read accession
do
	grep -w $accession species_and_scaffold_IDs.txt | cut -f 1 > species_ID.txt
	while read species
	do
		mv $accession.bed $species.bed
	done < species_ID.txt
done < list.txt

#the lookup table and both lists are not needed any more
rm species_and_scaffold_IDs.txt species_ID.txt list.txt

#tabulate the number of CDS and of cox1 copies in each of the annotation files
#CDS_per_species.txt columns: species, number of lines mentioning neither trn nor rrn,
#number of lines mentioning cox1
ls *.bed | gsed 's/\.bed//g' > list.txt

while read species
do
	grep -v -e trn -e rrn $species.bed | wc -l >> counts
	grep cox1 $species.bed | wc -l >> cox1.counts.txt
done < list.txt

paste list.txt counts cox1.counts.txt > CDS_per_species.txt
rm list.txt counts cox1.counts.txt

#first filter: remove annotations with more than one cox1 copy or with fewer than 13 protein-coding genes
#(the expectation is 13 genes, but more can occur if a given CDS is split by the annotation software - requiring manual editing)
#column 2 of CDS_per_species.txt is the CDS count, column 3 the cox1 count
awk '$3>1 || $2<13 {print $1}' CDS_per_species.txt | sort | uniq > annotations_to_remove.txt

while read target
do
	rm $target.bed
done < annotations_to_remove.txt
rm annotations_to_remove.txt CDS_per_species.txt

##second filter: keep only annotations for which all of the expected protein-coding genes have been found
##a text file listing those genes, one per line (mitogenome_genes.txt), is needed at this step
##outputs: list.txt (all annotations still present) and to_exclude.txt (annotations missing a gene)

#print how many features of the BED file $2 are named after gene $1
#the gene must appear in the name column (column 4) as a whole word, i.e. not be preceded
#or followed by a letter or a digit: "nad4_0" and "nad4-1" count as nad4, "nad4l" does not
count_gene_features() {
	awk -F '\t' -v gene="$1" '
		BEGIN { pattern = "(^|[^A-Za-z0-9])" gene "([^A-Za-z0-9]|$)" }
		$4 ~ pattern { hits++ }
		END { print hits + 0 }' "$2"
}

ls *.bed | gsed 's/\.bed//g' > list.txt

#list the annotations for which one of the expected genes was not found
#(no per-gene count files are kept, so nothing stale can be picked up from an earlier run)
: > to_exclude.txt
while IFS= read -r species
do
	while IFS= read -r gene
	do
		[ -n "$gene" ] || continue	#ignore blank lines in the gene list
		hits=$(count_gene_features "$gene" "$species.bed")
		if [ "${hits:-0}" -eq 0 ]
		then
			printf '%s\n' "$species" >> to_exclude.txt
			break	#one missing gene is enough to exclude the annotation
		fi
	done < mitogenome_genes.txt
done < list.txt

#third filter: keep only annotations that have the expected gene order (as estimated based on the ID of the first and the last gene in the annotation)
#the annotations listed in list.txt are checked; those that fail are appended to to_exclude.txt
#gene names are compared exactly, so nad4 and nad4l are never confused and species names cannot match a gene

#print "first<TAB>last" for the BED file $1, or nothing if it has no protein-coding feature
#only lines mentioning neither trn nor rrn are considered; a gene name is column 4 up to the first '-' or '_'
#first is the gene of the first such line; last is the gene of the last line whose gene differs from first
#(so a first gene that shows up again at the end of the file is skipped); last is empty if there is no other gene
first_and_last_gene() {
	grep -v -e trn -e rrn "$1" | awk -F '\t' '
		{ name = $4; sub(/[-_].*/, "", name) }
		NR == 1 { first = name }
		name != first { last = name }
		END { if (NR > 0) print first "\t" last }'
}

#print the two genes accepted as last gene of an annotation that starts with gene $1
#prints nothing when $1 is not one of the 13 expected protein-coding genes
expected_last_genes() {
	case "$1" in
		atp6) echo "atp8 cox3" ;;
		atp8) echo "atp6 cox2" ;;
		cob) echo "nad1 nad6" ;;
		cox1) echo "cox2 nad2" ;;
		cox2) echo "atp8 cox1" ;;
		cox3) echo "atp6 nad3" ;;
		nad1) echo "cob nad2" ;;
		nad2) echo "cox1 nad1" ;;
		nad3) echo "cox3 nad5" ;;
		nad4l) echo "nad4 nad6" ;;
		nad4) echo "nad4l nad5" ;;
		nad5) echo "nad3 nad4" ;;
		nad6) echo "cob nad4l" ;;
	esac
}

#succeed when last gene $2 is accepted for first gene $1
#an annotation whose first gene is not one of the 13 expected genes is not checked and always passes
gene_order_is_expected() {
	case " $(expected_last_genes "$1") " in
		"  ") return 0 ;;
		*" $2 "*) return 0 ;;
	esac
	return 1
}

while IFS= read -r species
do
	ends=$(first_and_last_gene "$species.bed")
	[ -n "$ends" ] || continue	#no protein-coding feature: nothing to check
	first_gene=${ends%%$'\t'*}
	last_gene=${ends#*$'\t'}
	gene_order_is_expected "$first_gene" "$last_gene" || printf '%s\n' "$species" >> to_exclude.txt
done < list.txt

#delete every annotation listed in to_exclude.txt (each species is handled once)
sort to_exclude.txt | uniq | while read species
do
	rm $species.bed
done

#the annotations that are left passed the filters above
mkdir filtered_singleton_annotations
mv *.bed filtered_singleton_annotations/
rm to_exclude.txt list.txt


#the last filter takes into account the largest size observed in the annotated mitogenomes batch (i.e. 20873 bp), and excludes assemblies larger than this size
#for every remaining annotation the matching candidate mitogenome is copied next to it; if its
#sequence is longer than 20873 characters, both the annotation and the copied assembly are deleted

#print the total number of sequence characters in the FASTA file $1
#header lines are skipped and the lengths of all other lines are added up, so a sequence
#wrapped over several lines gives a single, correct size
fasta_sequence_length() {
	awk '!/^>/ { total += length($0) } END { print total + 0 }' "$1"
}

#nothing below may run outside filtered_singleton_annotations/, since files are deleted by pattern
if cd filtered_singleton_annotations/
then
	for annotation in *.bed
	do
		[ -e "$annotation" ] || continue	#the glob matched nothing
		target=${annotation%.bed}
		#an annotation whose assembly cannot be copied is left untouched (cp reports the error)
		cp "../../candidate_mitogenome_scaffolds/singletons/$target.candidate_mitogenome.fa" ./ || continue
		if [ "$(fasta_sequence_length "$target.candidate_mitogenome.fa")" -gt 20873 ]
		then
			rm -- "$target".*
		fi
	done

	mkdir filtered_mitogenomes
	mv *.candidate_mitogenome.fa filtered_mitogenomes/
else
	echo "cannot enter filtered_singleton_annotations/: size filter skipped" >&2
fi

#rename candidate mitogenomes that passed all filters:
#<name>.candidate_mitogenome.fa becomes <name>.mitogenome.fa (the names are kept in list.txt)
cd filtered_mitogenomes/
ls *.fa | gsed 's/\.candidate_mitogenome\.fa//g' > list.txt

while read target
do
	mv $target.candidate_mitogenome.fa $target.mitogenome.fa
done < list.txt

