#!/usr/bin/env bash
# Make a list of the nuclear genome assemblies to search (list.txt, one
# assembly prefix per line) and create the directory that will receive the
# linearized backups.

# list_genome_prefixes DIR
#   Print, one per line, the name of every DIR/*.nuclear_genome.fa file with
#   the directory and the ".nuclear_genome.fa" suffix removed.
#   Prints nothing when DIR holds no such file.
list_genome_prefixes() {
  local dir=$1 path name
  for path in "$dir"/*.nuclear_genome.fa; do
    # An unmatched glob is left as the literal pattern; skip it.
    [[ -e "$path" ]] || continue
    name=${path##*/}
    printf '%s\n' "${name%.nuclear_genome.fa}"
  done
}

list_genome_prefixes ../nuclear_genomes/nuclear_accessions > list.txt

# -p: do not fail when the directory is left over from a previous run.
mkdir -p backup_genomes

# linearize_fasta IN OUT
#   Write IN to OUT with the sequence of every record joined onto one line.
#   Each header is emitted preceded by a newline, so OUT starts with an empty
#   line whenever IN starts with a header.
linearize_fasta() {
  awk '/^>/ { printf("\n%s\n", $0); next } { printf("%s", $0) } END { printf("\n") }' \
    "$1" > "$2"
}

# drop_mitochondrion_scaffolds IN OUT
#   Copy the linearized FASTA IN to OUT without the scaffolds labelled as
#   mitochondrial. IN is read twice: the first pass records the ID (first
#   whitespace-delimited word of the header) of every header containing
#   'mitochondrion'; the second pass prints only records whose ID was not
#   recorded. Lines before the first header are not printed.
drop_mitochondrion_scaffolds() {
  awk '
    NR == FNR { if (/^>/ && /mitochondrion/) drop[$1] = 1; next }
    /^>/      { keep = !($1 in drop) }
    keep
  ' "$1" "$1" > "$2"
}

# For every assembly prefix in list.txt: linearize the assembly, write a copy
# with the 'mitochondrion' scaffolds removed, and move the linearized file to
# backup_genomes/.
while IFS= read -r prefix || [[ -n "$prefix" ]]; do
  [[ -n "$prefix" ]] || continue

  source_fa="../nuclear_genomes/nuclear_accessions/${prefix}.nuclear_genome.fa"
  linear_fa="${prefix}.nuclear_genome.linearized.fa"
  clean_fa="${prefix}.nuclear_genome.linearized.mtDNA_clean.fa"

  # Do not leave empty outputs behind when the assembly cannot be read.
  if ! linearize_fasta "$source_fa" "$linear_fa"; then
    printf 'warning: could not linearize %s; skipping\n' "$source_fa" >&2
    rm -f -- "$linear_fa"
    continue
  fi

  drop_mitochondrion_scaffolds "$linear_fa" "$clean_fa"
  mv -- "$linear_fa" backup_genomes/
done < list.txt

# Make a list of scaffolds identified as mitogenomes based on BLAST analyses
# (even if they are not labelled as 'mitochondrion'): copy their bed files
# into a local staging directory and write the genus_species prefix of each
# one to list2.txt.

# list_bed_prefixes DIR
#   Print, one per line, the first two '_'-separated fields of the name of
#   every DIR/*.bed file (directory and ".bed" extension removed). Each prefix
#   is printed once, in first-seen order, even when several bed files share it.
list_bed_prefixes() {
  local dir=$1 path name
  for path in "$dir"/*.bed; do
    # An unmatched glob is left as the literal pattern; skip it.
    [[ -e "$path" ]] || continue
    name=${path##*/}
    printf '%s\n' "${name%.bed}"
  done | cut -d '_' -f 1,2 | awk '!seen[$0]++'
}

# -p: do not fail when the directory is left over from a previous run.
mkdir -p mitogenomes_mined_from_nuclear_genome/
cp -- ../mitogenome_COI_mining/mitogenomes_mined_from_nuclear_genomes/annotated_scaffolds/filtered_singleton_annotations/*.bed ./mitogenomes_mined_from_nuclear_genome/
cp -- ../mitogenome_COI_mining/mitogenomes_mined_from_nuclear_genomes/annotated_scaffolds/filtered_doubles_annotations/filtered_bed_files/*.bed ./mitogenomes_mined_from_nuclear_genome/

list_bed_prefixes mitogenomes_mined_from_nuclear_genome > list2.txt

# Remove the mitogenome scaffolds listed in the staged bed files from the
# cleaned nuclear assemblies. For every genus_species prefix in list2.txt the
# scaffold name is read from each matching bed file, a "." is added before its
# last character (since this is the format in the nuclear genomes), and the
# scaffolds with those names are dropped from
# <prefix>.nuclear_genome.linearized.mtDNA_clean.fa. The staging directory is
# deleted at the end.

# bed_scaffold_id BED
#   Print the first tab-separated field of the first line of BED with a "."
#   inserted before its last character. Prints nothing when BED is empty or
#   that field is empty.
bed_scaffold_id() {
  awk -F '\t' '
    NR == 1 {
      n = length($1)
      if (n > 0) print substr($1, 1, n - 1) "." substr($1, n)
      exit
    }
  ' "$1"
}

# remove_scaffolds IDS IN OUT
#   Copy the FASTA file IN to OUT, omitting every record whose header ID
#   (first whitespace-delimited word, without the leading ">") is the first
#   word of a line in IDS. Lines before the first header are not printed.
#   The IDs are loaded in BEGIN so that an empty IDS file keeps every record.
remove_scaffolds() {
  awk -v ids="$1" '
    BEGIN {
      while ((getline entry < ids) > 0)
        if (split(entry, tok, " ") > 0) drop[">" tok[1]] = 1
    }
    /^>/ { keep = !($1 in drop) }
    keep
  ' "$2" > "$3"
}

ids_file=$(mktemp "${TMPDIR:-/tmp}/mitogenome_ids.XXXXXX") || exit 1

while IFS= read -r prefix || [[ -n "$prefix" ]]; do
  [[ -n "$prefix" ]] || continue

  genome="${prefix}.nuclear_genome.linearized.mtDNA_clean.fa"
  filtered="${prefix}.nuclear_genome.linearized.mtDNA_clean.new_mitogenomes_removed.fa"

  # Collect the scaffold ID of every bed file belonging to this prefix; a
  # species may have more than one.
  : > "$ids_file"
  for bed in "mitogenomes_mined_from_nuclear_genome/${prefix}.bed" \
             "mitogenomes_mined_from_nuclear_genome/${prefix}"_*.bed; do
    [[ -f "$bed" ]] || continue
    bed_scaffold_id "$bed" >> "$ids_file"
  done

  if [[ ! -s "$ids_file" ]]; then
    printf 'warning: no mitogenome scaffold found for %s; nothing removed\n' "$prefix" >&2
    continue
  fi
  if [[ ! -f "$genome" ]]; then
    printf 'warning: %s not found; skipping\n' "$genome" >&2
    continue
  fi

  # Replace the assembly only when filtering succeeded, so a failure never
  # leaves an empty or truncated genome in its place.
  if remove_scaffolds "$ids_file" "$genome" "$filtered"; then
    mv -- "$filtered" "$genome"
  else
    printf 'warning: filtering %s failed; file left unchanged\n' "$genome" >&2
    rm -f -- "$filtered"
  fi
done < list2.txt

rm -f -- "$ids_file"
rm -r -- mitogenomes_mined_from_nuclear_genome/