#!/bin/bash
## Remove gap columns from the aligned FASTA file (curated data), using HPV16 as the reference.
# Pipeline overview:
#   1. Pull the reference record (hom_sap_HPV16) out of the alignment with seqtk.
#   2. Run 03_extra_gap_regions.py on that reference to write gap_regions.txt.
#   3. Pass the contents of gap_regions.txt to trimal -selectcols to write the
#      gap-removed alignment.
# Outputs (in $outdir): <prefix>.reference.fa, gap_regions.txt, <prefix>.rmgap.fa
# Optional environment: prefix - output file stem (defaults to the input stem).

# Abort on the first failing step, on unset variables, and on pipeline errors,
# so later steps never run on missing or partial intermediate files.
set -euo pipefail

date

workdir="/path/to/Example_Project/Genome_alignment/Alignment_fa_rmgap"
outdir="/path/to/output"
scripts_dir="/path/to/scripts"
trimal_dir="/path/to/trimal" ##v1.4.1
seqtk_dir="/path/to/seqtk"  ##v1.3
python_dir="/path/to/python" #v3.9.12
input_fa="example_cactus.fa"  ##aligned fasta file from curated data
reference_strain="hom_sap_HPV16"

# The output stem was previously never assigned, which produced hidden files
# such as "$outdir/.reference.fa". Honor a caller-provided $prefix, otherwise
# derive it from the input file name without its extension.
input_name="${input_fa##*/}"
prefix="${prefix:-${input_name%.*}}"

# seqtk_dir may point either at the directory holding seqtk (like the other
# *_dir settings) or directly at the seqtk executable; accept both.
if [[ -d "$seqtk_dir" ]]; then
  seqtk_bin="$seqtk_dir/seqtk"
else
  seqtk_bin="$seqtk_dir"
fi

cd "$workdir" || { echo "ERROR: cannot cd to $workdir" >&2; exit 1; }
mkdir -p -- "$outdir"

reference_fa="$outdir/$prefix.reference.fa"
gap_regions="$outdir/gap_regions.txt"
rmgap_fa="$outdir/$prefix.rmgap.fa"

## extract reference sequence from the alignment
echo "Extract reference sequence from the alignment ..."
echo ""
# Use a private temp file for the name list and remove it on every exit path.
ref_list="$(mktemp)"
trap 'rm -f -- "$ref_list"' EXIT
printf '%s\n' "$reference_strain" > "$ref_list" ##reference_strain
"$seqtk_bin" subseq "$input_fa" "$ref_list" > "$reference_fa"

## extract gap regions from the alignment
"$python_dir/python3" "$scripts_dir/03_extra_gap_regions.py" \
    -fa "$reference_fa" \
    --output "$gap_regions"

## remove gap alignments
# Split the region list on whitespace into separate arguments (as the original
# unquoted `cat` did) but without glob expansion. read returns non-zero at EOF
# when no NUL delimiter is found, hence the "|| true".
gap_cols=()
IFS=$' \t\n' read -r -d '' -a gap_cols < "$gap_regions" || true

if (( ${#gap_cols[@]} == 0 )); then
  # No columns listed: an empty "-selectcols { }" is not a valid request, so
  # keep the alignment unchanged instead.
  echo "No gap regions listed in $gap_regions; copying alignment unchanged." >&2
  cp -- "$input_fa" "$rmgap_fa"
else
  "$trimal_dir/trimal" -in "$input_fa" -out "$rmgap_fa" \
      -selectcols { "${gap_cols[@]}" }
fi


date
echo "Done!"
