#!/bin/bash

# Positional arguments (all five must be supplied):
#   $1  INDIVIDUAL       basename of the input VCF; ".vcf" is appended below,
#                        so pass neither ".vcf" nor ".gz"
#   $2  TABIX            directory that contains the `bgzip` and `tabix` binaries
#   $3  VCFTOOLS         not referenced in the visible part of this script;
#                        presumably a vcftools location -- TODO confirm
#   $4  WINDOWSIZE       window length in bp (positive integer)
#   $5  MINCONTIGLENGTH  minimum length in bp for a contig to be analysed
# Each guard names the argument that is actually missing.
INDIVIDUAL=${1?Error: no file basename given (argument 1)}
TABIX=${2?Error: no tabix/bgzip directory given (argument 2)}
VCFTOOLS=${3?Error: no VCFTOOLS argument given (argument 3)}
WINDOWSIZE=${4?Error: no window size given (argument 4)}
MINCONTIGLENGTH=${5?Error: no minimum contig length given (argument 5)}

# The window size is used in shell arithmetic and as a loop step further down,
# so anything other than a positive integer would fail much later and obscurely.
if ! [[ "$WINDOWSIZE" =~ ^[1-9][0-9]*$ ]]; then
  echo "Error: window size must be a positive integer, got '${WINDOWSIZE}'" >&2
  exit 1
fi

MYVCF="${INDIVIDUAL}.vcf"
winsize=$WINDOWSIZE
mincontigbp=$MINCONTIGLENGTH
suffix="${INDIVIDUAL}_windowHE"

# Settings that are assigned here but not read in the visible part of the script.
biallelic=FALSE
annotated=FALSE

# Step switches: set one to FALSE to skip the corresponding stage.
do_bgzip=TRUE            # recompress the VCF with bgzip (INDEL lines removed)
do_tabix=TRUE            # build the tabix index for the .bgz file
extract_contiginfo=TRUE  # derive mycontiglengths.txt / mylongcontigs.txt
extract_samples=TRUE     # derive myvcfsamples.txt / mywindowheader.txt
run_loop=TRUE            # run the per-window analysis
sample_scores=TRUE       # include per-sample columns in header and results

# Print the header of the VCF stream on stdin: every leading '#' line up to and
# including the '#CHROM' column line. Reading stops at that line, or at the
# first non-'#' line if '#CHROM' is absent, so the body of a large file is
# never scanned and there is no fixed cap on the number of header lines.
vcf_header() {
  awk '
    !/^#/     { exit }
              { print }
    /^#CHROM/ { exit }
  '
}

# Store the header in myvcf.header.txt. The uncompressed file is preferred
# when present; otherwise the gzipped file "${MYVCF}.gz" is assumed.
if [[ -f "${MYVCF}" ]]; then
  echo "Input: unzipped file."
  vcf_header < "${MYVCF}" > myvcf.header.txt
else
  echo "Input: gzipped file."
  gunzip -c "${MYVCF}.gz" | vcf_header > myvcf.header.txt
fi

# Recompress the input as "${MYVCF}.bgz" with bgzip, dropping every line that
# contains the string 'INDEL'. The gzipped input is preferred when both the
# gzipped and the uncompressed file exist.
# The pipeline runs in the foreground so that the exit status of bgzip can be
# checked; a missing input or a failed compression stops the script, because
# the later steps read "${MYVCF}.bgz".
if [[ "$do_bgzip" = TRUE ]]; then
  echo "Zipping with bgzip..."
  echo "This may take up to an hour, depending on the size of the vcf file."
  if [[ -f "${MYVCF}.gz" ]]; then
    echo "Input: gzipped file."
    gunzip -c "${MYVCF}.gz" | grep -v 'INDEL' | "${TABIX}/bgzip" -f > "${MYVCF}.bgz"
    bgzip_status=$?
  elif [[ -f "${MYVCF}" ]]; then
    echo "Input: unzipped file."
    grep -v 'INDEL' "${MYVCF}" | "${TABIX}/bgzip" -f > "${MYVCF}.bgz"
    bgzip_status=$?
  else
    echo "ERROR: input (gzipped) vcf-file not found. Note that you should not provide the gz extension, even if file is gzipped." >&2
    exit 1
  fi
  # $? of a pipeline is the status of its last command, i.e. bgzip.
  if (( bgzip_status != 0 )); then
    echo "ERROR: bgzip failed with exit status ${bgzip_status}." >&2
    exit 1
  fi
else
  echo "Skipping bgzip step because 'do_bgzip' is set to FALSE."
fi

# Build the tabix index for "${MYVCF}.bgz" (-f overwrites an existing index,
# -p vcf selects the VCF preset). The region queries later in the script need
# this index, so a failure here stops the script.
if [[ "$do_tabix" = TRUE ]]; then
  echo "Indexing with tabix..."
  echo "This may take several minutes (>up to half an hour), depending on the size of the vcf file."
  if ! "${TABIX}/tabix" -f -p vcf "${MYVCF}.bgz"; then
    echo "ERROR: tabix could not index ${MYVCF}.bgz." >&2
    exit 1
  fi
else
  echo "Skipping indexing step because 'do_tabix' is set to FALSE."
fi

# Print "<ID><TAB><length>" for every '##contig=<...>' header line found in the
# given files (or stdin). ID and length are looked up by key, so the order of
# the attributes inside '<...>' does not matter, and other header lines that
# merely contain the word "contig" are ignored. A contig line without a length
# attribute yields an empty second column.
parse_contig_lengths() {
  awk '
    /^##contig=</ {
      body = $0
      sub(/^##contig=</, "", body)
      sub(/>[ \t\r]*$/, "", body)
      id = ""; len = ""
      n = split(body, kv, ",")
      for (i = 1; i <= n; i++) {
        if (kv[i] ~ /^ID=/)          id  = substr(kv[i], 4)
        else if (kv[i] ~ /^length=/) len = substr(kv[i], 8)
      }
      print id "\t" len
    }
  ' "$@"
}

# Keep the "<ID><TAB><length>" lines on stdin whose length is >= $1.
# Lines with an empty length are dropped.
select_long_contigs() {
  awk -F '\t' -v minlength="$1" '$2 != "" && ($2 + 0) >= (minlength + 0)'
}

if [[ "$extract_contiginfo" = TRUE ]]; then
  echo "Selecting contigs with minimum length (specified by mincontigbp flag)..."
  parse_contig_lengths myvcf.header.txt > mycontiglengths.txt
  select_long_contigs "$mincontigbp" < mycontiglengths.txt > mylongcontigs.txt
  ncontigs=$(( $(wc -l < mylongcontigs.txt) ))
  echo "Finished selecting contigs."
  echo "Number of contigs retained: "$ncontigs
  echo "Results stored in the files mycontiglengths.txt and mylongcontigs.txt."
else
  echo "Not extracting contig length information because the flag 'extract_contiginfo' is set to FALSE."
  echo "Assuming the files 'mycontiglengths.txt' and 'mylongcontigs.txt' are present in the working directory."
fi

# Print the header line of the window result table.
#   $1  number of samples
#   $2  TRUE to append the per-sample columns, anything else for the four
#       fixed columns only
# Per sample i the columns are nmiss_i, nsites_i, nhet_i and nhomo_i.
# With $2 = TRUE the line ends in a tab: that keeps the header byte-identical
# to the one produced by earlier versions of this script.
build_window_header() {
  local nsamples=$1 with_samples=$2 i
  printf 'contig\tstartbp\tendbp\ttotalbp'
  if [[ "$with_samples" = TRUE ]]; then
    printf '\t'
    for (( i = 1; i <= nsamples; i++ )); do
      printf 'nmiss_%d\tnsites_%d\tnhet_%d\tnhomo_%d\t' "$i" "$i" "$i" "$i"
    done
  fi
  printf '\n'
}

# Write the sample names (columns 10 onwards of the '#CHROM' line, one per
# line) to myvcfsamples.txt and the matching table header to mywindowheader.txt.
if [[ "$extract_samples" = TRUE ]]; then
  echo "Retrieving sample info..."
  # Read the gzipped input when it exists, otherwise the uncompressed file,
  # so that an uncompressed-only input also yields its sample list.
  if [[ -f "${MYVCF}.gz" ]]; then
    zgrep -m 1 '#CHROM' "${MYVCF}.gz"
  else
    grep -m 1 '#CHROM' "${MYVCF}"
  fi | cut -f10- | tr '\t' '\n' > myvcfsamples.txt
  nrsamples=$(( $(wc -l < myvcfsamples.txt) ))
  build_window_header "$nrsamples" "$sample_scores" > mywindowheader.txt
  echo "Sample info retrieved."
fi

# Start an output table: copy the header file $1 to $2 when the header exists,
# otherwise leave $2 as an empty file. Any previous content of $2 is discarded.
init_window_output() {
  local header_file=$1 out_file=$2
  if [[ -f "$header_file" ]]; then
    cp "$header_file" "$out_file"
  else
    : > "$out_file"
  fi
}

# Print the windows that tile positions 1..$1 in steps of $2 bp, one per line
# as "<startbp><TAB><endbp><TAB><size>". The last window is shortened to end at
# the contig end; no window starts beyond the contig end.
list_windows() {
  local contig_len=$1 win=$2 start end
  (( win >= 1 )) || return 1
  for (( start = 1; start <= contig_len; start += win )); do
    end=$(( start + win - 1 ))
    if (( end > contig_len )); then
      end=$contig_len
    fi
    printf '%d\t%d\t%d\n' "$start" "$end" "$(( end - start + 1 ))"
  done
}

# Summarise the VCF records on stdin (one window) in a single pass.
#   $1  number of sample columns to score (0 = none)
#   $2  window size in bp, used for the missing-site count
# Prints one line: the record count, then for each sample
#   nmiss  = $2 minus nsites
#   nsites = records whose sample field does not contain "./."
#   nhet   = of those, fields containing 0/1, 0/2, 0/3, 1/2, 1/3 or 2/3
#   nhomo  = of those, fields containing 1/1, 2/2 or 3/3
# Sample i is read from column i + 9.
summarise_window() {
  local nsamples=$1 window_bp=$2
  awk -F '\t' -v n="$nsamples" -v w="$window_bp" '
    {
      for (i = 1; i <= n; i++) {
        gt = $(i + 9)
        if (gt ~ /\.\/\./) continue
        sites[i]++
        if (gt ~ /0\/1|0\/2|0\/3|1\/2|1\/3|2\/3/) het[i]++
        if (gt ~ /1\/1|2\/2|3\/3/) hom[i]++
      }
    }
    END {
      printf "%d", NR
      for (i = 1; i <= n; i++)
        printf "\t%d\t%d\t%d\t%d", w - sites[i], sites[i], het[i], hom[i]
      printf "\n"
    }
  '
}

# For every contig in mylongcontigs.txt, query each window from the indexed
# VCF with tabix and append one row per window to
# mywindowhe.<winsize>.<suffix>.txt:
#   contig, startbp, endbp, totalbp [, nmiss, nsites, nhet, nhomo per sample]
if [[ "$run_loop" = TRUE ]]; then
  echo "Starting loop..."
  he_out="mywindowhe.${winsize}.${suffix}.txt"
  init_window_output mywindowheader.txt "$he_out"
  # The pi and poly tables are only initialised here; nothing in this section
  # writes rows to them.
  init_window_output mywindowpiheader.txt "mywindowpi.${winsize}.${suffix}.txt"
  init_window_output mywindowpolyheader.txt "mywindowpoly.${winsize}.${suffix}.txt"

  if [[ "$sample_scores" = TRUE ]]; then
    nrsamples=$(( $(wc -l < myvcfsamples.txt) ))
    echo "Number of samples:" $nrsamples
    scored_samples=$nrsamples
  else
    echo "Not calculating sample specific scores."
    scored_samples=0
  fi

  nrcontigs=$(( $(wc -l < mylongcontigs.txt) ))
  echo "Total number of contigs/scaffolds which are at least ${mincontigbp} bp long:" "$nrcontigs"

  contignr=0
  while IFS=$'\t' read -r contigname contiglength _; do
    contignr=$(( contignr + 1 ))
    printf '%s\t%s\t%s\n' "$contignr" "$contigname" "$contiglength"
    while IFS=$'\t' read -r startbp endbp winsize2; do
      # tabix gets /dev/null as stdin so it cannot consume the window list
      # that feeds this loop.
      window_stats=$("${TABIX}/tabix" "${MYVCF}.bgz" "${contigname}:${startbp}-${endbp}" < /dev/null \
        | summarise_window "$scored_samples" "$winsize2")
      printf '%s\t%s\t%s\t%s\n' "$contigname" "$startbp" "$endbp" "$window_stats" >> "$he_out"
    done < <(list_windows "$contiglength" "$winsize")
  done < mylongcontigs.txt

  # Windows are streamed straight from tabix, so the header copy is the only
  # intermediate file left to remove.
  rm -f myvcf.header.txt
  echo "Analysis finished."
else
  echo "Flag run_loop is set to FALSE. Not running analyses. Exiting."
fi

################################################

