#!/bin/bash

#NOTE: this is a shortened version of DARWINDOW (https://github.com/mennodejong1986/Darwindow) that contains only the preparation steps.
#Written by Menno de Jong, edited by Magnus Wolf 01.12.2022
#THERE IS NO NEED TO EDIT ANYTHING IN THIS SCRIPT, AS ALL PARAMETERS AND THINGS YOU MIGHT WANT TO CHANGE ARE DEFINED IN THE MAIN SCRIPT OF RESEQ-to-Popanalyses!


# Positional arguments. All three must be supplied: bash aborts the script with
# the message given after '?' as soon as one of them is unset.
#   $1 -> TABIX   directory that holds the bgzip and tabix executables
#   $2 -> MYVCF   path of the vcf file, written without a trailing .gz
#   $3 -> suffix  suffix string (not referenced in the part of the script shown here)
: "${1?Error: not the correct amount of arguments, it needs a path to the bin in which is tabix, a name of the vcf without the gz extension and a suffix}"
TABIX="$1"
: "${2?Error: not the correct amount of arguments,it needs a path to the bin in which is tabix, a name of the vcf without the gz extension and a suffix}"
MYVCF="$2"
: "${3?Error: not the correct amount of arguments,it needs a path to the bin in which is tabix, a name of the vcf without the gz extension and a suffix}"
suffix="$3"

# Properties of the input data.
annotated="FALSE"      # TRUE adds the coding/low/high columns to the window header written further down
biallelic="FALSE"      # not read in the part of the script shown here
# Haploid data: set to TRUE in that case. Only useful for counting alternative
# sites (haploid data contain no heterozygous sites anyway).
haploiddata="FALSE"

# Switches for the preparatory steps; a step only runs when its flag equals TRUE.
do_bgzip="TRUE"
do_tabix="TRUE"
extract_contiginfo="TRUE"
extract_samples="TRUE"

###########################


###### PREPARE FILES ######

#######################################
# Print the header of a vcf file: every line up to and including the first
# line that contains '#CHROM'.
# Arguments: $1 - path of the vcf file without the .gz extension. The plain
#                 file is read when it exists; otherwise "<path>.gz" is read
#                 through zgrep.
# Outputs:   header lines on stdout.
# Returns:   exit status of grep / zgrep.
#######################################
extract_vcf_header() {
	local vcf=$1
	# -m 1 stops at the first '#CHROM' line; -B keeps the lines in front of it
	# (at most 1,000,000 of them). The path is quoted so that file names
	# containing whitespace are passed as a single argument.
	if [[ -f "${vcf}" ]]; then
		grep -B1000000 -m 1 '#CHROM' "${vcf}"
	else
		zgrep -B1000000 -m 1 '#CHROM' "${vcf}.gz"
	fi
}

# Report which flavour of input is used, then store the header for later steps.
if [[ -f "${MYVCF}" ]]; then
	echo "Input: unzipped file."
else
	echo "Input: gzipped file."
fi
extract_vcf_header "${MYVCF}" > myvcf.header.txt

# index:
# Compress the vcf with bgzip into ${MYVCF}.bgz (the file the tabix step
# indexes), dropping every line that contains the string 'INDEL'.
# The gzipped input (${MYVCF}.gz) takes precedence over the plain file.
# The pipeline runs in the foreground so that its exit status can be checked;
# the script aborts when the input is missing or the compression fails, instead
# of carrying on with a missing or stale .bgz file.
if [[ "$do_bgzip" = TRUE ]]; then
	echo "Zipping with bgzip..."
	echo "This may take up to an hour, depending on the size of the vcf file."
	if [[ -f "${MYVCF}.gz" ]]; then
		echo "Input: gzipped file."
		if ! gunzip -c "${MYVCF}.gz" | grep -v 'INDEL' | "${TABIX}/bgzip" -f > "${MYVCF}.bgz"; then
			echo "ERROR: bgzip compression of '${MYVCF}.gz' failed." >&2
			exit 1
		fi
	elif [[ -f "${MYVCF}" ]]; then
		echo "Input: unzipped file."
		if ! grep -v 'INDEL' "${MYVCF}" | "${TABIX}/bgzip" -f > "${MYVCF}.bgz"; then
			echo "ERROR: bgzip compression of '${MYVCF}' failed." >&2
			exit 1
		fi
	else
		# Diagnostics belong on stderr; nothing below can work without the input.
		echo "ERROR: input (gzipped) vcf-file not found. Note that you should not provide the gz extension, even if file is gzipped." >&2
		exit 1
	fi
else
	echo "Skipping bgzip step because 'do_bgzip' is set to FALSE."
fi

# Index ${MYVCF}.bgz with tabix (vcf preset, overwriting an existing index).
# Paths are quoted so that directories or file names containing whitespace
# work, and a failing tabix run stops the script with a non-zero exit status
# instead of going unnoticed.
if [[ "$do_tabix" = TRUE ]]; then
	echo "Indexing with tabix..."
	echo "This may take several minutes (>up to half an hour), depending on the size of the vcf file."
	if ! "${TABIX}/tabix" -f -p vcf "${MYVCF}.bgz"; then
		echo "ERROR: tabix indexing of '${MYVCF}.bgz' failed." >&2
		exit 1
	fi
else
	echo "Skipping indexing step because 'do_tabix' is set to FALSE."
fi

#######################################
# Build the contig tables from the '##contig=<...>' lines of a vcf header.
# Only real '##contig' lines are used (other header lines that merely contain
# the word 'contig' are ignored), and the ID / length fields are located by
# name, so their position inside the <...> list does not matter.
# Arguments: $1 - file holding the vcf header
#            $2 - minimum contig length in bp (empty or missing counts as 0)
# Outputs:   mycontiglengths.txt  all contigs, "name<TAB>length"
#            mylongcontigs.txt    contigs with length >= $2
#            (both written to the current directory)
#######################################
extract_contig_lengths() {
	local header_file=$1
	local min_bp=${2:-0}
	awk '/^##contig=</ {
		id = ""; len = ""
		# The leading [<,] makes sure the key is matched as a whole field name.
		if (match($0, /[<,]ID=[^,>]+/))     id  = substr($0, RSTART + 4, RLENGTH - 4)
		if (match($0, /[<,]length=[0-9]+/)) len = substr($0, RSTART + 8, RLENGTH - 8)
		print id "\t" len
	}' "${header_file}" > mycontiglengths.txt
	# "+ 0" forces a numeric comparison on both sides.
	awk -v minlength="${min_bp}" '($2 + 0) >= (minlength + 0)' mycontiglengths.txt > mylongcontigs.txt
}

if [[ "$extract_contiginfo" = TRUE ]]; then
	echo "Selecting contigs with minimum length (specified by mincontigbp flag)..."
	# mincontigbp is not assigned in the part of the script shown here
	# (presumably provided by the calling script); when it is unset or empty
	# every contig is retained.
	extract_contig_lengths myvcf.header.txt "${mincontigbp:-0}"
	ncontigs=$(awk 'END { print NR }' mylongcontigs.txt)
	echo "Finished selecting contigs."
	echo "Number of contigs retained: ${ncontigs}"
	echo "Results stored in the files mycontiglengths.txt and mylongcontigs.txt."
else
	echo "Not extracting contig length information because the flag 'extract_contiginfo' is set to FALSE."
	echo "Assuming the files 'mycontiglengths.txt' and 'mylongcontigs.txt' are present in the working directory."
fi

#######################################
# Print the sample names of a vcf file (columns 10 and up of the first line
# containing '#CHROM'), one name per line.
# Arguments: $1 - path of the vcf file without the .gz extension. "<path>.gz"
#                 is read when it exists, otherwise the plain file is read, so
#                 unzipped input works as well.
# Outputs:   sample names on stdout, error message on stderr.
# Returns:   1 when neither file exists.
#######################################
list_vcf_samples() {
	local vcf=$1
	local -a reader
	if [[ -f "${vcf}.gz" ]]; then
		reader=(zgrep -m 1 '#CHROM' "${vcf}.gz")
	elif [[ -f "${vcf}" ]]; then
		reader=(grep -m 1 '#CHROM' "${vcf}")
	else
		echo "ERROR: neither '${vcf}.gz' nor '${vcf}' found; cannot retrieve sample names." >&2
		return 1
	fi
	"${reader[@]}" | cut -f10- | tr '\t' '\n'
}

#######################################
# Write the tab-separated window header line to mywindowheader.txt.
# Arguments: $1 - number of samples
#            $2 - TRUE to append the annotation columns
#            $3 - TRUE to append nmiss_/nsites_/nhet_/nhomo_ columns per sample
# Outputs:   mywindowheader.txt in the current directory.
#######################################
write_window_header() {
	local nsamples=$1 annotated_flag=$2 scores_flag=$3
	local i
	local header=$'contig\tstartbp\tendbp\ttotalbp'
	if [[ "${annotated_flag}" = TRUE ]]; then
		header+=$'\tncoding\tlow_mono\tlow_poly\thigh_mono\thigh_poly'
	fi
	if [[ "${scores_flag}" = TRUE ]]; then
		# Every per-sample column is followed by a tab, so the line ends with a
		# trailing tab; this keeps the file identical to what the previous
		# paste/tr based implementation produced.
		header+=$'\t'
		for (( i = 1; i <= nsamples; i++ )); do
			header+="nmiss_${i}"$'\t'"nsites_${i}"$'\t'"nhet_${i}"$'\t'"nhomo_${i}"$'\t'
		done
	fi
	printf '%s\n' "${header}" > mywindowheader.txt
}

if [[ "$extract_samples" = TRUE ]]; then
	echo "Retrieving sample info..."
	# bcftools query -l ${MYVCF}.gz > myvcfsamples.txt
	if ! list_vcf_samples "${MYVCF}" > myvcfsamples.txt; then
		exit 1
	fi
	nrsamples=$(awk 'END { print NR }' myvcfsamples.txt)

	# create file header:
	# sample_scores is not assigned in the part of the script shown here
	# (presumably provided by the calling script); unset means no per-sample columns.
	write_window_header "${nrsamples}" "${annotated}" "${sample_scores:-FALSE}"
	echo "Sample info retrieved."
fi




