#!/bin/bash

#this is run_mpileup_parralel_v10.sh a slightly altered version of a script called BAM2VCF_run_mpileup_parallel_HIGHWAY.sh written by Menno de Jong.

########################  USER-DEFINED SECTION ##################################

# Positional arguments. ':?' (instead of '?') also rejects arguments that are
# given but empty, which would otherwise only fail later inside a tool call.
NRSETS=${1:?Error: no number of subdivisions given}                                             # Number of subdivisions (one bed file is written, and one parallel bcftools job is started, per subdivision)
THREADS=${2:?Error: no number of threads given}                                                 # Number of threads handed to bcftools via --threads
SAMTOOLS=${3:?Error: no path to samtools executable}                                            # Path to samtools executable
BCFTOOLS=${4:?Error: no path to bcftools executable}                                            # Path to bcftools executable
REFERENCE=${5:?Error: no path to reference.fasta}                                               # Path to reference genome
PREFIX=${6:?Error: no PREFIX given}                                                             # Prefix for naming new files (presumably the bam file name without its extension - TODO confirm)
BAMFILE=${7:?Error: no inputfile provided}                                                      # provide list of all input bam files
MINMAPQUAL=${8:?Error: no mapping quality cutoff provided}                                      # provide a minimal mapping quality a SNP has to meet
MINBASEQUAL=${9:?Error: no base quality cutoff provided}                                        # provide a minimal base quality a SNP has to meet

# Step switches: a step is executed only when its switch is exactly TRUE.
createbed="TRUE"          # write the mybed<N>.txt region files
runmpileup="TRUE"         # run bcftools mpileup for every region file
combinebcf="TRUE"         # concatenate the per-subset bcf files
indexbcf="TRUE"           # index the combined bcf file
callgenotypes="TRUE"      # run bcftools call for every region file
combinevcf="TRUE"         # concatenate the per-subset vcf files

# Note: sometimes when running createbed you might get the error:
# error: mybed.txt: No such file or directory
# In that case choose another (lower?) value for NRSETS and try again

# In a conda environment, I run into the error: File size limit exceeded (core dumped)
#################################################################################
# Users should not need to change anything below this line. To run only some of the steps (e.g. to start the mpileup loop after the bed files have already been created), set the switches of the steps to skip to something other than TRUE.

# write_subset_beds CUMBED NRSITES NRSETS
#   Distribute the contigs listed in CUMBED over NRSETS region files named
#   mybed<N>.txt, written to the current directory.
#   CUMBED  : tab-separated file with contig name, contig length and the
#             cumulative length up to and including that contig.
#   NRSITES : total number of sites (the last cumulative length in CUMBED).
#   NRSETS  : number of subsets (positive integer).
#   Output  : mybed<N>.txt with tab-separated columns name, 1, length
#             (start is 1 because bcftools expects a 1-based system).
#   Prints the subset number and its lower/upper bound to stdout.
#   Returns non-zero when NRSITES or NRSETS is not a valid number.
write_subset_beds() {
        local cumbed=$1 nrsites=$2 nrsets=$3
        local digit part var1 var2

        if ! [[ "${nrsets}" =~ ^[1-9][0-9]*$ ]]
        then
                echo "Error: NRSETS must be a positive integer (got '${nrsets}')." >&2
                return 1
        fi
        if ! [[ "${nrsites}" =~ ^[0-9]+$ ]]
        then
                echo "Error: could not determine the total number of sites (got '${nrsites}')." >&2
                return 1
        fi

        part=$(( nrsites / nrsets ))
        for (( digit = 1; digit <= nrsets; digit++ ))
        do
                var1=$(( (digit - 1) * part ))
                var2=$(( digit * part ))
                # The integer division above may leave a remainder. Let the last subset
                # run up to the very last site, otherwise the final contig(s) are lost.
                if (( digit == nrsets )); then var2=${nrsites}; fi
                echo "${digit}"
                echo "${var1}"
                echo "${var2}"
                # Half-open interval (var1, var2]: a contig ending exactly on a boundary
                # is assigned to one subset only instead of to two neighbouring subsets.
                awk -F '\t' -v OFS='\t' -v mystart="${var1}" -v myend="${var2}" \
                        '($3 + 0) > (mystart + 0) && ($3 + 0) <= (myend + 0) { print $1, 1, $2 }' \
                        "${cumbed}" > "mybed${digit}.txt"
                if [[ ! -s "mybed${digit}.txt" ]]
                then
                        # An empty region file is of no use to bcftools, so drop it.
                        echo "Warning: subset ${digit} contains no contigs, mybed${digit}.txt is not written (consider a lower value for NRSETS)." >&2
                        rm -f "mybed${digit}.txt"
                fi
        done
        return 0
}

if [[ "$createbed" = TRUE ]]
        then
        echo "Start creating bed files"
        "${SAMTOOLS}" faidx "${REFERENCE}" || { echo "Error: samtools faidx failed for '${REFERENCE}'." >&2; exit 1; }
        # contigs.cum.bed: contig name, contig length, cumulative length.
        # printf "%.0f" keeps totals above 2^31 in plain integer notation in every awk flavour.
        awk -F '\t' '{ cum += $2; printf "%s\t%s\t%.0f\n", $1, $2, cum }' "${REFERENCE}.fai" > contigs.cum.bed
        NRSITES=$(tail -n 1 contigs.cum.bed | cut -f3)                                  # total number of bp in your genome
        echo "Subdivisions:"
        write_subset_beds contigs.cum.bed "${NRSITES}" "${NRSETS}" || { rm -f contigs.cum.bed; exit 1; }
        # Remove only the temporary file created here (a 'contig*' glob could hit unrelated files).
        rm -f contigs.cum.bed
        echo "Creating overview of number of bp per subset..."
        : > bedfiles.nrsites.txt
        for bedfile in mybed*txt
        do
                [[ -e "${bedfile}" ]] || continue                                       # glob did not match anything
                nrsites=$(awk -F '\t' '{ sum += $3 } END { printf "%.0f\n", sum }' "${bedfile}")
                printf '%s\t%s\n' "${bedfile}" "${nrsites}" >> bedfiles.nrsites.txt
        done

        echo "Finished creating bed files."
        echo "You can observe the file 'bedfiles.nrsites.txt' to see if sites are relatively equally distributed over subsets."
        echo "Often the last file (containing unplaced scaffolds) contains considerable more sites than the other files."
        echo "It might be worthwhile to split this file into multiple files otherwise this file may cause a delay."
fi

# Step: run one bcftools mpileup job per region file (mybed*.txt), all in parallel,
# writing ${PREFIX}.<region file>.bcf. The script stops when any job fails.
if [[ "$runmpileup" = TRUE ]]
        then
        echo "Starting bcftools mpileup now."
        mpileup_pids=()
        for mybedfile in mybed*.txt
        do
                [[ -e "${mybedfile}" ]] || continue     # glob did not match anything
                echo "${mybedfile}"
                # base quality score of Illumina 1.8+ ranges from 0 to 41, and are denoted by, from low to high: !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHI. If using threshold of 13, any base with value on left hand side from / is deleted.
                # Q: min-BQ: minimum base quality (default is 13, which roughly corresponds to a probability of 0.05 that base call is incorrect, as given by the formula -10*log10(0.05))
                # q: min-MQ: minimum mapping quality (default is 0)
                # ${BAMFILE} is left unquoted on purpose so that a space-separated list of bam files is split into separate arguments.
                "${BCFTOOLS}" mpileup -A --min-MQ "${MINMAPQUAL}" --min-BQ "${MINBASEQUAL}" -C50 -a "DP,AD" -R "${mybedfile}" --output-type b -f "${REFERENCE}" ${BAMFILE} > "${PREFIX}.${mybedfile}.bcf" &
                mpileup_pids+=("$!")
                # by default only PL in sample info columns format
                # with the -a flag (annotate flag) we add AD (allele depth) and DP (total depth), so that during bcftools call this information can be included in the sample genotype columns
                #
                ## What about -B or --no-BAQ?
                ## For low depth data (around 15x depth):
                # -B or --no-BAQ:       do NOT set this flag!!
                # Specifying this option will disable probabilistic realignment for the computation of base alignment quality (BAQ), which greatly helps to reduce false SNPs caused by misalignments.
                # BAQ is the Phred-scaled probability of a read base being misaligned.
                # m: min-ireads: minimum number gapped reads needed to infer an indel. By default: 1. This threshold could be set a bit higher, e.g. 2 or 3.
                ## For high depth data (around 60x depth):
                # DO set -B but use a higher min-BQ threshold.
                # The reason is that the BAQ calculations have been written for low depth data, and the underlying algorithm is not suitable for high depth data.
                #
        done
        if (( ${#mpileup_pids[@]} == 0 ))
        then
                echo "Error: no region files (mybed*.txt) found, run the createbed step first." >&2
                exit 1
        fi
        # A bare 'wait' always returns 0; wait for every job separately to see its exit status.
        mpileup_failed=0
        for mpileup_pid in "${mpileup_pids[@]}"
        do
                wait "${mpileup_pid}" || mpileup_failed=$(( mpileup_failed + 1 ))
        done
        if (( mpileup_failed > 0 ))
        then
                echo "Error: ${mpileup_failed} bcftools mpileup job(s) failed." >&2
                exit 1
        fi
        echo "Finished mpileup."
fi

# list_bcf_subsets PREFIX
#   Print the per-subset bcf files of PREFIX (PREFIX.mybed<N>...bcf), one per
#   line, sorted numerically on the subset number <N>. Prints nothing when no
#   such file exists. The number is taken from the part directly after
#   'PREFIX.mybed', so dots or slashes inside PREFIX do not disturb the order.
list_bcf_subsets() {
        local prefix=$1 file key
        for file in "${prefix}".mybed*bcf
        do
                [[ -e "${file}" ]] || continue          # glob did not match anything
                key=${file#"${prefix}".mybed}           # e.g. '12.txt.bcf'
                key=${key%%[!0-9]*}                     # e.g. '12'
                printf '%s\t%s\n' "${key}" "${file}"
        done | sort -t $'\t' -k1,1n | cut -f2-
}

# Step: concatenate the per-subset bcf files, in subset order, into ${PREFIX}.combined.bcf.
if [[ "$combinebcf" = TRUE ]]
        then
        echo "Listing and sorting bcf subset files..."
        list_bcf_subsets "${PREFIX}" > mybcfsubset.files.sorted.txt
        if [[ ! -s mybcfsubset.files.sorted.txt ]]
        then
                echo "Error: no bcf subset files found for prefix '${PREFIX}'." >&2
                exit 1
        fi
        echo "Start concatenating bcf files to one single file."
        "${BCFTOOLS}" concat --threads "${THREADS}" --output-type b --file-list mybcfsubset.files.sorted.txt > "${PREFIX}.combined.bcf" \
                || { echo "Error: bcftools concat failed." >&2; exit 1; }
        # ${BCFTOOLS} concat --threads 20 --output-type b ${PREFIX}.mybed*.bcf > ${PREFIX}.combined.bcf
        # Even though number of threads is set to 20, I have only seen concat command using up to 800%.
        echo "Finished combining."
        # rm *mybed*bcf
        # echo "Removed intermediate bcf-files."
        # To view header:
        # bcftools view -h ${PREFIX}.combined.bcf | head -3
        # To view data:
        # bcftools view -H ${PREFIX}.combined.bcf  | head -3
fi

# Step: index ${PREFIX}.combined.bcf (-f overwrites an existing index).
if [[ "$indexbcf" = TRUE ]]
        then
        echo "Indexing bcf-file..."
        # Quoted so that paths containing spaces survive; stop when indexing fails.
        "${BCFTOOLS}" index -f --threads "${THREADS}" "${PREFIX}.combined.bcf" \
                || { echo "Error: bcftools index failed for '${PREFIX}.combined.bcf'." >&2; exit 1; }
        echo "Finished indexing bcf-file."
fi

# Step: run one 'bcftools call | bcftools norm' job per region file (mybed*.txt), all in
# parallel, writing ${PREFIX}.<region file>.vcf.gz. The script stops when any job fails.
if [[ "$callgenotypes" = TRUE ]]
        then
        echo "Genotype calling using bcftools call..."
        call_pids=()
        for mybedfile in mybed*.txt
        do
                [[ -e "${mybedfile}" ]] || continue     # glob did not match anything
                echo "${mybedfile}"
                echo "Assuming invariant ploidy-level of 2 for all scaffolds and all samples..."
                # The subshell enables pipefail for this job only, so that a failing 'call' is not hidden by a succeeding 'norm'.
                # 'call' writes uncompressed bcf (u) because its output is only piped into 'norm'.
                (
                        set -o pipefail
                        "${BCFTOOLS}" call -a GQ -c -R "${mybedfile}" --ploidy 2 --output-type u "${PREFIX}.combined.bcf" \
                                | "${BCFTOOLS}" norm --check-ref w --fasta-ref "${REFERENCE}" -O z > "${PREFIX}.${mybedfile}.vcf.gz"
                ) &
                call_pids+=("$!")
                # The bcftools norm command might return to the screen many lines starting with 'NON_ACGTN_REF'
                # If using the flag '--check-ref e' with e for error rather than w for warning, bcftools would abort.
                #
                # BACKGROUND INFO:
                # Better not to multi-thread per file, because depending on the number of subsets you might overload the system (e.g 25x3=75 cores)
                # choose c (consensus) or m (multicaller) mode. Consensus is the original model, but multicaller mode (which considers more than 2 alleles per samples?) is now preferred.
                # -p, --pval-threshold float: if in mode -c, accept variant if P(ref|D) < float. In words: assume alternative allele if probability of reference allele given the available data is below value
                # Do not use -v (--variants-only) flag, otherwise information on monomorphic sites will get lost, which makes it impossible to calculate measures such as He, pi and Dxy.
                #
                # --annotate (-a) flag can add extra fields
                # Note, the --annotate flag has replaced the --format-fields flag:
                # -- format-fields (-f) can add GQ,GP fields (If you try to add other terms, you will receive an error)
                #
                # By default, genotype format of output vcf file:
                # GT:PL
                # If flag -a "DP,PL,AD" has been specified during mpileup run, then automatically it becomes:
                # GT:DP:AD                      # if ALT is absent              e.g. 0/0:16:16
                # GT:PL:DP:AD           # if ALT is present             e.g. 0/0:0,12,108:4:4,0
                #
                # We add Genotype Quality (which can be used for filtering by vcftools). The genotype format becomes:
                # GT:DP:AD                      # if ALT is absent              e.g. 0/0:16:16
                # GT:PL:DP:AD:GQ        # if ALT is present             e.g. 0/0:0,12,108:4:4,0:127
        done
        if (( ${#call_pids[@]} == 0 ))
        then
                echo "Error: no region files (mybed*.txt) found, run the createbed step first." >&2
                exit 1
        fi
        # A bare 'wait' always returns 0; wait for every job separately to see its exit status.
        call_failed=0
        for call_pid in "${call_pids[@]}"
        do
                wait "${call_pid}" || call_failed=$(( call_failed + 1 ))
        done
        if (( call_failed > 0 ))
        then
                echo "Error: ${call_failed} bcftools call job(s) failed." >&2
                exit 1
        fi
        echo "Finished bcftools call for all subsets."
fi

# list_vcf_subsets PREFIX
#   Print the per-subset vcf files of PREFIX (PREFIX.mybed<N>...txt.vcf.gz), one
#   per line, sorted numerically on the subset number <N>. Prints nothing when
#   no such file exists. The number is taken from the part directly after
#   'PREFIX.mybed', so dots or slashes inside PREFIX do not disturb the order.
list_vcf_subsets() {
        local prefix=$1 file key
        for file in "${prefix}".mybed*.txt.vcf.gz
        do
                [[ -e "${file}" ]] || continue          # glob did not match anything
                key=${file#"${prefix}".mybed}           # e.g. '12.txt.vcf.gz'
                key=${key%%[!0-9]*}                     # e.g. '12'
                printf '%s\t%s\n' "${key}" "${file}"
        done | sort -t $'\t' -k1,1n | cut -f2-
}

# Step: concatenate the per-subset vcf files, in subset order, into ${PREFIX}.vcf.gz.
if [[ "$combinevcf" = TRUE ]]
        then
        echo "Creating sorted list of vcf files..."
        list_vcf_subsets "${PREFIX}" > myvcfsubset.files.sorted.txt
        if [[ ! -s myvcfsubset.files.sorted.txt ]]
        then
                echo "Error: no vcf subset files found for prefix '${PREFIX}'." >&2
                exit 1
        fi
        echo "Combining (i.e. concatenating) vcf files..."
        # Use the user-supplied thread count, as the other bcftools steps do.
        "${BCFTOOLS}" concat --threads "${THREADS}" --file-list myvcfsubset.files.sorted.txt -O z -o "${PREFIX}.vcf.gz" \
                || { echo "Error: bcftools concat failed." >&2; exit 1; }
        echo "Finished combining vcf files. Output is stored in file '${PREFIX}.vcf.gz'."
fi

