#!/bin/bash
# Wrapper around fastq_pre_barcodes.
# Generates a BAM file with unaligned reads and all barcodes preprocessed.
# Version string printed by usage().
VERSION="0.21.0"

# Abort early when one of the external programs used below is not in PATH.
for required_tool in fastq_pre_barcodes samtools; do
    if ! command -v "$required_tool" >/dev/null 2>&1; then
        echo "ERROR: fastq_utils does not seem to be installed - $required_tool not found.  Aborting." >&2
        exit 1
    fi
done
unset required_tool

# errexit: from here on a failing command terminates the script.
set -e
##########################################################
#
# Print an informational message on standard error.
# Arguments: the words of the message; they are joined with the first
#            character of IFS (a space by default).
# Outputs:   "[INFO] <message>" followed by a newline, on stderr.
function pinfo {
    # Duplicate the already open stderr descriptor (>&2) instead of opening
    # /dev/stderr by path: opening the path again truncates a log file that
    # stderr was redirected to, and fails when the path cannot be opened.
    printf '[INFO] %s\n' "$*" >&2
}

# Print an error message on standard error.
# Arguments: the words of the message; they are joined with the first
#            character of IFS (a space by default).
# Outputs:   "[ERROR] <message>" followed by a newline, on stderr.
function perror {
    # Duplicate the already open stderr descriptor (>&2) instead of opening
    # /dev/stderr by path: opening the path again truncates a log file that
    # stderr was redirected to, and fails when the path cannot be opened.
    printf '[ERROR] %s\n' "$*" >&2
}

# Print the version and the command line help on standard output.
# Globals: VERSION (read)
# The schema names listed here are the ones the script actually matches:
# the drop-seq layout is selected with "dropseq".
function usage() {
    cat <<EOF
fastq2bam VERSION=$VERSION
Usage: fastq2bam -s schema -b out_bam -1 fastqfile1 [ -2 fastqfile2 -3 fastqfile3 -4 fastqfile4]
barcode schema: 10xV1 10xV1i 10xV1a 10xV2 dropseq none
10xV1 10xV1i 10xV1a 10xV2 and dropseq schemas may be customised through the following parameters:
 -c int : cell barcode offset
 -C int : cell barcode size
 -u int : UMI barcode offset
 -U int : UMI barcode size
 -z int : sample barcode offset
 -Z int : sample barcode size
 -X : use 10x UMI tags instead of tags defined in the SAM specification
EOF
}

# Run a barcode pre-processing command and store its SAM output as BAM.
# Globals:
#   bam_file (read) - path of the BAM file to create
# Arguments:
#   the command to run followed by its arguments, one word each;
#   "--sam --outfile1 -" is appended before it is executed
# Outputs:
#   stdout: the equivalent shell pipeline, echoed before it is run
#   stderr: an error message when the BAM file could not be produced
# Side effects:
#   writes "$bam_file.tmp" and renames it to "$bam_file" on success;
#   turns on the pipefail option; terminates the script (exit) when the
#   pipeline fails or when the resulting BAM file holds no reads.
function gen_bam() {
    local tmp_bam="$bam_file.tmp"
    local status=0

    if [ "$#" -eq 0 ]; then
        perror "gen_bam: no command given"
        exit 1
    fi

    echo "$* --sam --outfile1 - | samtools view -b - > $tmp_bam && mv $tmp_bam $bam_file"

    # The words are executed exactly as received ("$@") rather than being
    # glued into a string that `bash -c` parses again: a second parse would
    # interpret characters such as ( ) ; & $ found in file names as shell
    # syntax.
    # pipefail makes a failure of the producer visible even though samtools
    # is the last command of the pipeline.
    set -o pipefail
    "$@" --sam --outfile1 - | samtools view -b - > "$tmp_bam" || status=$?
    if [ "$status" -ne 0 ]; then
        # do not leave a partial file behind
        rm -f -- "$tmp_bam"
        perror "Failed to generate $bam_file (exit status $status)"
        exit "$status"
    fi
    mv -- "$tmp_bam" "$bam_file"

    ## bam should not be empty
    if [ "$(samtools view -c "$bam_file")" == "0" ]; then
        perror "All reads discarded - please check the warnings"
        rm -f -- "$bam_file"
        exit 1
    fi
}

# Give a variable a default value when it is unset or empty.
# Arguments:
#   $1 - name of the variable to initialise
#   $2 - value stored when the variable currently expands to nothing
# A variable that already holds a non-empty value is left untouched.
function set_default_value {
    # Unusual local names so they cannot shadow the variable being set.
    local __sdv_name=$1
    local __sdv_default=$2
    if [ -z "${!__sdv_name}" ]; then
        # printf -v stores the value verbatim; `eval name=value` would parse
        # the value again as shell code (spaces, $(...), ; ...).
        printf -v "$__sdv_name" '%s' "$__sdv_default"
    fi
}
##########################################################
##
# Command line state; every setting starts out empty.
# barcode schema name (-s or -a) and output BAM path (-b)
schema=
bam_file=
# input FASTQ files (-1 to -4)
fastq_file1=
fastq_file2=
fastq_file3=
fastq_file4=
# optional barcode layout overrides (-c/-C, -u/-U, -z/-Z)
cell_barcode_offset=
cell_barcode_size=
umi_barcode_offset=
umi_barcode_size=
sample_barcode_offset=
sample_barcode_size=
# extra flag added to the fastq_pre_barcodes command when -X is given
_10x_compat=

while getopts "s:a:b:1:2:3:4:c:C:u:U:z:Z:hX" Option; do
    case "$Option" in
        s | a) schema=$OPTARG ;;
        b) bam_file=$OPTARG ;;
        1) fastq_file1=$OPTARG ;;
        2) fastq_file2=$OPTARG ;;
        3) fastq_file3=$OPTARG ;;
        4) fastq_file4=$OPTARG ;;
        c) cell_barcode_offset=$OPTARG ;;
        C) cell_barcode_size=$OPTARG ;;
        u) umi_barcode_offset=$OPTARG ;;
        U) umi_barcode_size=$OPTARG ;;
        z) sample_barcode_offset=$OPTARG ;;
        Z) sample_barcode_size=$OPTARG ;;
        X) _10x_compat="--10x" ;;
        h)
            usage
            exit 0
            ;;
        \?) exit 1 ;;
    esac
done

# Called without any argument: just show the help.
if [[ -z "$*" ]]; then
    usage
    exit 1
fi

# -s, -b and -1 are mandatory (see the usage line). -1 is checked here once
# for every schema: with an empty value the generated command would contain
# "--read1" (or "--index1") without a file name and the following option
# would be taken as its value.
missing_option=
if [[ -z "$schema" ]]; then
    missing_option="-s schema"
elif [[ -z "$bam_file" ]]; then
    missing_option="-b out_bam"
elif [[ -z "$fastq_file1" ]]; then
    missing_option="-1 fastqfile1"
fi
if [[ -n "$missing_option" ]]; then
    perror "Missing mandatory option: $missing_option"
    usage
    exit 1
fi

#schema=$1
#out_bam=$2
#shift 2

# Report the effective settings through pinfo. The values are quoted so
# that whitespace and glob characters in them are logged literally; the
# file given with -4 is reported as well.
pinfo "schema=$schema"
pinfo "bam_file=$bam_file"
pinfo "fastq1=$fastq_file1"
pinfo "fastq2=$fastq_file2"
pinfo "fastq3=$fastq_file3"
pinfo "fastq4=$fastq_file4"


# Schema 10xV2:
#   -1 = R1 file, cell barcode + UMI (passed to fastq_pre_barcodes as index1)
#   -2 = R2 file, cDNA reads (passed as read1)
#   -3 = I1 file, sample barcode (optional, passed as index2)
if [ "$schema" == "10xV2" ]; then
    if [ -z "$fastq_file2" ]; then
        echo "Missing files for $schema:"
        # -3 is optional and is listed once, inside the brackets.
        echo "-1 cell#16|umi#10 -2 cdna [-3 sample#8]"
        echo "e.g., -1 *R1*.fastq.gz -2 *R2*.fastq.gz [-3 *I1*.fastq.gz]"
        usage
        exit 1
    fi
    # Defaults of this layout; a value given with -u/-U/-c/-C is kept.
    set_default_value "umi_barcode_offset" 16
    set_default_value "umi_barcode_size" 10
    set_default_value "cell_barcode_offset" 0
    set_default_value "cell_barcode_size" 16

    cmd="fastq_pre_barcodes --read1 $fastq_file2 --index1 $fastq_file1 --umi_read index1 --umi_offset $umi_barcode_offset --umi_size $umi_barcode_size --cell_read index1 --cell_offset $cell_barcode_offset --cell_size $cell_barcode_size  $_10x_compat"
    if [ -n "$fastq_file3" ]; then
        set_default_value "sample_barcode_offset" 0
        set_default_value "sample_barcode_size" 8
        cmd="$cmd --index2 $fastq_file3 --sample_read index2 --sample_offset $sample_barcode_offset --sample_size $sample_barcode_size"
    fi
    # Deliberately unquoted: the command string is split into words here.
    gen_bam $cmd
    exit 0
fi

# Schema 10xV1:
#   -1 = cDNA reads (passed to fastq_pre_barcodes as read1)
#   -2 = cell barcode + UMI (read2/I7, passed as index1)
#   -3 = sample barcode (I5, optional, passed as index2)
if [ "$schema" == "10xV1" ]; then
    if [ -z "$fastq_file2" ]; then
        echo "Missing files for $schema:"
        echo "-1 cdna  -2 cell#14|umi#10 [-3 sample#8]"
        echo "e.g., -1 *R1*.fastq.gz -2 *R2*.fastq.gz  [-3 *I5*.fastq.gz]"
        usage
        exit 1
    fi
    # Defaults of this layout; a value given with -u/-U/-c/-C is kept.
    set_default_value "umi_barcode_offset" 14
    set_default_value "umi_barcode_size" 10
    set_default_value "cell_barcode_offset" 0
    set_default_value "cell_barcode_size" 14

    # --index1 is given once (it used to be repeated with the same file).
    cmd="fastq_pre_barcodes --read1 $fastq_file1 --index1 $fastq_file2 --umi_read index1 --umi_offset $umi_barcode_offset --umi_size $umi_barcode_size --cell_read index1 --cell_offset $cell_barcode_offset --cell_size $cell_barcode_size  $_10x_compat"
    if [ -n "$fastq_file3" ]; then
        set_default_value "sample_barcode_offset" 0
        set_default_value "sample_barcode_size" 8
        # The sample barcodes come from the -3 file, not from the -2 file
        # that holds the cell barcode and UMI.
        cmd="$cmd  --index2 $fastq_file3 --sample_read index2 --sample_offset $sample_barcode_offset --sample_size $sample_barcode_size"
    fi
    # Deliberately unquoted: the command string is split into words here.
    gen_bam $cmd
    exit 0
fi


# Schema 10xV1a - how the input files are handed to fastq_pre_barcodes:
#   -1 (read_RA*_1) -> read1,  cDNA
#   -2 (read_RA*_2) -> index2, UMI
#   -3 (*I1*)       -> index1, cell barcode
#   -4 (*I2*)       -> index3, sample barcode (optional)
if [[ "$schema" == "10xV1a" ]]; then
    # -2 and -3 are both required for this layout
    if [[ -z "$fastq_file2" || -z "$fastq_file3" ]]; then
        printf '%s\n' \
            "Missing files for $schema:" \
            "-1 cdna  -2 umi#10 -3 cell#14 [-4 sample#8]" \
            "e.g., -1 *RA_1.fastq.gz -2 *RA_2.fastq.gz -3 *I1*.fastq.gz [-4 *I2*.fastq.gz]"
        usage
        exit 1
    fi

    # layout defaults; values supplied on the command line are kept
    set_default_value "umi_barcode_offset" 0
    set_default_value "umi_barcode_size" 10
    set_default_value "cell_barcode_offset" 0
    set_default_value "cell_barcode_size" 14

    # assemble the command piece by piece: inputs, UMI, cell barcode
    cmd="fastq_pre_barcodes --read1 $fastq_file1 --index2 $fastq_file2 --index1 $fastq_file3"
    cmd+=" --umi_read index2 --umi_offset  $umi_barcode_offset --umi_size  $umi_barcode_size"
    cmd+=" --cell_read index1 --cell_offset  $cell_barcode_offset --cell_size $cell_barcode_size  $_10x_compat"

    # sample barcodes are only processed when a fourth file was given
    if [[ -n "$fastq_file4" ]]; then
        set_default_value "sample_barcode_offset" 0
        set_default_value "sample_barcode_size" 8
        cmd+="  --index3 $fastq_file4 --sample_read index3 --sample_offset $sample_barcode_offset --sample_size $sample_barcode_size"
    fi

    # unquoted on purpose: the string is split into words
    gen_bam $cmd
    exit 0
fi

# Schema 10xV1i - how the input files are handed to fastq_pre_barcodes:
#   -1 (read_RA*) -> read1 and index3 (declared as interleaved), cDNA and UMI
#   -2 (*I1*)     -> index1, cell barcode
#   -3 (*I2*)     -> index2, sample barcode (optional)
if [[ "$schema" == "10xV1i" ]]; then
    if [[ -z "$fastq_file2" ]]; then
        printf '%s\n' \
            "Missing files for $schema:" \
            "-1 cdna/umi#10 -2 cell#14  [-3 sample#8]" \
            "e.g., -1 *RA*.fastq.gz -2 *I1*.fastq.gz [-3 *I2*.fastq.gz]"
        usage
        exit 1
    fi

    # layout defaults; values supplied on the command line are kept
    set_default_value "umi_barcode_offset" 0
    set_default_value "umi_barcode_size" 10
    set_default_value "cell_barcode_offset" 0
    set_default_value "cell_barcode_size" 14

    # assemble the command piece by piece: inputs, UMI, cell barcode
    cmd="fastq_pre_barcodes --read1 $fastq_file1 --index3 $fastq_file1 --index1 $fastq_file2"
    cmd+=" --umi_read index3 --umi_offset $umi_barcode_offset --umi_size $umi_barcode_size"
    cmd+=" --cell_read index1 --cell_offset $cell_barcode_offset --cell_size $cell_barcode_size"
    cmd+=" --interleaved read1,index3  $_10x_compat"

    # sample barcodes are only processed when a third file was given
    if [[ -n "$fastq_file3" ]]; then
        set_default_value "sample_barcode_offset" 0
        set_default_value "sample_barcode_size" 8
        cmd+="  --index2 $fastq_file3 --sample_read index2 --sample_offset $sample_barcode_offset --sample_size $sample_barcode_size"
    fi

    # unquoted on purpose: the string is split into words
    gen_bam $cmd
    exit 0
fi


# Schema dropseq:
#   -1 = cell barcode + UMI (passed to fastq_pre_barcodes as index1)
#   -2 = cDNA reads (passed as read1)
# Both spellings "dropseq" and "drop-seq" select this layout.
if [ "$schema" == "dropseq" ] || [ "$schema" == "drop-seq" ]; then
    pinfo "schema $schema under development"
    if [ -z "$fastq_file2" ]; then
        echo "Missing files for $schema:"
        echo "-1 umi#8|cell#12 -2 cdna"
        echo "e.g., -1 *_1.fastq.gz -2 *_2.fastq.gz"
        usage
        exit 1
    fi
    # Defaults of this layout; a value given with -u/-U/-c/-C is kept.
    set_default_value "umi_barcode_offset" 12
    set_default_value "umi_barcode_size" 8
    set_default_value "cell_barcode_offset" 0
    set_default_value "cell_barcode_size" 12

    cmd="fastq_pre_barcodes --read1 $fastq_file2 --index1 $fastq_file1 --umi_read index1 --umi_offset $umi_barcode_offset --umi_size $umi_barcode_size --cell_read index1 --cell_offset  $cell_barcode_offset --cell_size $cell_barcode_size $_10x_compat"
    # Deliberately unquoted: the command string is split into words here.
    gen_bam $cmd

    exit 0
fi

# Schema none: no barcode options are passed; -1 is given as read1 and,
# when present, -2 as read2.
if [[ "$schema" == "none" ]]; then
    cmd="fastq_pre_barcodes --read1 $fastq_file1"
    [[ -z "$fastq_file2" ]] || cmd+="  --read2 $fastq_file2"
    # unquoted on purpose: the string is split into words
    gen_bam $cmd
    exit 0
fi

# None of the schema branches above matched. This ends the script with a
# failure status, so it is reported as an error rather than as information.
perror "Unknown schema $schema"
exit 1


