#!/bin/bash
#-------------------------------
# January 2020
# Marlies Dolezal, Claire Burny
#-------------------------------

# Raw FastQ files were quality-checked and mapped by Viola Nolte.
# The raw input BAM files were sorted by position while being downloaded from the cluster with Readtools.
# Marlies Dolezal performed the variant calling and formatted this bash script.


# ----- Load global variables
source config.txt


# ----- Rename raw BAM 
function RenameDownloadedRawBam () {
  echo "----------------------"
  echo "FUNCTION: $FUNCNAME"
  echo "----------------------"

  path_data=$1  
  cd ${path_data}  #scan for the samples within a pool (1 dir = 1 barcode)
  subdir=$(ls -d Pool*)
  for sd in ${subdir[@]}; do
    cd ${path_data}/$sd &&\
    pref=$(echo $sd | awk -F'_novoalign_out_vn' '{print $1}') &&\
    cp novoalign_DistMap_output_Paired_end_reads.bam ${pref}.bam &&\
    cp novoalign_DistMap_output_Paired_end_reads.bai ${pref}.bai 
  done
  
}
#RenameDownloadedRawBam
# Quoted so that a project path containing whitespace or glob characters is
# passed to the function as one single argument.
RenameDownloadedRawBam "${path_project}${subpath_pool_rename}"


# ----- Process raw BAM 
#######################################
# Process one raw, position-sorted BAM: index, remove duplicates (Picard),
# keep properly paired reads with MAPQ >= 20 (samtools), clip overlapping
# mates (bamUtil) and index the result. The intermediate *_nodup.bam is
# deleted at the end; flagstat reports are written after the duplicate
# removal and after the filtering step.
# Globals read:    path_softwares_exe, path_project, subpath_raw_BAM,
#                  subpath_processed_BAM
# Globals written: SECONDS, pref, ELAPSED (left global as in the original
#                  script so code reading them afterwards keeps working)
# Arguments: $1 - raw BAM file name (with or without the .bam extension)
# Outputs:   <pref>_nodup_q20_clip.bam plus logs/statistics in the processed
#            BAM directory; elapsed time on stdout
# Returns:   0 on success, otherwise the status of the first failing step
#######################################
function ProcessRawBam () {
  echo "----------------------"
  echo "FUNCTION: $FUNCNAME"
  echo "----------------------"

  SECONDS=0 #set timer

  pref=$1
  pref=${pref%.bam*} #extract prefix of BAM file name

  local samtools="${path_softwares_exe}/samtools-1.10/samtools"
  local raw_bam="${path_project}${subpath_raw_BAM}/${pref}.bam"
  local out="${path_project}${subpath_processed_BAM}/${pref}"

  # index
  "${samtools}" index "${raw_bam}" || return

  # remove duplicates
  java -Xmx4g -Dsnappy.disable=true -jar "${path_softwares_exe}/picard.jar" MarkDuplicates \
    REMOVE_DUPLICATES=true I="${raw_bam}" O="${out}_nodup.bam" \
    M="${out}_nodup_picmetrics.txt" VALIDATION_STRINGENCY=SILENT \
    2> "${out}_nodup_log" || return

  # statistics after duplicate removal (this step was silently skipped before
  # because the command variable name was misspelled)
  "${samtools}" flagstat "${out}_nodup.bam" > "${out}_nodup.txt" || return

  # filter mapped output: MAPQ >= 20, proper pairs (0x002), drop reads that
  # are unmapped (0x004) or whose mate is unmapped (0x008)
  "${samtools}" view -b -q 20 -f 0x002 -F 0x004 -F 0x008 "${out}_nodup.bam" \
    > "${out}_nodup_q20.bam" 2> "${out}_nodup_q20_log" || return

  "${samtools}" flagstat "${out}_nodup_q20.bam" > "${out}_nodup_q20.txt" || return

  # index
  "${samtools}" index "${out}_nodup_q20.bam" || return

  # clip overlapping sorted paired-end reads
  "${path_softwares_exe}/bamUtil/bin/bam" clipOverlap \
    --in "${out}_nodup_q20.bam" \
    --out "${out}_nodup_q20_clip.bam" \
    --stats --poolSize 1000000000 \
    2> "${out}_nodup_q20_clip_log" || return

  # index
  "${samtools}" index "${out}_nodup_q20_clip.bam" || return

  # remove redundant file
  rm -- "${out}_nodup.bam" || return

  ELAPSED="Elapsed: $(($SECONDS / 3600))hrs $((($SECONDS / 60) % 60))min $(($SECONDS % 60))sec"
  echo "$pref: $ELAPSED" #print timer
}
#ProcessRawBam
# Process every raw BAM whose name starts with "P" in the raw BAM directory.
# The shell glob replaces the parsed `ls` output so file names are never
# word-split, and nothing is processed if the directory cannot be entered.
if cd "${path_project}${subpath_raw_BAM}"; then
  for f in P*.bam; do
    [[ -e ${f} ]] || continue  # glob matched nothing
    ProcessRawBam "${f}"
  done
else
  echo "ERROR: cannot enter '${path_project}${subpath_raw_BAM}'; ProcessRawBam skipped" >&2
fi


# ----- Rename files using the following nomenclature X_Y_Z.bam with:
#X: type of sample: "S", "O" (parental strains Samarkand or Oregon-R respectively), "F" (from generation 1 to more evolved time points)
#Y: generation: encoded as "000" (parental strains), "001" (generation 1), "010" (generation 10)...
#Z: setup: "X" (parental strains), "A", "B" see Methods

#######################################
# Rename files pairwise: the i-th old name becomes the i-th new name.
# Names are paired strictly by position, so a failed rename never shifts the
# new names of the files that follow it.
# Arguments: $1 - "array_name[@]" reference to the array of current names
#            $2 - "array_name[@]" reference to the array of new names
# Outputs:   progress (index, old name, new name) on stdout
# Returns:   0 if all renames succeeded; 1 if the arrays differ in length or
#            at least one rename failed
#######################################
function RenameFiles () {
  echo "----------------------"
  echo "FUNCTION: $FUNCNAME"
  echo "----------------------"

  # Indirect expansion copies the caller's arrays into local ones.
  declare -a _old=("${!1}")
  declare -a _new=("${!2}")
  local counter
  local status=0

  if (( ${#_old[@]} != ${#_new[@]} )); then
    echo "ERROR: ${FUNCNAME}: ${#_old[@]} old names but ${#_new[@]} new names" >&2
    return 1
  fi

  for counter in "${!_old[@]}"; do
    echo "counter: $counter"
    echo "old name: ${_old[$counter]}"
    echo "new name: ${_new[$counter]}"

    if ! mv -- "${_old[$counter]}" "${_new[$counter]}"; then
      status=1  # report, but continue with the remaining (still aligned) pairs
    fi
  done

  return "${status}"
}
#RenameFiles
# Rename the processed BAM files from pool/index names to the X_Y_Z
# nomenclature. All renames are skipped if the processed BAM directory cannot
# be entered, so no rename is ever attempted in an unrelated directory.
# The array references are quoted because an unquoted name[@] is a glob pattern.
if cd "${path_project}${subpath_processed_BAM}"; then
  old_names=( "Pool_104a_index_12_OrR_nodup_q20_clip.bam" "Pool_104a_index_6_Sam_nodup_q20_clip.bam")
  new_names=( "O_000_R00_X_nodup_q20_clip.bam" "S_000_R00_X_nodup_q20_clip.bam")
  RenameFiles 'old_names[@]' 'new_names[@]'

  old_names=( "Pool_281a+b_i11_nodup_q20_clip.bam" "Pool_281a+b_i3_nodup_q20_clip.bam")
  new_names=( "F_020_R05_B_29_nodup_q20_clip.bam" "F_020_R01_B_29_nodup_q20_clip.bam")
  RenameFiles 'old_names[@]' 'new_names[@]'

  old_names=( "Pool_281a+b_i6_nodup_q20_clip.bam")
  new_names=( "F_020_R02_B_29_nodup_q20_clip.bam")
  RenameFiles 'old_names[@]' 'new_names[@]'

  old_names=( "Pool_281a+b_i7_nodup_q20_clip.bam")
  new_names=( "F_020_R03_B_29_nodup_q20_clip.bam")
  RenameFiles 'old_names[@]' 'new_names[@]'

  old_names=( "Pool_281a+b_i8_nodup_q20_clip.bam" "Pool_282a+b_i18_nodup_q20_clip.bam" "Pool_282a+b_i20_nodup_q20_clip.bam" "Pool_282a+b_i22_nodup_q20_clip.bam" "Pool_282a+b_i25_nodup_q20_clip.bam" "Pool_282a+b_i27_nodup_q20_clip.bam" )
  new_names=( "F_020_R04_B_29_nodup_q20_clip.bam" "F_020_R06_B_29_nodup_q20_clip.bam" "F_020_R07_B_29_nodup_q20_clip.bam" "F_020_R08_B_29_nodup_q20_clip.bam" "F_020_R09_B_29_nodup_q20_clip.bam" "F_020_R10_B_29_nodup_q20_clip.bam" )
  RenameFiles 'old_names[@]' 'new_names[@]'
else
  echo "ERROR: cannot enter '${path_project}${subpath_processed_BAM}'; RenameFiles skipped" >&2
fi


# ----- Update header BAM 
#######################################
# Add a read group to one processed BAM (Picard AddOrReplaceReadGroups) and
# index the result. The read-group ID and sample name are both set to the
# file name without its "_nodup_q20_clip.bam" suffix.
# Globals read:    path_softwares_exe, path_project, subpath_processed_BAM
# Globals written: SECONDS, pref, info, ELAPSED (left global as in the
#                  original script so code reading them afterwards keeps working)
# Arguments: $1 - name of a *_nodup_q20_clip.bam file in the processed BAM dir
# Outputs:   <info>_RG.bam and its index; file name and elapsed time on stdout
# Returns:   0 on success, otherwise the status of the first failing step
#######################################
function ReHeader () {
  echo "----------------------"
  echo "FUNCTION: $FUNCNAME"
  echo "----------------------"

  SECONDS=0

  pref=$1
  echo "$pref"
  info=${pref%_nodup_q20_clip.bam*}

  local bam_dir="${path_project}${subpath_processed_BAM}"
  local rg_bam="${bam_dir}/${info}_RG.bam"

  # KEY=value without a space after "=" is the documented Picard syntax;
  # quoting keeps paths with whitespace intact.
  java -jar "${path_softwares_exe}/picard.jar" AddOrReplaceReadGroups \
    I="${bam_dir}/${info}_nodup_q20_clip.bam" \
    O="${rg_bam}" \
    RGID="${info}" \
    RGPL=illumina \
    RGLB=LibH \
    RGSM="${info}" \
    RGPU=unit1 \
    VALIDATION_STRINGENCY=SILENT || return

  "${path_softwares_exe}/samtools-1.10/samtools" index "${rg_bam}" || return

  ELAPSED="Elapsed: $(($SECONDS / 3600))hrs $((($SECONDS / 60) % 60))min $(($SECONDS % 60))sec"
  echo "$pref: $ELAPSED"
}
#ReHeader
# Add read groups to the listed processed BAM files. The step is skipped if
# the processed BAM directory cannot be entered, so the loop can never run on
# a stale `files` value left over from an earlier step.
# NOTE(review): only six generation-20 replicates (R04, R06-R10) are listed
# here - confirm whether the other renamed BAMs were re-headered elsewhere.
if cd "${path_project}${subpath_processed_BAM}"; then
  files=( "F_020_R04_B_29_nodup_q20_clip.bam" "F_020_R06_B_29_nodup_q20_clip.bam" "F_020_R07_B_29_nodup_q20_clip.bam" "F_020_R08_B_29_nodup_q20_clip.bam" "F_020_R09_B_29_nodup_q20_clip.bam" "F_020_R10_B_29_nodup_q20_clip.bam" )
  for f in "${files[@]}"; do
    ReHeader "${f}"
  done
else
  echo "ERROR: cannot enter '${path_project}${subpath_processed_BAM}'; ReHeader skipped" >&2
fi


# ----- Create Bamlist file 
#######################################
# Write bamlist.txt in the current directory: one line per regular file in
# the current directory whose name ends with the given suffix.
# File names are written verbatim (whitespace and "%" characters are kept).
# Arguments: $1 - file name suffix to match, e.g. "X_RG.bam"
# Outputs:   bamlist.txt (created empty when nothing matches)
# Returns:   0 if at least one file matched, 1 otherwise
#######################################
function CreateBAMlistfile () {
  echo "----------------------"
  echo "FUNCTION: $FUNCNAME"
  echo "----------------------"

  local suffix=$1
  local -a bams=()
  local f

  for f in *"${suffix}"; do
    [[ -f ${f} ]] && bams+=("${f}")  # also drops the unexpanded pattern
  done

  if (( ${#bams[@]} == 0 )); then
    echo "ERROR: ${FUNCNAME}: no file matching '*${suffix}' in ${PWD}" >&2
    : > bamlist.txt
    return 1
  fi

  printf '%s\n' "${bams[@]}" > bamlist.txt
}
#CreateBAMlistfile
# List the BAM files used for variant calling. The list is only built after
# the variant-calling directory was entered successfully, so a failed cd can
# no longer produce a list of unrelated files from another directory.
if cd "${path_project}${subpath_BAM_var_call}"; then
  pattern_input="X_RG.bam"
  CreateBAMlistfile "${pattern_input}"
else
  echo "ERROR: cannot enter '${path_project}${subpath_BAM_var_call}'; CreateBAMlistfile skipped" >&2
fi


# ----- Call Freebayes
#######################################
# Call variants with freebayes on all BAM files listed in bamlist.txt.
# Globals read:    path_softwares_exe, path_project, subpath_BAM_var_call,
#                  subpath_fasta, subpath_raw_vcf
# Globals written: SECONDS, ELAPSED
# Arguments: $1 - ploidy passed to freebayes (positive integer)
# Outputs:   ploidy<N>_raw.vcf (freebayes stdout) and ploidy<N>_raw_log
#            (freebayes stderr) in the raw VCF directory; elapsed time,
#            labelled with the ploidy, on stdout
# Returns:   0 on success, 1 on an invalid ploidy, otherwise the freebayes
#            exit status
#######################################
function CallFreebayes () {
  echo "----------------------"
  echo "FUNCTION: $FUNCNAME"
  echo "----------------------"

  echo "start freebayes"
  SECONDS=0

  local ploidy=$1
  if [[ ! ${ploidy} =~ ^[1-9][0-9]*$ ]]; then
    echo "ERROR: ${FUNCNAME}: ploidy must be a positive integer, got '${ploidy}'" >&2
    return 1
  fi

  local vcf_prefix="${path_project}${subpath_raw_vcf}/ploidy${ploidy}_raw"

  "${path_softwares_exe}/freebayes/build/freebayes" \
    -L "${path_project}${subpath_BAM_var_call}/bamlist.txt" \
    --fasta-reference "${path_project}${subpath_fasta}/dmel6.03-clean.wMel_wRi_Lactobacillus_Acetobacter.fa" \
    --ploidy "${ploidy}" \
    > "${vcf_prefix}.vcf" 2> "${vcf_prefix}_log" || return

  ELAPSED="Elapsed: $(($SECONDS / 3600))hrs $((($SECONDS / 60) % 60))min $(($SECONDS % 60))sec"
  # Label the timer with this run instead of a variable set by other functions.
  echo "ploidy${ploidy}: $ELAPSED"
}
#CallFreebayes
# Run the variant calling with ploidy 2 (autosomes and X), then report when
# the workflow finished together with the exit state of this last step.
cd "${path_project}${subpath_BAM_var_call}" &&
  CallFreebayes 2 #autosomes and X
ES=$?
echo "finished workflow at $(date) with exit state ${ES}"