#!/bin/bash

# --- paths
# All directories hang off ${project}. Each value keeps its trailing slash
# because the commands below append file names to these variables directly.
project="/Volumes/cluster/Claire/SO_PaperI/manuscript/"
path_ref="${project}annotations/"
path_data_raw="${project}results/raw_vcf/"
path_data_processed="${project}results/processed_vcf/"
path_data_count="${project}results/counts/"
path_software="${project}software/"
path_src="${project}scripts/"
path_raw_BAM="${project}results/raw_BAM/"

# ---- compute average coverage from number of reads and number of mapped reads
# Each entry is "<input BAM>:<flagstat report>", both relative to ${path_raw_BAM}.
# The trailing comments keep the counts noted for each sample.
# NOTE: never place a comment after a line-continuation backslash ("&&\ #..."):
# the backslash escapes the space, "#" no longer starts a comment, and bash
# tries to execute the comment text as a command.
flagstat_jobs=(
  "Pool_104a_index_6_Sam.bam:S_000_R00_X.txt"    # 73269122 mapped / 76852330 reads
  "Pool_104a_index_12_OrR.bam:O_000_R00_X.txt"   # 75749071 mapped / 77992744 reads
  "Pool_281a+b_i3.bam:F_020_R01_B_29.txt"        # 177549913 mapped / 186623116 reads
  "Pool_281a+b_i6.bam:F_020_R02_B_29.txt"        # 157053917 mapped / 164544420 reads
  "Pool_281a+b_i7.bam:F_020_R03_B_29.txt"        # 191703827 mapped / 199492124 reads
  "Pool_281a+b_i8.bam:F_020_R04_B_29.txt"        # 181418184 mapped / 188947172 reads
  "Pool_281a+b_i11.bam:F_020_R05_B_29.txt"       # 192985279 mapped / 200087080 reads
  "Pool_282a+b_i18.bam:F_020_R06_B_29.txt"       # 171931268 mapped / 181027688 reads
  "Pool_282a+b_i20.bam:F_020_R07_B_29.txt"       # 225330509 mapped / 234403160 reads
  "Pool_282a+b_i22.bam:F_020_R08_B_29.txt"       # 141801702 mapped / 148293058 reads
  "Pool_282a+b_i25.bam:F_020_R09_B_29.txt"       # 154232231 mapped / 160693384 reads
  "Pool_282a+b_i27.bam:F_020_R10_B_29.txt"       # 169302396 mapped / 175647844 reads
)
# Stop at the first failing sample so a broken run is noticed early.
for job in "${flagstat_jobs[@]}"; do
  # ${job%%:*} is the BAM name, ${job#*:} the report name.
  samtools flagstat "${path_raw_BAM}${job%%:*}" > "${path_raw_BAM}${job#*:}" || break
done
# R snippet used to turn the counts above into coverage
# (presumably 125 = read length in bp and 151635494 = genome length in bp — TODO confirm):
#cov_raw=c(76852330, 77992744, 186623116, 164544420, 199492124, 188947172, 200087080, 181027688, 234403160, 148293058,160693384, 175647844)*125/151635494
#63  64 154 136 164 156 165 149 193 122 132 145
#cov_mapped=c(73269122, 75749071, 177549913, 157053917, 191703827, 181418184, 192985279, 171931268, 225330509, 141801702,154232231, 169302396)*125/151635494
#60  62 146 129 158 150 159 142 186 117 127 140

# --- chromosome arm names (iterated by every per-arm loop below)
chrom=("2L" "2R" "3L" "3R" "4" "X")

# --- subset vcf per arm
# vcftools appends ".recode.vcf" to the --out prefix, so each arm ends up in
# ${path_data_raw}chr<arm>.vcf.recode.vcf, which is the name the later steps read.
for chr in "${chrom[@]}"; do
  vcftools --chr "${chr}" --vcf "${path_data_raw}ploidy2_raw.vcf" \
    --recode --out "${path_data_raw}chr${chr}.vcf"
done

# --- sort the repeat annotation once; the sorted copy is the mask used below
repeats_bed="${path_ref}dmel6_repeats/dmel6_XY234_MT_wMel_repeats_ge_200bp.bed"
repeats_bed_sorted="${repeats_bed%.bed}_sort.bed"
bedtools sort -i "${repeats_bed}" > "${repeats_bed_sorted}"

# --- remove what falls into repeated regions
# -v keeps only the VCF records that do NOT overlap a repeat interval;
# -header carries the VCF header over into the masked file.
for chr in "${chrom[@]}"; do
  echo "${chr}" &&
  bedtools intersect -v -wa -header \
    -a "${path_data_raw}chr${chr}.vcf.recode.vcf" \
    -b "${repeats_bed_sorted}" \
    > "${path_data_processed}chr${chr}_mask.vcf"
done

# ---- Filter raw VCF with soft filters
# For each chromosome arm the steps form one "&&" chain: when a step fails, the
# remaining steps for that arm are skipped and the loop moves on to the next arm.
# Comments sit on their own lines; a comment after a trailing backslash would be
# executed as a command and would cut the chain.
cd "${path_data_processed}" &&
for chr in "${chrom[@]}"; do
  echo "${chr}" &&

  # Keep only bi-allelic sites
  vcftools --vcf "${path_data_processed}chr${chr}_mask.vcf" \
    --min-alleles 2 --max-alleles 2 --stdout --recode \
    > "${path_data_processed}chr${chr}_bi.vcf" &&

  # Normalize variants against the reference, then index the result
  "${path_software}vt/vt/vt" normalize "${path_data_processed}chr${chr}_bi.vcf" \
    -r "${path_ref}dmel6.03-clean.wMel_wRi_Lactobacillus_Acetobacter.fa" \
    -o "${path_data_processed}chr${chr}_bi_norm.vcf.gz" &&
  tabix -f -h -p vcf "${path_data_processed}chr${chr}_bi_norm.vcf.gz" &&

  # Decompose blocks, then index the result
  "${path_software}vt/vt/vt" decompose_blocksub -m -a "${path_data_processed}chr${chr}_bi_norm.vcf.gz" \
    -o "${path_data_processed}chr${chr}_bi_dec.vcf.gz" &&
  tabix -f -h -p vcf "${path_data_processed}chr${chr}_bi_dec.vcf.gz" &&

  # Drop duplicate records (step from A. M. Langmüller), then index the result
  "${path_software}vt/vt/vt" uniq "${path_data_processed}chr${chr}_bi_dec.vcf.gz" \
    -o "${path_data_processed}chr${chr}_bi_decnorm_uni.vcf.gz" &&
  tabix -f -h -p vcf "${path_data_processed}chr${chr}_bi_decnorm_uni.vcf.gz" &&

  # Remove SNPs in the neighbourhood of 5 bp around INDELs
  bcftools filter -g 5 "${path_data_processed}chr${chr}_bi_decnorm_uni.vcf.gz" \
    > "${path_data_processed}chr${chr}_5bp_bi_decnorm_uni.vcf" &&

  # Keep SNPs ONLY
  bcftools filter -i 'TYPE="snp"' "${path_data_processed}chr${chr}_5bp_bi_decnorm_uni.vcf" \
    > "${path_data_processed}chr${chr}_5bp_bisnp.vcf"
done

# ---- summaries
# Write a `bcftools stats` report for the raw, masked, indel-proximity-filtered
# and SNP-only VCF of every chromosome arm. The path variables already end in
# "/", so no extra slash is inserted between directory and file name.
for chr in "${chrom[@]}"; do
  echo "${chr}" &&
  bcftools stats "${path_data_processed}chr${chr}_mask.vcf" \
    > "${path_data_processed}summary_chr${chr}_mask.txt" &&
  bcftools stats "${path_data_raw}chr${chr}.vcf.recode.vcf" \
    > "${path_data_raw}summary_chr${chr}_raw.txt" &&
  bcftools stats "${path_data_processed}chr${chr}_5bp_bi_decnorm_uni.vcf" \
    > "${path_data_processed}chr${chr}_5bp_bi_decnorm_uni.txt" &&
  bcftools stats "${path_data_processed}chr${chr}_5bp_bisnp.vcf" \
    > "${path_data_processed}chr${chr}_5bp_bisnp.txt"
done