#do phasing and masking of the data
#*.filtered.gz is the output of Filtering2 module of LM3
#map[1-21].txt: output of Lep-MAP3 OrderMarkers2 with the marker number (column 1) mapped to "chr pos" (phased data is output with outputPhasedData=1)
#snps_mapped.txt lists all snps in "chr pos" format

# create header
# Builds header.txt from lines 2-4 of the decompressed all.filtered.gz:
#   1. ./transpose_tab is applied (presumably a tab-delimited transpose, so
#      every input column becomes a row -- confirm against the helper),
#   2. awk keeps only the rows whose third field is not 0,
#   3. ./transpose_tab is applied again and the first two lines are kept,
#   4. "cMPos" is inserted as an extra tab-separated column after field 2.
# Fix: the second helper call used to be spelled "/.transpose_tab", i.e. a
# path directly under /, which normally does not exist, so the pipeline broke.
# NOTE(review): the masking step further down reads ../all.filtered.gz while
# this step reads all.filtered.gz from the current directory -- confirm which
# location is intended.
zcat all.filtered.gz \
	| head -n 4 \
	| tail -n 3 \
	| ./transpose_tab \
	| awk '($3!=0)' \
	| ./transpose_tab \
	| head -n 2 \
	| awk -vOFS="\t" -vFS="\t" '{$2=$2 "\tcMPos"; print}' >header.txt

# add tabs between segregation patterns
# For each of the 21 map files: columns 1-3 of map<N>.txt are copied as they
# are; in the remaining columns every tab is removed and a tab is then appended
# after each 0 or 1 character, so every single call ends up in its own column.
# The result is written to map<N>_mapped.txt.
for ((lg = 1; lg <= 21; lg++))
do
	cut -f 4- "map${lg}.txt" \
		| sed -e 's/\t//g' -e 's/[01]/&\t/g' \
		| paste <(cut -f 1-3 "map${lg}.txt") - >"map${lg}_mapped.txt"
done

#create phased data
# For every whitespace-separated name in families.txt: decompress
# <name>.filtered.gz, delete every "*" character, run the stream through
# simpleConvert.awk and then phase.awk, and store the result in
# phased<name>.txt.
for fam in $(cat families.txt)
do
	zcat "${fam}.filtered.gz" \
		| sed -e 's/*//g' \
		| awk -f simpleConvert.awk \
		| awk -f phase.awk >"phased${fam}.txt"
done

# snps.txt: the first line of snps_mapped.txt is replaced by the column header
# "CHR POS cMPos" (tab-separated); every other line is copied with an extra
# tab-separated "NA" column appended.
awk 'NR == 1 { print "CHR\tPOS\tcMPos"; next } { print $0 "\tNA" }' snps_mapped.txt >snps.txt

#create phased_all.txt
# awk assembles one long "paste" command line and hands it to bash: snps.txt
# first, then columns 3.. of phasedC20.txt, then columns 3.. of phased<name>.txt
# for the first field of every line in families.txt. The pasted table is
# written to phased_all.txt.
# NOTE(review): phasedC20.txt is always included up front, independent of
# families.txt -- confirm that C20 is not also listed there.
awk '
	BEGIN { cmd = "paste snps.txt <(cut -f 3- phasedC20.txt)" }
	      { cmd = cmd " <(cut -f 3- phased" $1 ".txt)" }
	END   { print cmd " >phased_all.txt" }
' families.txt | bash

#calculate flips: compare the phased data with each map and record, per
#individual column, whether the two agree
# For each of the 21 maps:
#   * the (column 1, column 2) keys of map<N>_mapped.txt are saved to the file
#     "tmp" in the current directory and the matching full rows of
#     phased_all.txt are looked up (an empty line is printed for unknown keys);
#   * those rows are pasted side by side with map<N>_mapped.txt, preceded by
#     the first line of header.txt;
#   * for every column from 4 up to half the field count, agreements between
#     the left and the right half are counted under the header label of that
#     column; disagreements are counted unless the left value is "-";
#   * one line per column is written to flips<N>.txt: the label followed by
#     "+" (more agreements), "-" (more disagreements) or "?" (tie).
for ((lg = 1; lg <= 21; lg++))
do
	{
		head -n1 header.txt
		paste <(cut -f 1,2 "map${lg}_mapped.txt" >tmp; awk 'NR == FNR { data[$1,$2] = $0; next } { print data[$1,$2] }' phased_all.txt tmp) "map${lg}_mapped.txt"
	} \
		| awk '
			NR == 1 {
				# header line: remember the label of every data column
				for (c = 4; c <= NF; ++c) family[c] = $c
			}
			NR > 1 {
				half = NF / 2
				for (c = 4; c <= half; ++c) {
					if ($c == $(c + half)) {
						++d[family[c], 1]
					} else if ($c != "-") {
						++d[family[c], -1]
					}
				}
			}
			END {
				# NF still holds the field count of the last record here
				for (c = 4; c <= NF / 2; ++c)
					print family[c] "\t" (d[family[c], 1] + 0) "\t" (d[family[c], -1] + 0)
			}
		' \
		| awk '
			$2 > $3  { print $1 "\t+" }
			$2 < $3  { print $1 "\t-" }
			$2 == $3 { print $1 "\t?" }
		' >"flips${lg}.txt"
done


# Apply the flips: line n of flips<N>.txt describes column n+3 of
# map<N>_mapped.txt. Every value in a column marked "-" is replaced by
# 1 - value; all other columns are copied unchanged. Output goes to
# map<N>_phased.txt (tab-separated).
for ((lg = 1; lg <= 21; lg++))
do
	awk -vFS="\t" -vOFS="\t" '
		NR == FNR { f[NR+3] = $2; next }
		{
			for (c = 4; c <= NF; ++c)
				if (f[c] == "-") $c = 1 - $c
			print
		}
	' "flips${lg}.txt" "map${lg}_mapped.txt" >"map${lg}_phased.txt"
done

#mask non-informative markers
# inf_all.txt: the output of inf.awk on the decompressed ../all.filtered.gz,
# with a single "#" line put in front of it, reduced to columns 3.. and pasted
# to the right of snps.txt. The extra "#" line presumably lines the rows up
# with the header line of snps.txt -- confirm against inf.awk.
zcat ../all.filtered.gz \
	| awk -f inf.awk \
	| awk 'BEGIN { print "#" } { print }' \
	| cut -f 3- \
	| paste snps.txt - >inf_all.txt

# Mask the phased maps. For each of the 21 maps the rows of map<N>_phased.txt
# are pasted next to the matching rows of inf_all.txt (looked up by columns 1
# and 2; the keys are saved to the file "tmp" in the current directory).
# Column c of the map (4 <= c <= size) is paired with column c+size of the
# pasted line. Wherever two consecutive rows differ in column c and neither
# value is "-", the values are replaced by "-" going upwards from the previous
# row and downwards from the next row until a row is reached whose paired
# column equals 1 or 3; the current row is masked too unless its paired column
# is 1 or 3. Finally the first "size" columns are written to
# map<N>_masked.txt, tab-separated.
# NOTE(review): size=334 is hard-coded and must equal the number of columns of
# map<N>_phased.txt -- confirm for other data sets. The values 1 and 3
# presumably mark informative markers in the inf.awk output -- confirm.
for ((lg = 1; lg <= 21; lg++))
do
	paste "map${lg}_phased.txt" <(cut -f 1,2 "map${lg}_phased.txt" >tmp; awk 'NR == FNR { data[$1,$2] = $0; next } { print data[$1,$2] }' inf_all.txt tmp) \
		| awk '
			BEGIN { size = 334 }
			{
				# keep the whole table in memory, indexed by (row, column)
				for (c = 1; c <= NF; ++c) data[NR, c] = $c
			}
			END {
				for (row = 2; row <= NR; ++row) {
					for (c = 4; c <= size; ++c) {
						if (data[row-1, c] != data[row, c] && data[row-1, c] != "-" && data[row, c] != "-") {
							# mask upwards from the previous row
							k = row - 1
							while (k >= 1 && data[k, c+size] != 1 && data[k, c+size] != 3) {
								data[k, c] = "-"
								--k
							}
							# mask downwards from the next row
							k = row + 1
							while (k <= NR && data[k, c+size] != 1 && data[k, c+size] != 3) {
								data[k, c] = "-"
								++k
							}
							# finally the current row itself
							if (data[row, c+size] != 1 && data[row, c+size] != 3) {
								data[row, c] = "-"
							}
						}
					}
				}
				for (row = 1; row <= NR; ++row) {
					line = data[row, 1]
					for (c = 2; c <= size; ++c) line = line "\t" data[row, c]
					print line
				}
			}
		' >"map${lg}_masked.txt"
done

#final output, map[1-21]_masked.txt
