#!/usr/bin/perl -w
use strict;
use File::Basename;
use File::Find;
use Cwd qw{abs_path};
# use constant NUMH => 21; # number of header 
use constant GT_COL=> 9; # start position of individual info
use constant NLINE => 100000;

my $indir=shift;
my $outdir=shift;
# my $header_file='qmat-ID-samples4SFS.txt';
my $header_file='obovata-ID-list.txt';
##
open(IN, $header_file) or die "Cannot open $header_file: $!";
$_=<IN>;
my @samples=<IN>;
close(IN);
our %sample_list=();
foreach (@samples){
	chomp;
	# my ($id,$pop)=(split/\t/)[0,4];
	# my ($id,$pop)=(split/\t/)[0,11];
	my ($id,$pop)=(split/\t/)[0,17];
	$sample_list{$id}=$pop;
	unless(-d $outdir.'/'.$pop){
		mkdir $outdir.'/'.$pop or die "Fail to create a folder: $!";
	}
}
print STDERR scalar keys %sample_list, " individuals selected\n";
###
my @filenames=get_filenames_recursive($indir, 'vcf.gz');
foreach my $file (@filenames){
	print STDERR "\nProcessing $file ... \n";
	open(my $fh, "gzip -d -c $file|") or die "Cannot open  $file: $!";
	## header
	my @ind_ids=();
	while(<$fh>){
		next if(/^#{2,}/);
		if(/^#CHROM/){
			chomp;
			my $header_line=$_;
			my @header_cols=split/\t/,$header_line;
			@ind_ids=splice(@header_cols, GT_COL);
			last;
		}
	}
	my @index=();
	foreach (0..$#ind_ids){
		my $head_name=$ind_ids[$_];
		$head_name=~s/\-/\_/g;
		$ind_ids[$_]=$head_name;
		if(exists $sample_list{$head_name}){
			push @index, $_;
		}
	}
	print STDERR join(" ,", @index)," index found\n";
	my @info_lines = map~~<$fh>, 1..NLINE;
	@info_lines=@info_lines;
	while($info_lines[0]){
		parse_vcf(\@info_lines, \@ind_ids, \@index);
		@info_lines = map~~<$fh>, 1..NLINE;
		# print STDERR "Done!";
	}
}
	
sub parse_vcf{
	my ($ref_lines, $ind_ids_ref, $index_ref)=@_;
	my @snp_list=();
	my @ind_ids=@$ind_ids_ref[@$index_ref];
	# print STDERR "parsing";
	OUT1:foreach (@$ref_lines){
		last if($_ eq ''); # after EOF empty assign to array
		chomp;
		my @cols=split/\t/;
		my ($chr_id, $pos, $ref, $alt, $info, $format)=@cols[0,1,3,4,7,8];
		print join("\t", @cols),"\n" unless(defined $ref);
		next if(length($ref) > 1 || $alt eq '*');# alt allele is missing due to a upstream deletion || INDEL in either ref or alt. 
		my @inds=splice(@cols,GT_COL);
		@inds=@inds[@$index_ref];
		# next if($alt=~/DEL|INS|DUP|INV|CNV/); # we can remove this later
		my @alt_alleles=split/,/,$alt;
		 ### for INDEL not detected by line 53
		foreach (@alt_alleles){
			next OUT1 if(length($_)>1); 
		}
		
		my @alleles=();
		push @alleles, ($ref, @alt_alleles);
		#my ($ad_index, $dp_index, $geno_qual_index) = get_field_index_by_format($format);
		# die "$chr_id, $pos, $format" unless(defined $geno_qual_index);
		die "Missing columns in \#snp$chr_id, $pos" unless($#ind_ids == $#inds);
		# die "More than one alt allele found in \#snp$chr_id, $pos" if($alt=~/,/);
		foreach my $ind_index (0..$#ind_ids){
			my @gt_column=split/:/, $inds[$ind_index];
			my $gt_str=$gt_column[0];
			# die "$chr_id, $pos, $format, $geno_qual_index, $ind_index, $inds[$ind_index]" unless(defined $geno_qual);
			# my @genotype_allele_indice=split/\/|\|/,$gt_col; # remove the AD, and split GT by / (unphased) or | (phased)
			#
			# my $genotype=determine_genotype(\@alleles, \@genotype_allele_indice);
			# next unless(defined $genotype && $genotype ne $ref);
			# next if($gt_str eq '0/0');
			#$geno_qual=0 unless(defined $geno_qual);
			# my $dp=$gt_col[2];
			# my $genotype_quality=$gt_col[4];
			#my ($ref_dp,$alt_dp)=(undef,undef);
			# my $ad=$gt_column[1];
			# if(defined $ad_index and $gt_column[$ad_index] ne '.'){
			# 	my $ad=$gt_column[$ad_index];
			# 	($ref_dp,$alt_dp)=split/,/,$ad;
			# 	$alt_dp=0 unless(defined $alt_dp);
			# }else{
			# 	$alt_dp=0;
			# 	$ref_dp=$dp;
			# 	# ($ref_dp,$alt_dp)=(0,0);
			# }
			# # print $ad,"\n";
			# my @ad=split/,/,$ad;
			# my $dp=$ref_dp+$alt_dp;
			# print $dp,"\n";
			if(length($alt)==1){
				$gt_str=~s/0/$ref/g;
				$gt_str=~s/1/$alt/g;
			}else{
				foreach my $num_allele (0..$#alleles){
					$gt_str=~s/$num_allele/$alleles[$num_allele]/g;
				}
			}
			die "$chr_id, $pos, $format, $ind_index, $inds[$ind_index]" unless(defined $gt_str);
			my $print_snp_str=$chr_id."\t".$pos."\t".$ref."\t".$gt_str;
			push  @{$snp_list[$ind_index]}, $print_snp_str;
			# print {$out_fh[$ind_index]} $chr,"\t", $pos,"\t",$ref,"\t",$genotype,"\n";
		}	
	}
	# print STDERR ".";
	foreach my $ind_index (0..$#snp_list){
		next unless(defined $snp_list[$ind_index]);
		#
		my $pop=$sample_list{$ind_ids[$ind_index]};
		die "$ind_ids[$ind_index]" unless(defined $pop);
		my $output1=$outdir.'/'.$pop.'/'.$ind_ids[$ind_index].'_1.snp';
		my $output2=$outdir.'/'.$pop.'/'.$ind_ids[$ind_index].'_2.snp';
		open(OUT1, '>>', $output1) or die "Cannot write into $output1: $!";
		open(OUT2, '>>', $output2) or die "Cannot write into $output2: $!";
		foreach (@{$snp_list[$ind_index]}){
			my ($chr_id, $pos, $ref, $gt)=split/\t/,$_;
			my ($allele1,$allele2)=(undef,undef);
			
			if($gt =~/\./){
				($allele1,$allele2)=('N','N');
			}else{
				($allele1,$allele2)=(split/\/|\|/,$gt)[0,1];
			}
			print OUT1 $chr_id,"\t",$pos,"\t",$ref,"\t",$allele1,"\n";
			print OUT2 $chr_id,"\t",$pos,"\t",$ref,"\t",$allele2,"\n";
		}		
		close(OUT1);
		close(OUT2);
	}
}


# sub determine_genotype{
# 	my ($alleles_arr, $index_arr)=@_;
# 	my $gt='';
# 	foreach (@$index_arr){
# 		my $allele='';
# 		if(/\d+/){
# 			$allele=$$alleles_arr[$_];
# 		}elsif($_ eq '.'){ # missing, then be 'N'
# 			$allele='N';
# 		}else{
# 			return(undef); ## cannot be anything other than number or '.'
# 		}
#
# 		if($allele=~ /INS/){  # insert relative to ref
# 			$allele='?'; # '+' not allowed in bioperl so we use ? for insertion
# 		}elsif($allele=~/DEL/){ # deletion relative to ref
# 			$allele='-';
# 		}
#
# 		if($gt eq $allele){
# 			# do nothing
# 		}else{
# 			$gt.=$allele;
# 		}
# 	}
# 	# $gt=uc($gt);
# 	my %IUPAC_DNA=('AG' => 'R', 'GA' => 'R',
# 				   'CT' => 'Y', 'TC' => 'Y',
# 				   'GC' => 'S', 'CG' => 'S',
# 				   'AT' => 'W', 'TA' => 'W',
# 				   'GT' => 'K', 'TG' => 'K',
# 				   'AC' => 'M', 'CA' => 'M',
# 				   'CGT' => 'B','CTG' => 'B','GTC' => 'B', 'GCT' => 'B', 'TCG' => 'B', 'TGC' => 'B',
# 				   'AGT' => 'D','ATG' => 'D','GTA' => 'D', 'GAT' => 'D', 'TAG' => 'D', 'TGA' => 'D',
# 				   'ACT' => 'H','ATC' => 'H','CTA' => 'H', 'CAT' => 'H', 'TAC' => 'H', 'TCA' => 'H',
# 				   'ACG' => 'V','AGC' => 'V','CGA' => 'V', 'CAG' => 'V', 'GAC' => 'V', 'GCA' => 'V',
# 				    );
# 	if($gt eq ''){
# 		$gt=undef;
# 	}elsif(length($gt)==1){
# 		# do nothing
# 	}elsif(exists $IUPAC_DNA{$gt}){
# 		$gt=$IUPAC_DNA{$gt};
# 	}elsif($gt=~/[ATGC]{4}/){
# 		$gt='N';
# 	}else{
# 		$gt=undef;
# 	}
# 	return($gt);
# }
sub get_field_index_by_format{
	my $format=shift;
	my @fields=split/:/,$format;
	my ($ad_index, $dp_index, $geno_qual_index, $pl_index)=(undef, undef, undef, undef);
	my $index=0;
	foreach (@fields){
		$ad_index=$index if($_ eq 'AD');
		$dp_index=$index if($_ eq 'DP');
		$geno_qual_index=$index if($_ eq 'GQ' || $_ eq 'RGQ');
		# $pl_index=$index if($_ eq 'PL');
		$index++;
	}
	# if(defined $ad_index and defined $dp_index and defined $geno_qual_index){
		return(($ad_index, $dp_index, $geno_qual_index));
	# }else{
		# die "Unknown FORMAT fileds: $format";
	# }
}

sub get_filenames_recursive{
	my ($parent_path, $filter)=@_;
	$parent_path=abs_path($parent_path);
	my @filenames=();
	find sub{
		push @filenames, $File::Find::name if(-f $File::Find::name && grep(/\.$filter$/i, $_));
	}, $parent_path;
	return(@filenames);
}