#!/usr/perl -w
use strict;
use File::Find;
use Cwd qw{abs_path};
use Benchmark;
use File::Basename;
use Getopt::Long qw(GetOptions);
use Pod::Usage qw(pod2usage);
# use constant NUM_CHILD => 2;
# ---------------------------------------------------------------------------
# Main script.
# Parses the command line, collects every *.g.vcf file below the input
# directory and combines them with GATK CombineGVCFs.  With one thread a
# single CombineGVCFs run is made; with more threads one run per interval
# file (<ref>-1.intervals .. <ref>-N.intervals) is made in parallel and the
# parts are concatenated with CatVariants.
# ---------------------------------------------------------------------------

# Defaults for the command-line options below.
my $gatk_uppmax='/sw/apps/bioinfo/GATK/3.5.0/GenomeAnalysisTK.jar'; #for uppmax
my $mem='32g';
my $num_child=2;
my $help=0;
if ((@ARGV==0) && (-t STDIN)) {
	 pod2usage(-verbose => 0, -message =>  "$0: Argument required.\n");
}
# Remember the invocation before GetOptions consumes @ARGV.
my $command_line=$0.' '.join(" ",@ARGV);

GetOptions('in=s'		=> \my $input_dir,
			'out=s'		=> \my $output_dir,
			'ref=s'		=> \my $ref_capture, 
			'post=s'	=> \my $postfix,
			'threads=i' => \$num_child,
			'memory=s'	=> \$mem,
			'gatk=s'	=> \$gatk_uppmax,
            'help|?'	=>	\$help) or pod2usage(-msg => 'Wrong options', -verbose => 1);

pod2usage(-verbose=>2) if $help;		
pod2usage(-msg => 'Please give a directory name for sequence files', -verbose=>2) unless($input_dir);
pod2usage(-msg => 'Please give a genome reference in fasta', -verbose=>2) unless($ref_capture);
pod2usage(-msg => 'Please give a name for output  directory', -verbose=>2) unless($output_dir);
pod2usage(-msg => 'Please give a postfix for combined GVCF name', -verbose=>2) unless($postfix);
# Zero or negative thread counts would leave the interval list empty.
pod2usage(-msg => 'Number of threads must be at least 1', -verbose=>2) if($num_child < 1);


print STDERR "\n####################################################################################################\n";
printf STDERR "%40s%20s\n","ANALYSIS","STARTED!";
print STDERR "####################################################################################################\n\n";
print STDERR "$command_line\n\n";

my $start_run=Benchmark->new;
# The tmp directory lives inside the output directory, so make sure the
# latter exists first.
unless(-d $output_dir){
	mkdir $output_dir or die "failed to create output directory $output_dir: $!";
}
my $tmp_dir=$output_dir.'/tmp'; 
unless(-d $tmp_dir){
	mkdir $tmp_dir or die "failed to create tmp directory $tmp_dir: $!";
}
my ($ref_basename, $ref_path)=fileparse($ref_capture, qw{\.fa});

my @gvcf_files=get_filenames_recursive($input_dir, 'g.vcf');
@gvcf_files=sort {$a cmp $b} @gvcf_files;
print STDERR scalar @gvcf_files, " gVCF files have been found\n";
die "No gVCF files found under $input_dir\n" unless(@gvcf_files);

if($num_child==1){
	merge_gvcf_files(undef,$ref_capture, $output_dir, $postfix, \@gvcf_files, $tmp_dir, $mem, $gatk_uppmax);
}else{
	# One interval file, and therefore one partial gVCF, per requested thread.
	my $num_intervals=$num_child;
	my @gatk_interval=map { $ref_path.$ref_basename.'-'.$_.'.intervals' } 1..$num_intervals;
	# create_interval_by_fasta($ref_capture, $num_child) unless(-e $gatk_interval[0]); ### for safe not create the interval files if already exist
	my @missing_interval=grep { not -e $_ } @gatk_interval;
	die "Interval file(s) not found: @missing_interval\n" if(@missing_interval);

	print STDERR "Combine all GVCF together\n";
	my @pending_interval=@gatk_interval;
	my $files_no_idx=all_have_vcf_index(\@gvcf_files); # return 1 or list of files without idx
	if(ref $files_no_idx){ ## only to create idx first
		# Run once, serially, over the un-indexed files so the .idx files
		# exist before several jobs read the same inputs in parallel.
		my $fake_interval=$ref_path.$ref_basename.'-0.intervals';
		if(-e $fake_interval){
			merge_gvcf_files($fake_interval,$ref_capture, $output_dir, $postfix, $files_no_idx, $tmp_dir, $mem, $gatk_uppmax);
		}else{
			merge_gvcf_files($gatk_interval[0],$ref_capture, $output_dir, $postfix, $files_no_idx, $tmp_dir, $mem, $gatk_uppmax);
			# The part written for interval 1 is only complete when it was
			# built from every input file; otherwise interval 1 is redone
			# below with the full list.
			shift @pending_interval if(@$files_no_idx==@gvcf_files);
		}

	}

	##################
	if(@pending_interval){
		# Never ask for more simultaneous jobs than there are intervals left.
		my $threads=@pending_interval < $num_child ? scalar @pending_interval : $num_child;
		parallele_machine_by_splice(\@pending_interval, $threads, \&merge_gvcf_files, $ref_capture, $output_dir, $postfix, \@gvcf_files, $tmp_dir, $mem, $gatk_uppmax);
	}
	# Parts are numbered after the interval files, independent of how many
	# of them were processed in the parallel step.
	my @gvcf_part=map { $output_dir.'/combined_'.$postfix.'-'.$_.'.g.vcf' } 1..$num_intervals;
	### concatenate gvcf
	my $gvcf_file=$output_dir.'/combined_'.$postfix.'-HC-p2.g.vcf';
	# my $con_gvcf_cmd='vcf-concat '.join(" ", @gvcf_part)." > $gvcf_file";
	# system($con_gvcf_cmd) == 0 or die "system failed $con_gvcf_cmd: $?";
	# List form of system: no shell is involved, so paths with spaces or
	# shell metacharacters reach java unchanged.
	my @catGVCF_cmd=('java', '-cp', $gatk_uppmax, 'org.broadinstitute.gatk.tools.CatVariants', '-R', $ref_capture, (map { ('-V', $_) } @gvcf_part), '-out', $gvcf_file, '-assumeSorted');
	print STDERR "\t\t@catGVCF_cmd\n";
	system(@catGVCF_cmd) == 0 or die "system failed @catGVCF_cmd: $?";

	unlink $output_dir.'/combined_'.$postfix.'-0.g.vcf', @gvcf_part;
}

print STDERR "Combination of all GVCF completed\n";
my $end_run=Benchmark->new;
print STDERR "\nJob took ", timestr(timediff($end_run,$start_run)),"\n";



####################### functions
# Report which gVCF files lack an index file.
#
# Argument: reference to an array of gVCF paths.  For each path the file
# "<path>.idx" is looked up.
# Returns 1 when every path has an index, otherwise a reference to an array
# holding the paths without one (in input order).
sub all_have_vcf_index{
	my ($gvcf_files_ref)=@_;
	my @unindexed=grep { not -e $_.'.idx' } @$gvcf_files_ref;
	return \@unindexed if(@unindexed);
	return 1;
}


# Split the sequence identifiers of a FASTA file into interval files.
#
# Arguments:
#   $seq_file  - FASTA file; the identifier of a record is the first
#                whitespace-delimited word after '>'.
#   $num_files - number of interval files to write (at least 1).
#
# Writes <seq_file without .fa>-1.intervals .. -<num_files>.intervals, one
# identifier per line.  Every file gets int(#ids / num_files) identifiers;
# the last file additionally receives the remainder.
# The headers are read directly, so no module beyond those already loaded by
# this script is required.
# Dies when $num_files is below 1 or a file cannot be opened or closed.
sub create_interval_by_fasta{
	my ($seq_file, $num_files)=@_;
	die "create_interval_by_fasta: number of interval files must be at least 1\n" unless(defined $num_files && $num_files >= 1);
	$num_files=int($num_files);

	# Collect the identifiers in file order.
	open(my $fasta_fh, '<', $seq_file) or die "Cannot read $seq_file: $!";
	my @id=();
	while(my $line=<$fasta_fh>){
		push @id, $1 if($line=~/^>(\S+)/);
	}
	close($fasta_fh);
	print STDERR scalar @id, " sequences have been read from $seq_file\n";

	my ($file_name, $path)=fileparse($seq_file, qw{\.fa});
	my $basename=$path.$file_name;
	my $seq_per_file=int(scalar @id / $num_files);
	foreach my $index (1..$num_files){
		my $output=$basename.'-'.$index.'.intervals';
		# The last file takes whatever is left, i.e. its share plus the remainder.
		my @interval_to_write=($index==$num_files) ? splice(@id, 0) : splice(@id, 0, $seq_per_file);
		open(my $inter_fh, '>', $output) or die "Cannot write to $output: $!";
		# Nothing is printed for an empty chunk, so no blank line ends up in the file.
		print {$inter_fh} map { "$_\n" } @interval_to_write;
		close($inter_fh) or die "Cannot write to $output: $!";
		print STDERR scalar @interval_to_write, " intervals have been written into $output\n";
	}
	return;
}

# Run GATK CombineGVCFs over a list of gVCF files.
#
# Arguments (positional):
#   $interval      - interval file passed to -L, or undef to run without -L.
#                    Its name must end in "-<number>.intervals"; the number
#                    becomes part of the output file name.
#   $ref_capture   - reference FASTA passed to -R
#   $output_dir    - directory receiving the combined gVCF
#   $postfix       - label used in the output file name
#   $gvcf_file_ref - reference to an array of input gVCF paths (one -V each)
#   $tmp_dir       - directory given to the JVM as java.io.tmpdir
#   $mem           - value appended to the JVM -Xmx option
#   $gatk_uppmax   - path of the GATK jar
#
# Output: <output_dir>/combined_<postfix>-<number>.g.vcf when an interval is
# given, <output_dir>/combined_<postfix>-HC-p2.g.vcf otherwise.
# Dies when the input list is empty, the interval name carries no number, or
# the java command exits with a non-zero status.
sub merge_gvcf_files{
	my ($interval, $ref_capture, $output_dir, $postfix, $gvcf_file_ref, $tmp_dir, $mem, $gatk_uppmax)=@_;
	die "merge_gvcf_files: no gVCF files to combine\n" unless($gvcf_file_ref && @$gvcf_file_ref);

	my $combined_gvcf_file=$output_dir.'/combined_'.$postfix;
	my @combined_gvcf_cmd=('java', "-Djava.io.tmpdir=$tmp_dir", '-Xmx'.$mem, '-jar', $gatk_uppmax, '-R', $ref_capture, '-T', 'CombineGVCFs');
	push @combined_gvcf_cmd, map { ('-V', $_) } @$gvcf_file_ref;
	if(defined $interval){
		my ($index)=$interval=~/\-(\d+)\.intervals$/;
		# Without the number the output name would silently become "...-.g.vcf".
		die "merge_gvcf_files: cannot derive a part number from interval file name '$interval'\n" unless(defined $index);
		push @combined_gvcf_cmd, '-L', $interval;
		$combined_gvcf_file.='-'.$index.'.g.vcf';
	}else{
		$combined_gvcf_file.='-HC-p2.g.vcf';
	}
	push @combined_gvcf_cmd, '-o', $combined_gvcf_file;
	# List form of system: arguments are handed to java without a shell, so
	# file names containing spaces or shell metacharacters stay intact.
	system(@combined_gvcf_cmd)== 0 or die "system @combined_gvcf_cmd failed: $?";
}

# Run one job per list element, each in its own forked child, and wait for
# all of them.
#
# Arguments:
#   $array    - reference to an array; every element starts one child
#   $function - code reference, called in the child as
#               $function->($element, @options)
#   @options  - extra arguments passed through to $function
#
# A child exits with status 0 when $function returns and with status 1 when
# it dies (the error text is printed to STDERR).
# Dies in the parent when fork fails or when any child ended with a
# non-zero status, so callers do not continue with incomplete results.
sub parallele_machine{
	my ($array, $function, @options)=@_;
	my @child;
	foreach my $child_process (@$array){
		my $pid=fork();
		# fork returns undef on failure; that must be tested before the
		# value is compared with 0, because undef==0 is true.
		unless(defined $pid){
			my $fork_error=$!;
			waitpid($_,0) foreach @child; # do not leave started children behind
			die "Cannot fork: $fork_error";
		}
		if($pid){
			push @child, $pid; # parent process
			next;
		}
		# child process: trap errors here so a die cannot unwind into the
		# caller's code inside the child.
		my $ok=eval { $function->($child_process, @options); 1 };
		print STDERR $@ unless($ok);
		exit($ok ? 0 : 1);
	}
	my @failed=();
	foreach my $pid (@child){
		my $reaped=waitpid($pid,0);
		push @failed, $pid if($reaped==$pid && $? != 0);
	}
	die "parallele_machine: ".scalar(@failed)." child process(es) failed\n" if(@failed);
	return;
}

# Process a job list in batches of at most $num_threads parallel children.
#
# Arguments:
#   $array        - reference to the job list; it is consumed (emptied) as
#                   the batches are taken from its front
#   $num_threads  - maximum number of jobs per batch; values below 1 are
#                   treated as 1 so that the jobs still run
#   $function_ref - code reference run for every job by parallele_machine
#   @options      - extra arguments passed through to $function_ref
#
# Each batch is finished before the next one starts.  splice returns the
# remaining elements when fewer than $num_threads are left, so every job is
# run exactly once whatever the relation between list length and thread
# count.
sub parallele_machine_by_splice{
	my ($array, $num_threads, $function_ref, @options)=@_;
	$num_threads=1 unless(defined $num_threads && $num_threads >= 1);
	while(my @child_process=splice(@$array, 0, $num_threads)){
		parallele_machine(\@child_process, $function_ref, @options);
	}
	return;
}
# Collect all regular files below a directory whose name ends in ".<filter>".
#
# Arguments:
#   $parent_path - directory to search recursively
#   $filter      - literal file name ending without the leading dot
#                  (e.g. 'g.vcf'); compared case-insensitively
#
# Returns the list of matching files as absolute paths.
# Dies when $parent_path is not an existing directory.
sub get_filenames_recursive{
	my ($parent_path, $filter)=@_;
	my $root=defined $parent_path ? abs_path($parent_path) : undef;
	unless(defined $root && -d $root){
		my $shown=defined $parent_path ? $parent_path : '';
		die "get_filenames_recursive: '$shown' is not a directory\n";
	}
	# \Q..\E keeps the dots of the ending literal, so 'g.vcf' cannot match
	# a name such as 'sample.gxvcf'.
	my $suffix_re=qr/\.\Q$filter\E$/i;
	my @filenames=();
	find(sub{
		push @filenames, $File::Find::name if(-f $File::Find::name && $_=~$suffix_re);
	}, $root);
	return(@filenames);
}


__END__

###################################### usage
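=head1 NAME

merge-gvcf.pl - combine the gVCF files found below a directory into one gVCF with GATK CombineGVCFs
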
=head1 SYNOPSIS

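 perl merge-gvcf.pl -in <input dir>
                    -out <output dir>
                    -ref <reference genome in FASTA>
                    -post <postfix for the combined gVCF name>
                    [-threads <number of parallel jobs, default 2>]
                    [-memory <Java heap size, default 32g>]
                    [-gatk <path to GenomeAnalysisTK.jar>]
                    [-help]
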
=head1 OPTIONS

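=over 4

=item B<-in> I<dir>

Directory that is searched recursively for input files ending in C<.g.vcf>. Required.

=item B<-out> I<dir>

Directory for the combined gVCF. A C<tmp> subdirectory inside it is used as the Java temporary directory. Required.

=item B<-ref> I<file>

Reference genome in FASTA format. When more than one thread is used, interval files named like the reference with C<.fa> replaced by C<-1.intervals>, C<-2.intervals>, ... are read from the same directory. Required.

=item B<-post> I<string>

Postfix used in the output name: C<combined_E<lt>postfixE<gt>-HC-p2.g.vcf>. Required.

=item B<-threads> I<int>

Number of interval files, and thus of CombineGVCFs jobs run in parallel (default 2). With 1, a single job is run without interval files.

=item B<-memory> I<size>

Value passed to the C<-Xmx> option of the JVM running CombineGVCFs (default 32g).

=item B<-gatk> I<file>

Path to the GATK jar (default /sw/apps/bioinfo/GATK/3.5.0/GenomeAnalysisTK.jar).

=item B<-help>

Print this documentation and exit.

=back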


=head1 AUTHORS

Jun Chen modified from the pipeline of John Baison SLU Umea

=cut
