#!/usr/bin/env perl
use strict;
use warnings;
use File::Basename qw(basename);


# This program reads two files. The first contains the coordinates of
# all N-mt interacting genes, their acronyms, and a flag (1 or 0) saying whether
# or not they interact directly with mt genes.

# The second file contains statistics calculated in windows, with the window coordinates.
# First the mean stat of N-mt interacting (1) and non-interacting (0) genes is calculated.
# Then permutations are performed: the interaction labels are permuted n times and the
# difference between the two categories is recalculated each time. This allows a p value
# for the observed difference to be estimated.

# this program deliberately ignores all genes in chrX.  A separate script will be generated
# that is only for chrX

# execute like this:
# ./All_N_mt_allinteract_stat_column_permutation.pl FINAL_OXPHOS_ARP2_MRP_MTREPLICATION_allinteractexceptC2_andallother_genez_orientation.txt stats_in_windows_nig.csv 7

# where the last argument is the 1-based column of the statistic being tested.
# TajD is 3
# seg sites is 4
# Fu.Li.F is 5
# Fu.Li.D is 6
# Fay.Wu.H is 7
# Zeng.E is 8

# ---- command line ----
# Three arguments are required:
#   1. the gene annotation file
#   2. the file with stats in windows
#   3. the 1-based column of the statistic being tested
# Stop with a usage message (and a non-zero exit status) when one is missing,
# rather than carrying undefined values into the opens and comparisons below.
if(@ARGV < 3){
	die "Usage: $0 gene_annotation_file stats_in_windows_file stat_column\n";
}

my $inputfile1 = $ARGV[0];
my $inputfile2 = $ARGV[1];
my $stat_column = $ARGV[2];

# the column is later used as an array index ($stat_column-1), so it has to be
# a whole number of at least 1
if(($stat_column !~ /\A[0-9]+\z/)||($stat_column < 1)){
	die "The stat column must be a 1-based column number, but got '$stat_column'\n";
}

# Short labels for the statistics that have one; any other column keeps the
# column number that was typed as its label. The label goes into the output
# file name and the output header.
my %stat_name_for = (
	3 => "TajD",
	7 => "FW_H",
	8 => "Zeng_E",
);
my $stat_name = $stat_column;
if(exists $stat_name_for{$stat_column+0}){
	$stat_name = $stat_name_for{$stat_column+0};
}

# Short labels for the two known gene annotation files; any other file is
# labelled with its own file name. The directory part is removed first so that
# a file given with a path is still recognised and so that no path separator
# ends up inside the output file name.
my %division_name_for = (
	"FINAL_OXPHOS_ARP2_MRP_MTREPLICATION_allinteractexceptC2_andallother_genez_orientation.txt" => "N_interact",
	"FINAL_OXPHOS_ARP2_MRP_onlyOXPHOSinteractexceptC2_andallother_genez.txt" => "only_OXPHOS",
);
my $division_name = basename($inputfile1);
if(exists $division_name_for{$division_name}){
	$division_name = $division_name_for{$division_name};
}

# ---- variables shared by the rest of the script ----

# this is added to the start coordinate of a window to get the end of the window
my $window_size = 100_000;

# gene records; the keys are built from chromosome and coordinates when the
# gene file is read
my %OXPHOS = ();

# holds the fields of the line that is currently being parsed
my @temp = ();

# loop indexes for the permutation and p value loops
my ($y, $x);

# running position used when the test stat is located among the permutations
my $counter = 0;

# declared here but not referenced by the code below
my (@windowsites, @stat_values);
my $sumsites = 0;

# first open up the OXPHOS gene info (# OXPHOS_ARP2_MRP_andallother_genez.txt)
#
# Every data line is tab-delimited. The fields used here are:
#   0 gene acronym, 1 complex, 2 chromosome, 3 and 4 gene coordinates,
#   5 mt interaction flag, 6 orientation ('+' or '-')
# Each gene is stored in %OXPHOS under the key "chr_coordinate_coordinate".
# For '+' genes the key is chr_field3_field4, for '-' genes it is
# chr_field4_field3, so the first coordinate in the key is always the one that
# is later compared with the window coordinates.
# The header line (first field is 'gene') and all chrX genes are skipped.
#
# The file is opened with a three argument open so that characters in the file
# name are never taken as an open mode, and a failure stops the script with a
# non-zero exit status and the reason given by the operating system.
open(my $gene_fh, '<', $inputfile1)
	or die "Can not open the gene input file '$inputfile1': $!\n";

while ( my $line = <$gene_fh>) {
	chomp($line);
	$line =~ s/\r\z//; # a Windows line ending would otherwise stay glued to the orientation field
	next if($line !~ /\S/); # nothing to parse on a blank line
	my @fields = split(/\t/,$line);
	my ($gene, $complex, $chr, $coord_first, $coord_second, $mt_interact, $orientation) = @fields;

	next if((defined $gene)&&($gene eq 'gene')); # header line
	next if((defined $chr)&&($chr eq 'chrX')); # deliberately ignores chrX

	# a line with fewer than 7 fields has no orientation and ends up in the else branch
	my $key;
	if((defined $orientation)&&($orientation eq '+')){ # the gene is in the forward orientation
		$key = join("_", $chr, $coord_first, $coord_second);
	}
	elsif((defined $orientation)&&($orientation eq '-')){ # the gene is in the reverse orientation
		$key = join("_", $chr, $coord_second, $coord_first);
	}
	else{
		print "something wrong with gene orientation $line\n";
		next;
	}

	$OXPHOS{$key}{"gene"} = $gene;
	$OXPHOS{$key}{"complex"} = $complex;
	$OXPHOS{$key}{"mt_interact"} = $mt_interact;
}
close($gene_fh);

# The name of the output file is built from the stats file name, the division
# label and the stat label.
my $outputfile = $inputfile2."_".$division_name."_".$stat_name."_density.txt";

# Both files are opened with a three argument open, so the file names coming
# from the commandline are never interpreted as an open mode. A failure stops
# the script with a non-zero exit status and the reason from the operating system.
open(OUTFILE, '>', $outputfile)
	or die "I can't write to $outputfile: $!\n";

# now open up the stat data
open(DATAINPUT2, '<', $inputfile2)
	or die "Can not open the stats input file '$inputfile2': $!\n";

# Go through the stats file window by window. For every window one line is
# written to the output file with the stat of the window, whether the window
# contains the start of any gene, whether it contains the start of any N
# interacting gene (mt_interact is 1), the two counts, and the acronyms of the
# N interacting genes. Every gene whose start is in a window also gets the stat
# of that window stored in $OXPHOS{$key}{"start_stat"}.
#
# NOTE(review): both window edges are inclusive, so a gene that starts exactly
# on the coordinate shared by two windows is counted in both, and keeps the
# stat of the later one - confirm that this is intended.

# Parse the gene keys once, before the windows are read, and group them by
# chromosome. The key is taken apart from the right hand side (the last two
# pieces are the coordinates) so that chromosome names that contain an
# underscore stay in one piece. The keys are sorted so that the order of the
# acronyms in the output is the same on every run.
my %genes_on_chr;
foreach my $key (sort keys %OXPHOS){
	next unless($key =~ /\A(.*)_([^_]*)_[^_]*\z/);
	push(@{ $genes_on_chr{$1} }, [ $key, $2 ]);
}

print OUTFILE "chr\tpos\t".$stat_name."\tcontainsgenes\tcontainsNinteractgenez\tnumber_of_genes\tnumber_of_Ninteractgenez\tNinteract_acronym\n";

while ( my $line = <DATAINPUT2>) {
	chomp($line);
	$line =~ s/\r\z//; # a Windows line ending would otherwise stay glued to the last column
	my @fields = split(/\t/,$line);

	# the header line and lines without a chromosome and a position are not windows
	next if((@fields < 2)||($fields[0] eq 'chr'));

	my $window_chr = $fields[0];
	my $window_start = $fields[1];
	my $window_end = $window_start+$window_size;

	# a window that does not have the requested column has no stat
	my $window_stat = $fields[$stat_column-1];
	$window_stat = 'NA' unless(defined $window_stat);

	my $number_of_genes_in_this_window=0;
	my @Ninteract_acronyms;

	# cycle through each gene on the chromosome of this window
	foreach my $gene_on_chr (@{ $genes_on_chr{$window_chr} || [] }){
		my ($key, $gene_start) = @{$gene_on_chr};
		# only the start of the gene decides whether the gene is in the window
		next if(($gene_start < $window_start)||($gene_start > $window_end));
		$number_of_genes_in_this_window+=1;
		$OXPHOS{$key}{"start_stat"} = $window_stat;
		if($OXPHOS{$key}{"mt_interact"} == 1){
			push(@Ninteract_acronyms, $OXPHOS{$key}{"gene"});
		}
	} # start is in block

	my $number_of_Ninteract_genes_in_this_window = scalar(@Ninteract_acronyms);
	my $gene_containing_window = ($number_of_genes_in_this_window > 0) ? 1 : 0;
	my $N_interact_window = ($number_of_Ninteract_genes_in_this_window > 0) ? 1 : 0;
	# a window without N interacting genes gets "-" in the acronym column
	my $Ninteract_acronym = ($N_interact_window == 1) ? join(",", @Ninteract_acronyms) : "-";

	print OUTFILE join("\t", $window_chr, $window_start, $window_stat, $gene_containing_window,
		$N_interact_window, $number_of_genes_in_this_window,
		$number_of_Ninteract_genes_in_this_window, $Ninteract_acronym),"\n";
}

# Now the OXPHOS hash has the stat of the window for every gene whose start
# was inside a window

my @ave_stat_for_perms; # one stat per gene that is counted below, for the permutations
my @ave_stat_for_perms_all_N_mt; # this has only the mtinteractors and the non-interactors that are still OXPHOS, MRP, or ARP2

close DATAINPUT2;

# Nothing is written to the output file after this point. Buffered write
# errors (a full disk for example) only show up when the file is closed,
# so the result of the close is checked.
close(OUTFILE) or die "Problem closing $outputfile: $!\n";

my $stat_associated=0; # sum of the stats of the genes with mt_interact equal to 1
my $stat_non_associated=0; # sum of the stats of the genes with mt_interact equal to 0
my $stat_non_associated_only_N_mt=0;
my $n_stat_associated=0; # this also works for only_N_mt
my $n_stat_non_associated=0; # this includes all non associated genes, including those anywhere 
my $n_stat_non_associated_only_N_mt=0; # this includes only non associated N-mt genes 
									  # (i.e., OXPHOS, MRP, ARP2 that don't directly interact with mt genes)
my $N_interact_counter=0;


# now add up the stat for associated and non-associated genes
# The keys are sorted so that the sums are added up in the same order on every run.
foreach my $key (sort keys %OXPHOS){
	my $gene = $OXPHOS{$key};

	# genes that were not in any window, or were in a window without a stat, are left out
	next unless(exists($gene->{"start_stat"}));
	my $stat = $gene->{"start_stat"};
	next if((!defined($stat))||($stat eq 'NA'));

	# A stat is only kept for the permutations when the gene is counted in one
	# of the two categories. This keeps the list of stats the same length as
	# the list of 1s and 0s that is shuffled later on.
	if($gene->{"mt_interact"} == 1){
		$N_interact_counter+=1;
		$stat_associated += $stat;
		$n_stat_associated += 1;
		push(@ave_stat_for_perms,$stat);
	}
	elsif($gene->{"mt_interact"} == 0){
		$stat_non_associated += $stat;
		$n_stat_non_associated += 1;
		push(@ave_stat_for_perms,$stat);
	}
	else{
		warn "ignoring gene ",$gene->{"gene"}," because mt_interact is neither 1 nor 0\n";
	}
}

# now report values
print "Number of associated blocks ",$n_stat_associated,"\n";
print "Number of N_interact genes ",$N_interact_counter,"\n";

# A mean can not be calculated for a category without genes. Stop here with an
# explanation instead of a bare "Illegal division by zero".
if(($n_stat_associated == 0)||($n_stat_non_associated == 0)){
	die "Can not compare the categories: ",$n_stat_associated," associated and ",
	$n_stat_non_associated," non-associated genes have a stat\n";
}

my $mean_stat_associated = $stat_associated/$n_stat_associated;
my $mean_stat_non_associated = $stat_non_associated/$n_stat_non_associated;

print "Mean stat of all associated N_mt genes ",$mean_stat_associated,"\n";
print "Mean stat non-associated all genes ",$mean_stat_non_associated,"\n";
#print "Mean stat non-associated only N_mt ",$stat_non_associated_only_N_mt/$n_stat_non_associated_only_N_mt,"\n";


#################
# ALL COMPLEXES perms including all genes
#################

# calculate test statistic for all genes
# (mean of the associated genes minus mean of the non-associated genes)
my $test_stat = $mean_stat_associated - $mean_stat_non_associated;

# do permutations for all_complex_mt_interact vs all_other_genes
# first make an array that will be shuffled, with as many 1s as there are
# associated genes with a stat and as many 0s as there are non-associated
# genes with a stat
my @associated_or_not_array = (('1') x $n_stat_associated, ('0') x $n_stat_non_associated);

my $perms=1000;

# first check if the length of the permutation array is the same as the stat array
# for this analysis (a numeric comparison, both values are array indexes)
if($#associated_or_not_array != $#ave_stat_for_perms){
	print "Problem: length associated_or_not_array ",$#associated_or_not_array,
	" length stat array ",$#ave_stat_for_perms,"\n";
}

# One difference (mean of the genes labelled 1 minus mean of the genes labelled 0)
# is stored per permutation. The array has to start out empty so that it ends
# up with exactly $perms numbers in it.
my @perm_diffs = ();
for my $perm (1 .. $perms) {
	fisher_yates_shuffle( \@associated_or_not_array );    # permutes @array in place
	my $stat_associated_perm=0;
	my $stat_not_associated_perm=0;
	my $n_stat_associated_perm=0;
	my $n_stat_not_associated_perm=0;
	# the label at position $index is applied to the stat at the same position
	for my $index (0 .. $#associated_or_not_array) {
		if($associated_or_not_array[$index] == 1){
			$stat_associated_perm += $ave_stat_for_perms[$index];
			$n_stat_associated_perm += 1;
		}
		elsif($associated_or_not_array[$index] == 0){
			$stat_not_associated_perm += $ave_stat_for_perms[$index];
			$n_stat_not_associated_perm += 1;
		}
	}
	push(@perm_diffs,($stat_associated_perm/$n_stat_associated_perm) - ($stat_not_associated_perm/$n_stat_not_associated_perm));
}

# Now figure out where the test stat is among the permuted differences.
# We care about negative differences, so the p value is the proportion of
# permutations with a difference that is as low as, or lower than, the test stat.
# Counting these directly also gives the right answer (0) when the test stat is
# lower than every permuted difference.
my $n_perm_diffs_as_low = grep { $_ <= $test_stat } @perm_diffs;
my $pval = $n_perm_diffs_as_low/$perms;

#print join(" ", sort { $b <=> $a } @perm_diffs),"\n";
print "Test stat for test including all genes (if positive, then not significant):",$test_stat,"\n";
print "P = ",$pval,"\n";




# fisher_yates_shuffle( \@array ) :
    # generate a random permutation of @array in place
    #
    # Takes a reference to an array and reorders the elements of that array;
    # nothing useful is returned. Going from the last index down to index 1,
    # the element at index $i is swapped with the element at a randomly
    # picked index between 0 and $i.
    #
    # An array with fewer than two elements has nothing to swap and is left
    # as it is. Without this check an empty array would make the loop start
    # at index -1.
    sub fisher_yates_shuffle {
        my $array = shift;
        return if @$array < 2;
        for (my $i = $#{$array}; $i > 0; $i--) {
            my $j = int rand ($i+1);
            next if $i == $j;
            @$array[$i,$j] = @$array[$j,$i];
        }
        return;
    }


