#!/usr/bin/env perl
use strict;
use warnings;


# This program will read in one input file that contains the following info (tab delimited):
# gene	dN	dS	dNdS	Ninteract
# where the Ninteract column specifies whether the gene is (1) or is not (0) an Ninteract gene

# undefined or missing values should be entered as "NA" without the quotes

# It will then perform a permutation test and report the test stat and one sided p-value
# that the Ninteract genes (1) have a higher value than the non-Ninteract genes.

# You must also specify, as the second argument, the 0-based index of the column to permute: dN values only (1) or dNdS (3)

# ./All_N_mt_allinteract_paml_permutation.pl paml_in.txt 3

# Command-line arguments: the input file and the 0-based index of the column
# whose values are compared between the two groups of genes.
my $inputfile1 = $ARGV[0];
my $column = $ARGV[1];

# Both arguments are required, and the column is used as an array index, so it
# has to be a non-negative integer.
if (!defined $inputfile1 || !defined $column || $column !~ /\A\d+\z/) {
	die "Usage: $0 <input file> <column index, e.g. 1 for dN or 3 for dNdS>\n";
}

my @temp;          # fields of the line currently being parsed
my @temp1;         # not used in this part of the script
my @Ninteract;     # values of the chosen column for Ninteract genes (flag 1)
my @nonNinteract;  # values of the chosen column for the other genes (flag 0)
my @permarray;     # pooled values; filled and shuffled further down
my $x;             # loop indices shared with the later sections
my $y;

# Open up the input data
# Three-argument open with a lexical handle: the file name can never be
# interpreted as an open mode, and a failure is reported on STDERR with the
# system error and a non-zero exit status.
open(my $datainput, '<', $inputfile1)
	or die "Can not open the input file $inputfile1: $!\n";

while ( my $line = <$datainput>) {
	chomp($line);
	$line =~ s/\r\z//;                 # tolerate Windows line endings
	@temp=split(/\t/,$line);
	next unless @temp;                 # blank line (or nothing but tabs)
	next if $temp[0] eq 'gene';        # header row

	my $value = $temp[$column];
	if (!defined $value) {
		# The row is too short to contain the requested column.
		print "problem parsing columns\n";
		next;
	}
	next if $value eq 'NA' || $value eq '';   # missing value
	next unless $value < 5;                   # only values below 5 are kept

	# A missing, blank or NA flag would compare numerically equal to 0 and
	# the gene would silently be counted as non-Ninteract, so report it
	# instead of classifying it.
	my $flag = $temp[4];
	if (!defined $flag || $flag eq '' || $flag eq 'NA') {
		print "problem parsing columns\n";
		next;
	}

	if($flag == 1){ # this is an Ninteract gene
		push(@Ninteract,$value);
	}
	elsif($flag == 0){ # this is not an Ninteract gene
		push(@nonNinteract,$value);
	}
	else{
		print "problem parsing columns\n";
	}
}
close($datainput);


# now calculate the test statistic
# The test statistic is the mean of the Ninteract values minus the mean of the
# nonNinteract values. A mean is undefined for an empty group, so stop with a
# clear message instead of dying with "Illegal division by zero" below.
if (!@Ninteract || !@nonNinteract) {
	die "Cannot compute the test statistic: found ", scalar(@Ninteract),
		" Ninteract and ", scalar(@nonNinteract), " nonNinteract values.\n";
}

my $counter=0;
my $sum=0;
foreach my $value (@Ninteract){
	$sum += $value;
	$counter += 1;
}
my $Ninteract_average = $sum/$counter;

$sum=0;
$counter=0;
foreach my $value (@nonNinteract){
	$sum += $value;
	$counter += 1;
}
my $nonNinteract_average = $sum/$counter;
my $test_stat = $Ninteract_average - $nonNinteract_average;


# now report values
# Both counts are the number of elements in the array ($#array is the last
# index, i.e. one less than the count).
print "Ninteract average ",$Ninteract_average,"\n";
print "Number of Ninteract considered ",scalar(@Ninteract),"\n";
print "nonNinteract average ",$nonNinteract_average,"\n";
print "Number of nonNinteract considered ",scalar(@nonNinteract),"\n";
print "Test stat ",$test_stat,"\n";

#################
# Permutations over the pooled set of genes
#################

# Pool both groups; this is the array that gets reshuffled every round
@permarray = (@Ninteract, @nonNinteract);

my $perms = 1000;
my $group_size  = scalar(@Ninteract);    # size of the first (pseudo-Ninteract) group
my $pooled_size = scalar(@permarray);
my $expected_size = $group_size + scalar(@nonNinteract);

# sanity check: the pooled array must hold exactly the values of both groups
if ($pooled_size != $expected_size) {
	print "Problem: length of permarray ",$pooled_size," ",$expected_size,"\n";
}

my @perm_diffs; # one permuted difference of means per round
foreach my $round (1 .. $perms) {
	fisher_yates_shuffle( \@permarray );    # reshuffles @permarray in place

	# After the shuffle the first $group_size entries play the role of the
	# Ninteract genes and the remaining entries that of the other genes.
	my ($first_sum, $first_n) = (0, 0);
	my ($rest_sum,  $rest_n)  = (0, 0);
	foreach my $idx (0 .. $#permarray) {
		if ($idx < $group_size) {
			$first_sum += $permarray[$idx];
			$first_n++;
		}
		else {
			$rest_sum += $permarray[$idx];
			$rest_n++;
		}
	}

	my $first_mean = $first_sum/$first_n;
	my $rest_mean  = $rest_sum/$rest_n;
	push(@perm_diffs, $first_mean - $rest_mean);
}



# Rank the permuted differences from smallest to largest
my @perm_diffs_sorted = sort { $a <=> $b } @perm_diffs;
#print "@perm_diffs_sorted\n";

# $pval is the position of the first permuted difference that is at least as
# large as the test stat, i.e. the number of permuted differences strictly
# below it. It stays at $perms when the test stat exceeds every permuted value.
my $pval = $perms;
foreach my $rank (0 .. $#perm_diffs_sorted) {
	if ($perm_diffs_sorted[$rank] >= $test_stat) {
		$pval = $rank;
		last;
	}
}

# one-sided p-value: the fraction of permuted differences not below the test stat
print "P = ",1-($pval/$perms),"\n";




# fisher_yates_shuffle( \@array ) :
#   Generate a random permutation of @array in place (Fisher-Yates shuffle).
#
#   Argument: a reference to the array to shuffle; the referenced array itself
#             is reordered.
#   Returns:  nothing.
sub fisher_yates_shuffle {
    my $array = shift;

    # Arrays with fewer than two elements have nothing to swap. Without this
    # guard an empty array would start a "--$i" countdown at 0, step to -1
    # (a true value) and index the array with negative subscripts.
    return if @$array < 2;

    # Walk from the last index down to 1, swapping each element with a
    # randomly chosen element at or below its own position.
    for (my $i = $#{$array}; $i > 0; $i--) {
        my $j = int rand($i + 1);    # random index in 0 .. $i
        next if $i == $j;
        @$array[$i, $j] = @$array[$j, $i];
    }
    return;
}


