#!/usr/bin/env perl
use strict;
use warnings;


# This program will read in two files. The first contains the coordinates of
# N_interact genes and their acronyms, and all other genes

# The other file is a file with ROH blocks from a population with lengths

# First identify which ROH blocks have N_interact genes and which do not.
# test stat is the difference in the summed length of ROH blocks that do and do not have N_interact genes
# Permutations then randomly assign N_interact genes and recalculate this stat
# I will also output info on gene number that can be used for linear models to test for effects of gene density

# run like this:
# ./ROH_permutation_new.pl FINAL_OXPHOS_ARP2_MRP_MTREPLICATION_allinteractexceptC2_andallother_genez_orientation.txt RG_bor_roh.txt ROH_bor_perms.out ROH_bor_density.out

my $inputfile1 = $ARGV[0];
my $inputfile2 = $ARGV[1];
#my $outfile = $ARGV[2];
#my $outfile2 = $ARGV[3];

my @windowsites;
my @Fst_values;
my $sumsites=0;
my @temp;
my $y;
my $x;
my %N_interact_hash;
my @interact_perm;

# first open up the N_interact_hash gene info 
unless (open DATAINPUT, $inputfile1) {
	print OUTFILE "Can not find input file1.\n";
	exit;
}

while ( my $line = <DATAINPUT>) {
	chomp($line);
	@temp=split('\t',$line);
	if(($temp[0] ne 'gene')&&($temp[2] ne 'chrX')){ # deliberately exclude the X from the ROH analysis
		if($temp[6] eq '+'){ # the gene is in the forward orientation
			$N_interact_hash{$temp[2]."_".$temp[3]."_".$temp[4]}{"gene"} = $temp[0];
			$N_interact_hash{$temp[2]."_".$temp[3]."_".$temp[4]}{"mt_interact"} = $temp[5];
			push(@interact_perm,$temp[5]); # this will be used for the permutations later
		}
		elsif($temp[6] eq '-'){ # the gene is in the reverse orientation
			$N_interact_hash{$temp[2]."_".$temp[4]."_".$temp[3]}{"gene"} = $temp[0];
			$N_interact_hash{$temp[2]."_".$temp[4]."_".$temp[3]}{"mt_interact"} = $temp[5];
			push(@interact_perm,$temp[5]); # this will be used for the permutations later
		}
	}	
}		
close DATAINPUT;

#print @interact_perm,"\n";
print "length ",$#interact_perm,"\n";

print "hash ",scalar keys %N_interact_hash,"\n";

#my @unique = do { my %seen; grep { !$seen{$_}++ } @interact_perm };
#print "unique @unique \n";



# now open up the ROH block data
unless (open DATAINPUT2, $inputfile2) {
	print OUTFILE "Can not find input file2.\n";
	exit;
}


my $outputfile = $ARGV[2]; # the name of the output file is from the commandline
unless (open(OUTFILE, ">$outputfile"))  {
	print OUTFILE "I can\'t write to $outputfile\n";
	exit;
}


my @temp1;
my $n_ROH_blocks_with_interacting_genez=0;
my $n_ROH_blocks_with_other_genez=0;
my $n_ROH_blocks_without_genez=0;

my $length_sum_ROH_blocks_with_interacting_genez=0;
my $length_sum_ROH_blocks_with_other_genez=0;
my $length_sum_ROH_blocks_without_genez=0;

my $n_Ninteract_genes_on_ROH_blockz=0;
my $n_genes_on_ROH_blockz=0;
my $n_genes_on_non_ROH_blockz=0;
my %ROH_blocks;
my %perm_blockz; 
my @length_array_for_perms;
my @N_array_for_perms;
my %unique_sample_tracker; # the key will be the unique samples

while ( my $line = <DATAINPUT2>) {
	chomp($line);
	@temp=split('\t',$line);
	# ignore first line
	if(($temp[0] eq 'RG')&&($temp[2] ne 'chrX')){ # all of the data begin with RG; other lines are comments or blank
			# intially set the block to be no genes (0)
			$ROH_blocks{$temp[2]."_".$temp[3]."_".$temp[4]."_".$temp[1]}{"n_all_genes"} = 0; 
			# also assume that it does not have any interacting genes (this is a binary variable)
			$ROH_blocks{$temp[2]."_".$temp[3]."_".$temp[4]."_".$temp[1]}{"interacting"} = 0;
			# also keep track of the number of Ninteract genes in each block
			$ROH_blocks{$temp[2]."_".$temp[3]."_".$temp[4]."_".$temp[1]}{"n_Ninteract_genes"} = 0;
			# also keep track of lengths
			$ROH_blocks{$temp[2]."_".$temp[3]."_".$temp[4]."_".$temp[1]}{"length"} = $temp[5];
			# also keep track of the individual
			$ROH_blocks{$temp[2]."_".$temp[3]."_".$temp[4]."_".$temp[1]}{"individual"} = $temp[1];
			$unique_sample_tracker{$temp[1]} = 1; # the key will be each unique sample
			# Now cycle through all the genes to see if any are in this block
			foreach my $key (keys %N_interact_hash){
				@temp1=split('_',$key);
				# now check if this block contains any genes
				if(($temp1[0] eq $temp[2])&&($temp1[1] >= $temp[3])&&($temp1[1] <= $temp[4])) # beginning of the gene is in this block
					{
						# this ROH block has a gene
						$ROH_blocks{$temp[2]."_".$temp[3]."_".$temp[4]."_".$temp[1]}{"n_all_genes"} += 1;
						$n_genes_on_ROH_blockz+=1;
						push(@length_array_for_perms,$temp[5]);
						# check if it is an interacting gene
						if($N_interact_hash{$key}{"mt_interact"} == 1){
							$ROH_blocks{$temp[2]."_".$temp[3]."_".$temp[4]."_".$temp[1]}{"interacting"} = 1;
							$ROH_blocks{$temp[2]."_".$temp[3]."_".$temp[4]."_".$temp[1]}{"n_Ninteract_genes"} += 1;
							print OUTFILE $N_interact_hash{$key}{"gene"}," is in ROH block ",$temp[2]."_".$temp[3]."_".$temp[4]."_".$temp[1],"\n";
							$n_Ninteract_genes_on_ROH_blockz+=1;
					}
				}
			}
	}	
}

my %all_interact_perm;
print "heey ";
foreach my $key (keys %unique_sample_tracker) {
	print $key," ";
}	
print "\n";

# ok now I have a hash that has information on whether or not each ROH block has any genes, and how many genes there are on each block 
							#(no: $ROH{$key}{"n_all_genes"}=0; yes: $ROH{$key}{"n_all_genes"}>0; )
# and whether or not any of these genes have any N_interact genes.
							#(no: $ROH{$key}{"interacting"}=0; yes: $ROH{$key}{"interacting"}=1; )
# and how many Ninteract genes there are on each block: $ROH{$key}{"n_Ninteract_genes"}						
# print these numbers

# ok calculate the mean lengths of ROH with and without genes, and with and without Ninteract genes
# also calculate the gene density of ROH blocks with and without Ninteract genes
my $gene_density_Ninteract_ROHs=0;
my $gene_density_nonNinteract_ROHs=0;

foreach my $key (keys %ROH_blocks){
	@temp1=split('_',$key);
	if($ROH_blocks{$key}{"n_all_genes"} > 0){
		if($ROH_blocks{$key}{"interacting"} == 1){
			$n_ROH_blocks_with_interacting_genez+=1;
			$length_sum_ROH_blocks_with_interacting_genez+=$ROH_blocks{$key}{"length"};
			# sum densities for each block now and later we will divide by the number of blocks to get the mean density
			$gene_density_Ninteract_ROHs += ($ROH_blocks{$key}{"n_all_genes"}/$ROH_blocks{$key}{"length"})
			#print OUTFILE2 $temp1[0]."\t".$temp1[1]."\t".$ROH_blocks{$key}{"length"}."\t1\t1\n";
		}
		else{
			$n_ROH_blocks_with_other_genez+=1;
			$length_sum_ROH_blocks_with_other_genez+=$ROH_blocks{$key}{"length"};
			# sum densities for each block now and later we will divide by the number of blocks to get the mean density
			$gene_density_nonNinteract_ROHs += ($ROH_blocks{$key}{"n_all_genes"}/$ROH_blocks{$key}{"length"})
			#print OUTFILE2 $temp1[0]."\t".$temp1[1]."\t".$ROH_blocks{$key}{"length"}."\t1\t0\n";
		}	
	}
	else{
		$n_ROH_blocks_without_genez+=1;
		$length_sum_ROH_blocks_without_genez+=$ROH_blocks{$key}{"length"};
		#print OUTFILE2 $temp1[0]."\t".$temp1[1]."\t".$ROH_blocks{$key}{"length"}."\t0\t0\n";
	}	
}	

#close OUTFILE2;

print OUTFILE "Number of ROH blocks with N_interact genes: ",$n_ROH_blocks_with_interacting_genez,"\n";
print OUTFILE "Number of ROH blocks with other genes: ",$n_ROH_blocks_with_other_genez,"\n";
print OUTFILE "Number of ROH blocks without genes: ",$n_ROH_blocks_without_genez,"\n";
print OUTFILE "Average length of ROH blocks with N_interact genes: ",$length_sum_ROH_blocks_with_interacting_genez/$n_ROH_blocks_with_interacting_genez,"\n";
print OUTFILE "Average length of ROH blocks with other genes: ",$length_sum_ROH_blocks_with_other_genez/$n_ROH_blocks_with_other_genez,"\n";
print OUTFILE "Average length of ROH blocks without genes: ",$length_sum_ROH_blocks_without_genez/$n_ROH_blocks_without_genez,"\n";
print OUTFILE "Number of genes on ROH blocks: ",$n_genes_on_ROH_blockz,"\n";
print OUTFILE "Proportion of ROH blocks with genes that have N_interact genes ",$n_ROH_blocks_with_interacting_genez/
($n_ROH_blocks_with_interacting_genez+$n_ROH_blocks_with_other_genez),"\n";
print OUTFILE "Proportion of genes on ROH blocks that are N_interact genes ",$n_Ninteract_genes_on_ROH_blockz/$n_genes_on_ROH_blockz,"\n";
print OUTFILE "Genes density on Ninteract ROH blocks:",$gene_density_Ninteract_ROHs/$n_ROH_blocks_with_interacting_genez,"\n";
print OUTFILE "Genes density on non-Ninteract ROH blocks:",$gene_density_nonNinteract_ROHs/$n_ROH_blocks_with_other_genez,"\n";

print  "Number of ROH blocks with N_interact genes: ",$n_ROH_blocks_with_interacting_genez,"\n";
print  "Number of ROH blocks with other genes: ",$n_ROH_blocks_with_other_genez,"\n";
print  "Number of ROH blocks without genes: ",$n_ROH_blocks_without_genez,"\n";
print  "Average length of ROH blocks with N_interact genes: ",$length_sum_ROH_blocks_with_interacting_genez/$n_ROH_blocks_with_interacting_genez,"\n";
print  "Average length of ROH blocks with other genes: ",$length_sum_ROH_blocks_with_other_genez/$n_ROH_blocks_with_other_genez,"\n";
print  "Average length of ROH blocks without genes: ",$length_sum_ROH_blocks_without_genez/$n_ROH_blocks_without_genez,"\n";
print  "Number of genes on ROH blocks: ",$n_genes_on_ROH_blockz,"\n";
print  "Proportion of ROH blocks with genes that have N_interact genes ",$n_ROH_blocks_with_interacting_genez/
($n_ROH_blocks_with_interacting_genez+$n_ROH_blocks_with_other_genez),"\n";
print  "Proportion of genes on ROH blocks that are N_interact genes ",$n_Ninteract_genes_on_ROH_blockz/$n_genes_on_ROH_blockz,"\n";
print  "Genes density on Ninteract ROH blocks:",$gene_density_Ninteract_ROHs/$n_ROH_blocks_with_interacting_genez,"\n";
print  "Genes density on non-Ninteract ROH blocks:",$gene_density_nonNinteract_ROHs/$n_ROH_blocks_with_other_genez,"\n";



# This is the test statistic
my $test_stat2 = sprintf("%.1f",(($length_sum_ROH_blocks_with_interacting_genez/$n_ROH_blocks_with_interacting_genez)-
($length_sum_ROH_blocks_with_other_genez/$n_ROH_blocks_with_other_genez)));



######################
# Permutations
######################

my $perms=1000;
my $switch=0;
my %counter=();
my $introg_interacter=0;
my $introg_withgenez=0;
my $introg_without_genez=0;
my $introg_withgenez_switch=0;
my $introg_interacter_switch=0;
my $genes_on_ROH_blocks=0;
my $Ninteract_genes_on_ROH_blocks=0;
my $n_ROH_blocks_with_interacting_genez_perms=0;
my $n_ROH_blocks_with_other_genez_perms=0;
my $indiv;
my @quick_perm_genez;
my $tempinteger;
my @values;
for ($y = 0 ; $y < $perms; $y++ ) {
	$n_ROH_blocks_with_interacting_genez_perms=0;
	$n_ROH_blocks_with_other_genez_perms=0;
	# shuffle the genes for each unique sample
	foreach my $key (keys %unique_sample_tracker) {
		# generate a randomized gene array for each sample
		fisher_yates_shuffle( \@interact_perm ); # permutes the N_interact assignment for each of 16049 genes
		@{$all_interact_perm{$key}} = @interact_perm;
	}
	# the @{$all_interact_perm{$key}} hash or arrays should have randomized N_interact genes for each sample
	# with the key being the name of each sample, which is also stored in $ROH_blocks{$key}{"individual"}
		# clear the counter hash for this permutation
		%counter=();
		# initialize the counter hash for each sample
		foreach my $key (keys %unique_sample_tracker) {
			$counter{$key}=0;
		}	
		$switch=0;
		$length_sum_ROH_blocks_with_interacting_genez=0;
		$length_sum_ROH_blocks_with_other_genez=0;
		$length_sum_ROH_blocks_without_genez=0;
		foreach my $key (keys %ROH_blocks){ # go through all the blocks
			$switch=0; # assume no genes in block
			if($ROH_blocks{$key}{"n_all_genes"} > 0){ # does this block have any genes?
				# if yes, go through each gene on this ROH and check if we encounter an Ninteract gene
				for ($x = 0 ; $x < $ROH_blocks{$key}{"n_all_genes"}; $x++ ) {
					$indiv = $ROH_blocks{$key}{"individual"};
					#if($all_interact_perm{$ROH_blocks{$key}{"individual"}}[$counter{$ROH_blocks{$key}{"individual"}}] eq '1'){
					$tempinteger=sprintf("%d",$counter{$indiv}); # this is the index for the random array for this individual
					if($all_interact_perm{$indiv}[$tempinteger] == 1){	
						$switch=1; # if we find any 1s, this means there is an N_interact gene
					}
					$counter{$indiv}+=1;
					# $counter{$indiv} = int(scalar($counter{$indiv})) + 1; # using this counter means we sample the randomized genes 
												   # for each sample without replacement
				}
				# if the ROH is an Ninteract ROH for this individual, add its length to the Ninteract block total
				if($switch == 1){
					$length_sum_ROH_blocks_with_interacting_genez+=$ROH_blocks{$key}{"length"};
					$n_ROH_blocks_with_interacting_genez_perms+=1;
				}
				# otherwise add it to the non-Ninteract total	
				else{
					$length_sum_ROH_blocks_with_other_genez+=$ROH_blocks{$key}{"length"};
					$n_ROH_blocks_with_other_genez_perms+=1;
				}
			}
			else{ # block has no genes
				$length_sum_ROH_blocks_without_genez+=$ROH_blocks{$key}{"length"};
			}	
		}
		#print $counter,"\n";
		push(@quick_perm_genez,sprintf("%.1f",(($length_sum_ROH_blocks_with_interacting_genez/$n_ROH_blocks_with_interacting_genez_perms)-
			($length_sum_ROH_blocks_with_other_genez/$n_ROH_blocks_with_other_genez_perms))));	
}	


if($#quick_perm_genez != $perms-1){
	print OUTFILE "Hey, something wrong with perms\n";
}



my @quick_perm_genez_sorted = sort { $a <=> $b } @quick_perm_genez;
$switch=0;
my $pval=$perms; # this will make the pval be zero if the test stat is larger than all the perms
my $counter=0;
print "hello @quick_perm_genez_sorted\n";
# now figure out where the test stat is
for ($y = 0 ; $y <= $#quick_perm_genez_sorted; $y++ ) {
	if(($test_stat2 <= $quick_perm_genez_sorted[$y])&&($switch==0)){ 
		$pval=$counter;
		$switch = 1;
	}
	$counter+=1;
}	


print OUTFILE "Test stat:",$test_stat2,"\n";
print OUTFILE "QuickP = ",1-$pval/$perms,"\n";
print "Test stat:",$test_stat2,"\n";
print "QuickP = ",1-$pval/$perms,"\n";

close OUTFILE;

# fisher_yates_shuffle( \@array ) : 
    # generate a random permutation of @array in place
    sub fisher_yates_shuffle {
        my $array = shift;
        my $i;
        for ($i = @$array; --$i; ) {
            my $j = int rand ($i+1);
            next if $i == $j;
            @$array[$i,$j] = @$array[$j,$i];
        }
    }



# ok, now make file for density plot

my $outputfile2 = $ARGV[3]; # the name of the output file is from the commandline
unless (open(OUTFILE2, ">$outputfile2"))  {
	print "I can\'t write to $outputfile2\n";
	exit;
}

print OUTFILE2 "chr\tpos\tlength\tcontainsgenes\tcontainsNinteractgenez\tnum_genes\tnum_ninteractgenes\n";

foreach my $key (keys %ROH_blocks){
	@temp1=split('_',$key);
	if($ROH_blocks{$key}{"n_all_genes"} > 0){
		if($ROH_blocks{$key}{"interacting"} == 1){
			$n_ROH_blocks_with_interacting_genez+=1;
			$length_sum_ROH_blocks_with_interacting_genez+=$ROH_blocks{$key}{"length"};
			print OUTFILE2 $temp1[0]."\t".$temp1[1]."\t".$ROH_blocks{$key}{"length"}."\t1\t1\t".$ROH_blocks{$key}{"n_all_genes"}."\t".$ROH_blocks{$key}{"n_Ninteract_genes"}."\n";
		}
		else{
			$n_ROH_blocks_with_other_genez+=1;
			$length_sum_ROH_blocks_with_other_genez+=$ROH_blocks{$key}{"length"};
			print OUTFILE2 $temp1[0]."\t".$temp1[1]."\t".$ROH_blocks{$key}{"length"}."\t1\t0\t".$ROH_blocks{$key}{"n_all_genes"}."\t".$ROH_blocks{$key}{"n_Ninteract_genes"}."\n";
		}	
	}
	else{
		$n_ROH_blocks_without_genez+=1;
		$length_sum_ROH_blocks_without_genez+=$ROH_blocks{$key}{"length"};
		print OUTFILE2 $temp1[0]."\t".$temp1[1]."\t".$ROH_blocks{$key}{"length"}."\t0\t0\t".$ROH_blocks{$key}{"n_all_genes"}."\t".$ROH_blocks{$key}{"n_Ninteract_genes"}."\n";
	}	
}	

close OUTFILE2;




