#Script by KM Dlugosch, April 2020
#Associated with paper: "TagSeq for gene expression in non-model plants: a pilot study at the Santa Rita Experimental Range NEON core site" by Hannah E. Marx; Stephen Scheidt; Michael S. Barker; Katrina M. Dlugosch. 2020 bioRXiv.

#This script takes a set of input htseq output files and makes a tab delim table with header of sample names and first col of row names
#Counts are in each cell
#Input file list file requires lists of Htseq files to combine on each line, with lines tab delimited of the following form:
#<NameForOutputFile>[TAB]<firstHtseqFile>[TAB]<NextHtseqFile>[TAB]<...etc...>

# run perl combine_HtSeq.pl <inputFileList>

###################################
use strict;
use warnings;


my $inList = "$ARGV[0]";
my @sets = ();
open LIST, "<$inList";
	while (<LIST>) {
		chomp $_;
		push @sets, $_;
	}
close LIST;

#setup
my @files = ();
my %samples = ();
my $header = "RefLocus";
my $firstSet = 'Y';

foreach my $set (@sets) {  #sets are groups of files to be summed

	#prep
	my %lines = ();

	#get the set of files
	my @files = split /\t/, $set; #TODO check that tab delim

	my $outName = shift @files;
	$header = "$header\t$outName";

	#work through the set
	foreach my $infile (@files){
		my $ending = 'N';
		
		#open the file
		open FILE, "<$infile";
		while (<FILE>) {
			chomp $_;
			if ($_ =~ m/^no_feature/) { $ending = 'Y'; }#start of the final info lines
			unless ($ending eq 'Y') { #unless we are at the end material
				my @parts = split /\t/, $_; #split each line to get the locus and value
				if ($lines{$parts[0]}) { $lines{$parts[0]} = $lines{$parts[0]} + $parts[1]; } #if there is an entry for the locus already, add the new numbers
				else {$lines{$parts[0]} = $parts[1];} #else enter the first number
			}
		}
		#close the file
		close FILE;
	}

	#store entry for set in master hash of samples
	foreach my $locus (keys %lines) {
		if ($firstSet eq 'N') { $samples{$locus} = "$samples{$locus}\t$lines{$locus}"; }
		else { $samples{$locus} = $lines{$locus}; }
	}
	%lines = (); #clear within-set hash
	$firstSet = 'N'; #set to N for all after first set
}

#print outfile
open OUT, ">$inList.combinedAllLoci";
print OUT "$header\n";
foreach my $gene (keys %samples) {
	print OUT "$gene\t$samples{$gene}\n";
}
close OUT;