#Script by KM Dlugosch, April 2020 #Associated with paper: "TagSeq for gene expression in non-model plants: a pilot study at the Santa Rita Experimental Range NEON core site" by Hannah E. Marx; Stephen Scheidt; Michael S. Barker; Katrina M. Dlugosch. 2020 bioRXiv. #This script takes a set of input htseq output files and makes a tab delim table with header of sample names and first col of row names #Counts are in each cell #Input file list file requires lists of Htseq files to combine on each line, with lines tab delimited of the following form: #<NameForOutputFile>[TAB]<firstHtseqFile>[TAB]<NextHtseqFile>[TAB]<...etc...> # run perl combine_HtSeq.pl <inputFileList> ################################### use strict; use warnings; my $inList = "$ARGV[0]"; my @sets = (); open LIST, "<$inList"; while (<LIST>) { chomp $_; push @sets, $_; } close LIST; #setup my @files = (); my %samples = (); my $header = "RefLocus"; my $firstSet = 'Y'; foreach my $set (@sets) { #sets are groups of files to be summed #prep my %lines = (); #get the set of files my @files = split /\t/, $set; #TODO check that tab delim my $outName = shift @files; $header = "$header\t$outName"; #work through the set foreach my $infile (@files){ my $ending = 'N'; #open the file open FILE, "<$infile"; while (<FILE>) { chomp $_; if ($_ =~ m/^no_feature/) { $ending = 'Y'; }#start of the final info lines unless ($ending eq 'Y') { #unless we are at the end material my @parts = split /\t/, $_; #split each line to get the locus and value if ($lines{$parts[0]}) { $lines{$parts[0]} = $lines{$parts[0]} + $parts[1]; } #if there is an entry for the locus already, add the new numbers else {$lines{$parts[0]} = $parts[1];} #else enter the first number } } #close the file close FILE; } #store entry for set in master hash of samples foreach my $locus (keys %lines) { if ($firstSet eq 'N') { $samples{$locus} = "$samples{$locus}\t$lines{$locus}"; } else { $samples{$locus} = $lines{$locus}; } } %lines = (); #clear within-set hash $firstSet = 'N'; #set to N for all after first set } #print outfile open OUT, ">$inList.combinedAllLoci"; print OUT "$header\n"; foreach my $gene (keys %samples) { print OUT "$gene\t$samples{$gene}\n"; } close OUT;