#!/usr/bin/perl -w

## Clunky but ultimately functional script for getting fasta files that have the 
## matching sequences from the gene trees pruned for PAML

## Elegant parts of the script written by Adam C. Payton
## Clunky parts added by Sarah B. Carey


## Usage: perl <script_name>
##
## Run from the directory that holds the cluster files.  For every
## cluster<N>_pruned.tre found there, the script reads cluster<N>.fa and
## writes cluster<N>.out.fa containing only the sequences whose names are
## tips of the pruned tree.
##
## The tip names are taken from cluster<N>.list.txt (one name per line).
## When that file does not exist yet it is generated from the tree first,
## so one run is enough and nothing has to be commented in or out between
## runs.  An existing list file is always used as-is; delete it to have it
## rebuilt from the tree.

use strict;
use warnings;

# File-name suffixes for the nucleotide files.  For the amino-acid files use
# '.fa.aln.filtered.fa' as FASTA_SUFFIX and '.aa.out.fa' as OUTPUT_SUFFIX.
use constant {
    FASTA_SUFFIX  => '.fa',
    OUTPUT_SUFFIX => '.out.fa',
};

# Read a FASTA file and return a reference to a hash mapping each sequence
# ID to its concatenated sequence.  The ID is the first run of
# non-whitespace characters after '>'.  Dies if the file cannot be opened.
sub read_fasta {
    my ($fasta_file) = @_;

    open my $in, '<', $fasta_file
        or die "Fasta file not working: $fasta_file: $!\n";

    my %sequence_of;
    my $current_id;
    while (my $line = <$in>) {
        if ($line =~ /^>(\S+)/) {
            # Match on the ID alone.  Requiring a description after the ID
            # made bare headers such as ">seq1" fall through to the sequence
            # branch, gluing ">seq1" and its residues onto the previous
            # record.
            $current_id = $1;
            $sequence_of{$current_id} = '';
        }
        elsif ($line =~ /^>/) {
            # Header without a usable ID: drop its residues rather than
            # appending them to the preceding record.
            undef $current_id;
        }
        elsif (defined $current_id && $line =~ /^(\S+)/) {
            $sequence_of{$current_id} .= $1;
        }
    }
    close $in;

    return \%sequence_of;
}

# Return the tip names found in one line of Newick text.  Numeric branch
# lengths (integer, decimal or exponent notation) and parentheses are
# removed; what remains between ',' and ';' separators is a name.
# NOTE(review): internal node labels / support values are not handled and
# would end up attached to the preceding name, as in the original clean-up.
sub tree_tip_names {
    my ($newick) = @_;

    $newick =~ s/:[-+]?\d+(?:\.\d+)?(?:[eE][-+]?\d+)?//g;
    $newick =~ tr/()//d;

    my @names;
    for my $name (split /[,;]/, $newick) {
        $name =~ s/^\s+//;
        $name =~ s/\s+$//;
        push @names, $name if length $name;
    }
    return @names;
}

# Collect the tip names from every line of a tree file.  Dies if the file
# cannot be opened.
sub names_from_tree {
    my ($tree_file) = @_;

    open my $in, '<', $tree_file
        or die "Tree not working: $tree_file: $!\n";

    my @names;
    while (my $line = <$in>) {
        push @names, tree_tip_names($line);
    }
    close $in;

    return @names;
}

# Read a list file (one name per line) and return the names.  Surrounding
# whitespace, including a trailing carriage return, is stripped and blank
# lines are skipped.  Dies if the file cannot be opened.
sub read_name_list {
    my ($list_file) = @_;

    open my $in, '<', $list_file
        or die "List not working: $list_file: $!\n";

    my @names;
    while (my $line = <$in>) {
        $line =~ s/^\s+//;
        $line =~ s/\s+$//;
        push @names, $line if length $line;
    }
    close $in;

    return @names;
}

# Write the given names to a list file, one per line.  The handle is closed
# before returning so the file is complete when it is read back later.
sub write_name_list {
    my ($list_file, @names) = @_;

    open my $out, '>', $list_file
        or die "Cannot write $list_file: $!\n";
    print {$out} "$_\n" for @names;
    close $out
        or die "Cannot finish writing $list_file: $!\n";

    return;
}

# Process one cluster: load its FASTA file, obtain the wanted names (from
# the list file if present, otherwise from the pruned tree, saving the list
# for later runs), and write the matching sequences to the output FASTA.
# Prints a warning for every name missing from the FASTA file and a summary
# line, and returns the number of sequences written.
sub process_cluster {
    my ($cluster_id) = @_;

    my $fasta_file  = 'cluster' . $cluster_id . FASTA_SUFFIX;
    my $output_file = 'cluster' . $cluster_id . OUTPUT_SUFFIX;
    my $tree_file   = 'cluster' . $cluster_id . '_pruned.tre';
    my $list_file   = 'cluster' . $cluster_id . '.list.txt';

    my $sequence_of = read_fasta($fasta_file);

    my @wanted_names;
    if (-e $list_file) {
        @wanted_names = read_name_list($list_file);
    }
    else {
        @wanted_names = names_from_tree($tree_file);
        write_name_list($list_file, @wanted_names);
    }

    open my $out, '>', $output_file
        or die "Cannot write $output_file: $!\n";

    my $found_count = 0;
    for my $name (@wanted_names) {
        if (exists $sequence_of->{$name}) {
            print {$out} ">$name\n$sequence_of->{$name}\n";
            $found_count++;
        }
        else {
            print "WARNING: $name was not found in Hash\n";
        }
    }

    # Buffered write errors only surface at close, so check it.
    close $out
        or die "Cannot finish writing $output_file: $!\n";

    print "\n\n$found_count\t sequences in $fasta_file\n\n";

    return $found_count;
}

for my $tree_file (glob 'cluster*_pruned.tre') {
    # Anchored so that only cluster<digits>_pruned.tre is accepted and the
    # cluster number always refers to the file that was actually globbed.
    next unless $tree_file =~ /\Acluster(\d+)_pruned\.tre\z/;
    process_cluster($1);
}

