#!/usr/bin/perl -w

## script to root gene trees using outgroup taxa
## and keep the longest isoform within a clade for a sample
## requires newick-utils and the correct path to it in line 129

## written by J. Gordon Burleigh 

@filearray = glob("cluster*.fa");
for $file(@filearray)
{
    if ($file =~ m/cluster(\d+).fa/)
    {
	$filenum = $1;

#defining a hash with the outgroup taxa
	%oghash = ("Azolla" => 0, "Salvinia" => 0, "Selaginella" => 0, "Takakia" => 0, "Sphagnum" => 0);

	$fastafile = "cluster$filenum.fa";
#the tree file needs to end with .tre
	$treefile = "cluster$filenum.tre";

#opening the fasta file
	open FHfa, "<$fastafile";

#defining a hash that will have the number of nucleotides in all sequences
	%lenhash = ();

#defining an array that will include all outgroup sequences
	@ogarray = ();

	%taxhash = ();

	while (<FHfa>)
	{
    #if the line has the sequence name, find out if it's an outgroup sequence 
	    if (/^>((([^\_]+)\_[^\_]+)\_\S*)/)
	    {
		$fullname = $1;
		$taxname = $2;
		$genname = $3;
	
		if (! exists $taxhash{$taxname})
		{
		    $taxhash{$taxname} = 1;
		}

		if (exists $oghash{$genname})
		{
		    $oghash{$genname}++;
		    push @ogarray, $fullname;
		}
	    }

    #if it's the line with the sequence - count how many nucleotides are in the sequence and add the number to the hash
	    elsif (/^(\S+)/)
	    {
		$seq = $1;
		$seq =~ s/[^ACGTacgt]//g;
		$lenhash{$fullname} = length $seq;
	    }
	}
	close FHfa;

#this part will root the tree (if there is at least 1 outgroup sequence)
	if ((scalar @ogarray) > 0)
	{
	    $ogs = "";
	    if ($oghash{Azolla} > 0)
	    {
		for $og(@ogarray)
		{
		    if ($og =~ m/Azolla\_\S+/)
		    {
			$ogs = $ogs . "$og ";
		    }
		}
	    }

	    elsif ($oghash{Salvinia} > 0)
	    {   
		for $og(@ogarray)
		{
		    if ($og =~ m/Salvinia\_\S+/)
		    {   
			$ogs = $ogs . "$og ";
		    }   
		}
	    }

	    elsif ($oghash{Selaginella} > 0)
	    {   
		for $og(@ogarray)
		{
		    if ($og =~ m/Selaginella\_\S+/)
		    {   
			$ogs = $ogs . "$og ";
		    }   
		}
	    }
	    
	    elsif ($oghash{Takakia} > 0)
	    {   
		for $og(@ogarray)
		{
		    if ($og =~ m/Takakia\_\S+/)
		    {   
			$ogs = $ogs . "$og ";
		    }   
		}
	    }

	    elsif ($oghash{Sphagnum} > 0)
	    {   
		for $og(@ogarray)
		{
		    if ($og =~ m/Sphagnum\_\S+/)
		    {   
			$ogs = $ogs . "$og ";
		    }   
		}
	    }

#    print "OG = $ogs\n";
	    if ($treefile =~ m/(\S+).tre/)
	    {
		$treename = $1;
	#below - make sure you have the right path to the newick utlities application
		system "/Users/scarey/Downloads/newick-utils-1.6/src/nw_reroot $treefile $ogs >$treename.Root.tre";
	    }    
	}

	open FHtree, "<$treename.Root.tre";
	open OUT, ">$treename.Root.ED.tre";
	while (<FHtree>)
	{
	    if (/^(\(\S+;)/)
	    {
		$tree = $1;
	#I just got rid of the branch lengths with e- in them- this is a dumb, not accurate fix, but I just added a bunch of 0's to the decimal
		$tree =~ s/(\d+)\.(\d+)e-\d+/0\.000000$1$2/g;
       
#	print "$tree\n";

		for (1..100)
		{
		    for $tax(keys %taxhash)
		    {
#			print "$tax\n";
			if ($tree =~ m/\(($tax\_[^\:]+)\:([0-9\.e\-]+),($tax\_[^\:]+)\:([0-9\.e\-]+)\)\d*\:([0-9\.e\-]+)/)
			{
#			    print "ok\n";
			    $seq1 = $1;
			    $bl1 = $2;
			    $seq2 = $3;
			    $bl2 = $4;
			    $bl3 = $5;
			    if ($lenhash{$seq1} > $lenhash{$seq2})
			    {
#				print "OK1\n";
				$newbl = $bl1 + $bl3;
				$tree =~ s/\($seq1\:$bl1,$seq2\:$bl2\)\d*\:$bl3/$seq1\:$newbl/;
			    }
			    else
			    {
#				print "OK2\n";
				$newbl = $bl2 + $bl3;
				$tree =~ s/\($seq1\:$bl1,$seq2\:$bl2\)\d*\:$bl3/$seq2\:$newbl/;
			    }
			}
		    }
		}
		print OUT "$tree\n";
	    }
	}
    }
}


		
	    
