## python script to identify U and V-linked orthologous genes
## in C. purpureus and the closest P. patens outgroup

## Written by Sarah B. Carey

## to run this on multiple tree files, use the bash command:
## for f in *.tre; do python physco_outgroup.py $f; done > physco_outgroup.txt


from ete3 import Tree
import random
import sys,os

## leaf-name prefixes used to recognise each group of genes in the tree
PHYSCO_PREFIX = "Physco"
GG1_PREFIX = "Ceratodon_purpureus_v1pt1_CepurGG1.UG"
R40_PREFIX = "Ceratodon_purpureus_v1pt1_CepurR40.VG"


def leaves_with_prefix(leaf_names, prefix):
    """Return the names in *leaf_names* that start with *prefix*, order kept."""
    return [leaf for leaf in leaf_names if leaf.startswith(prefix)]


def closest_leaf(candidates, references, distance):
    """Return the candidate whose nearest reference leaf is closest.

    candidates -- leaf names to choose from (must not be empty)
    references -- leaf names to measure against
    distance   -- callable; called as distance(candidate, reference) for each
                  pair, or as distance(candidate) when *references* is empty

    Each pair is measured with its own call.  The earlier version unpacked
    every reference into one get_distance() call, which only measured a
    branch length when there was exactly one reference.  With a single
    reference the result is the same as before; with none, the one-argument
    form of *distance* is used, as the earlier version did.

    Ties go to the first candidate.  Raises ValueError if *candidates* is
    empty.
    """
    def nearest(candidate):
        if not references:
            return distance(candidate)
        return min(distance(candidate, ref) for ref in references)

    return min(candidates, key=nearest)


def main(argv):
    """Print the GG1, R40 and outgroup Physco leaf names for one tree file.

    argv -- command-line arguments; argv[1] is the tree file handed to Tree().

    Exits with a message on stderr when the tree-file argument is missing or
    when the tree has no leaf starting with "Physco".
    """
    if len(argv) < 2:
        sys.exit("usage: python " + argv[0] + " <tree file>")

    tree_path = argv[1]
    t = Tree(tree_path)

    leaf_names = t.get_leaf_names()
    physco_genes = leaves_with_prefix(leaf_names, PHYSCO_PREFIX)
    gg1 = leaves_with_prefix(leaf_names, GG1_PREFIX)
    r40 = leaves_with_prefix(leaf_names, R40_PREFIX)

    if not physco_genes:
        ## without a Physco leaf there is nothing to choose an outgroup from
        sys.exit(tree_path + ": no leaf name starts with '" + PHYSCO_PREFIX + "'")

    ## get ancestor node for gg1, r40, and closest physco
    physco_short_bl = closest_leaf(physco_genes, gg1, t.get_distance)
    ancestor_node = t.get_common_ancestor(physco_short_bl, *gg1, *r40)

    ## prune tree
    ## NOTE(review): prune() is handed the ancestor node itself; presumably
    ## ete3 expands it to the leaves below it -- confirm against the ete3 docs
    t.prune(ancestor_node, preserve_branch_length=True)

    physco_outgroup = leaves_with_prefix(t.get_leaf_names(), PHYSCO_PREFIX)

    ## a single object is printed, so the sep='\t' passed previously had no
    ## effect; the output stays the repr of a list of three lists
    print([gg1, r40, physco_outgroup])


if __name__ == "__main__":
    main(sys.argv)