## python script to identify U and V-linked orthologous genes
## in C. purpureus and prune at the closest P. patens outgroup
## script will also remove other GG1 and R40 isoforms
## and randomly select one isoform/homolog to keep for each other species 

## Written by Sarah B. Carey

## to run this script on multiple tree files, use the bash loop:
## for f in *.tre; do python physco_outgroup.py $f; done > physco_outgroup.txt


from ete3 import Tree
import random
import sys,os

## leaf-name prefixes that identify each group of sequences in a gene tree
PHYSCO_PREFIX = "Physco"
GG1_PREFIX = "Ceratodon_GG1"
R40_PREFIX = "Ceratodon_R40"
EXTRA_CERATODON_PREFIX = "Ceratodon_purpureus"


def pruned_tree_path(tree_path):
    """Return the output path for the pruned version of *tree_path*.

    The output is written next to the input. Its name is everything before
    the first "." of the input's file name, followed by "_pruned.tre".
    Only the file name is split, so dots in the directory part of the path
    (e.g. "./OG1.tre" or "trees.v2/OG1.tre") no longer truncate the stem.
    """
    directory, base_name = os.path.split(tree_path)
    return os.path.join(directory, base_name.split(".")[0] + "_pruned.tre")


def species_key(leaf_name):
    """Return the first two "_"-separated fields of *leaf_name*.

    These two fields are what the script treats as the species (or, for
    Ceratodon, the isolate) a sequence belongs to. A name with fewer than
    two fields is returned unchanged.
    """
    return "_".join(leaf_name.split("_")[:2])


def closest_outgroup(tree, outgroup_leaves, focal_leaves):
    """Return the outgroup leaf with the shortest distance to a focal leaf.

    For every leaf in *outgroup_leaves* the distance to each leaf in
    *focal_leaves* is measured with ``tree.get_distance(a, b)`` and the
    smallest of those is kept; the outgroup leaf with the overall smallest
    value wins (the first one on ties). Distances are always requested
    pairwise: ``get_distance`` accepts only two targets, so unpacking
    several focal leaves into a single call would feed the extra names into
    its ``topology_only`` argument or raise a TypeError.

    Raises ValueError if either list is empty.
    """
    def shortest_distance(outgroup_leaf):
        return min(tree.get_distance(outgroup_leaf, focal_leaf)
                   for focal_leaf in focal_leaves)

    return min(outgroup_leaves, key=shortest_distance)


def pick_one_per_species(leaf_names):
    """Randomly keep one leaf per species, dropping extra Ceratodon copies.

    Leaves starting with "Ceratodon_purpureus" are discarded. The remaining
    leaves are grouped by ``species_key`` (exact match, so one species name
    being a prefix of another cannot mix their sequences) and one leaf per
    group is drawn with ``random.choice``. Groups are visited in order of
    first appearance in *leaf_names*.
    """
    copies_by_species = {}
    for leaf in leaf_names:
        if leaf.startswith(EXTRA_CERATODON_PREFIX):
            continue
        copies_by_species.setdefault(species_key(leaf), []).append(leaf)
    return [random.choice(copies) for copies in copies_by_species.values()]


def main(argv=None):
    """Prune one gene tree and write it to "<stem>_pruned.tre".

    *argv* is the list of command-line arguments without the program name;
    it defaults to ``sys.argv[1:]``. Only the first argument (the tree
    file) is used. Exits with a message on stderr when no tree file is
    given or when the tree has no Physco or no GG1 leaf.
    """
    args = sys.argv[1:] if argv is None else argv
    if not args:
        sys.exit("usage: python physco_outgroup.py <tree file>")
    tree_path = args[0]

    t = Tree(tree_path)

    leaf_names = t.get_leaf_names()
    physco_genes = [leaf for leaf in leaf_names if leaf.startswith(PHYSCO_PREFIX)]
    gg1 = [leaf for leaf in leaf_names if leaf.startswith(GG1_PREFIX)]
    r40 = [leaf for leaf in leaf_names if leaf.startswith(R40_PREFIX)]

    ## both groups are needed to measure a Physco-to-GG1 distance
    if not physco_genes or not gg1:
        sys.exit(tree_path + ": tree needs at least one Physco leaf and one "
                 "Ceratodon_GG1 leaf")

    ## get ancestor node for gg1, r40, and closest physco
    physco_short_bl = closest_outgroup(t, physco_genes, gg1)
    ancestor_node = t.get_common_ancestor(physco_short_bl, *gg1, *r40)

    ## prune tree down to the leaves below that ancestor
    t.prune(ancestor_node.get_leaves(), preserve_branch_length=True)

    ## remove extra Ceratodons (leave GG1 and R40) and remove duplicates of
    ## each species by random selection of which copy to keep
    duplicates_random = pick_one_per_species(t.get_leaf_names())

    ## prune tree
    t.prune(duplicates_random, preserve_branch_length=True)

    t.write(outfile=pruned_tree_path(tree_path), format=1)


if __name__ == "__main__":
    main()


	