#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Jun  6 13:57:45 2025

@author: brown

This script merges two distance values (the phylogenetic distance from the tree built with IQ-TREE and the Kimura distance from the sequence alignment) for each pair of genomes
and determines the MLST relationship between the two genomes of each pair.
"""

import numpy as np
import pandas as pd
from scipy.spatial.distance import squareform
import os
import re





def parse_iqtree_dist_matrix(filename, id_file):
    """Read a whitespace-separated square distance matrix into a nested dict.

    Row ``i`` of the matrix is paired with the ``i``-th ID listed in
    *id_file*. The label in the first column of each row is ignored, so both
    files must list the genomes in the same order.

    Blank lines in either file are skipped. A leading PHYLIP-style header
    (a first line holding only the number of taxa) is dropped when present.

    Args:
        filename: Path to the distance matrix. Each row is a label followed
            by one distance per genome.
        id_file: Path to a text file with one genome ID per line.

    Returns:
        ``{id1: {id2: distance}}`` for every pair of distinct positions in
        the ID list (the diagonal is left out).

    Raises:
        IndexError: If the matrix has fewer rows, or a row has fewer values,
            than the ID list requires.
        ValueError: If a distance cannot be converted to ``float``.
    """
    with open(filename) as f:
        rows = [line.split() for line in f if line.strip()]
    with open(id_file) as f:
        # Skip blank lines so a trailing newline does not become an empty ID.
        strains = [line.strip() for line in f if line.strip()]

    # A raw PHYLIP matrix starts with a line holding only the taxon count.
    # It is not a data row, so drop it to keep rows aligned with the IDs.
    # Only done when there is a surplus row, so header-less files are
    # parsed exactly as before.
    if (
        len(rows) > len(strains)
        and len(rows[0]) == 1
        and rows[0][0].isdigit()
    ):
        rows = rows[1:]

    dicc = {}
    for i, id1 in enumerate(strains):
        # The first field of a row is its label; the rest are the distances.
        values = [float(v) for v in rows[i][1:]]
        dicc[id1] = {
            id2: values[j] for j, id2 in enumerate(strains) if j != i
        }
    return dicc
                
def parse_square_matrix_to_dict(matriz_path, ids_path):
    """Read a tab-separated square distance matrix into a nested dict.

    The first line of the matrix file is treated as a header and skipped.
    Every following non-blank line is ``label<TAB>d1<TAB>d2...``. When the
    label contains a parenthesised token (e.g. ``name (ID)``), that token is
    used as the genome ID; otherwise the whole label is used. Column ``j``
    is named after the ``j``-th ID listed in *ids_path*.

    Args:
        matriz_path: Path to the tab-separated matrix file.
        ids_path: Path to a text file with one genome ID per line, in column
            order. Blank lines are ignored.

    Returns:
        ``{id1: {id2: distance}}``. The distance of a genome to itself is
        left out.

    Raises:
        IndexError: If a row holds more values than there are IDs.
        ValueError: If a distance cannot be converted to ``float``.
    """
    with open(matriz_path) as f:
        lines = f.readlines()
    with open(ids_path) as f:
        strains = [line.strip() for line in f if line.strip()]

    dicc = {}
    for line in lines[1:]:  # lines[0] is the header row
        stripped = line.strip()
        if not stripped:
            # A blank line carries no row; skipping it avoids overwriting
            # the previous genome's entry with an empty dict.
            continue
        parts = stripped.split('\t')
        label = parts[0]
        match = re.search(r"\(([^)]+)\)", label)
        # Fall back to the label itself so a row without parentheses never
        # reuses the ID extracted from an earlier row.
        id_base = match.group(1) if match else label.strip()
        values = [float(v) for v in parts[1:]]
        dicc[id_base] = {
            strains[j]: value
            for j, value in enumerate(values)
            # Compare with the extracted ID (not the raw label) so the
            # self-distance is actually dropped.
            if strains[j] != id_base
        }
    return dicc


def load_st_mapping(filepath):
    """Load a ``path,ST`` table into a ``{genome_id: ST}`` dict.

    Each non-blank line must hold exactly two comma-separated fields: the
    path of a genome file and its sequence type. The genome ID is the file
    name of that path without its directory and without a trailing ``.fna``
    extension. Whitespace around either field is ignored.

    Args:
        filepath: Path to the comma-separated mapping file.

    Returns:
        Dict mapping genome ID to its ST, kept as a string.

    Raises:
        ValueError: If a non-blank line does not have exactly two fields.
    """
    st_dict = {}
    with open(filepath) as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            fields = line.split(",")
            if len(fields) != 2:
                raise ValueError(
                    f"{filepath}, line {line_no}: expected 'path,ST' "
                    f"but got {line!r}"
                )
            path, st = (field.strip() for field in fields)
            # Keep only the genome ID: no directory and no extension.
            genome_id = os.path.basename(path)
            # Strip ".fna" only as a suffix; str.replace would also remove
            # it from the middle of a name.
            if genome_id.endswith(".fna"):
                genome_id = genome_id[:-len(".fna")]
            st_dict[genome_id] = st
    return st_dict

def main():
    """Write a CSV comparing two distance measures for every genome pair.

    Steps:
      1. Parse the IQ-TREE distance matrix and the EMBOSS (Kimura) square
         matrix into nested dicts.
      2. Load the genome -> ST mapping.
      3. For every unordered pair of genome IDs present as keys in both
         distance dicts, record both distances and whether the two STs are
         the same, different, or unknown (both missing / "-").
      4. Save the table to ``distances_comparison_SspBCDE.csv``.
    """
    # Phylogenetic distances (IQ-TREE)
    phylo_by_id = parse_iqtree_dist_matrix(
        "./phylo_iqtree.dist", "./strains_phylo_iqtree.txt"
    )

    # Kimura distances (EMBOSS)
    kimura_by_id = parse_square_matrix_to_dict(
        "./SspBCDE_kimura_square.dist", "./strains_SspBCDE.id"
    )

    # Sequence types per genome
    st_by_genome = load_st_mapping("aso_ST.id")

    # Only genomes that have a row in both matrices can be compared.
    shared_ids = sorted(set(phylo_by_id) & set(kimura_by_id))

    rows = []
    for pos, first in enumerate(shared_ids):
        # Slice past the current position so each pair is visited once.
        for second in shared_ids[pos + 1:]:
            phylo_dist = phylo_by_id[first][second]
            kimura_dist = kimura_by_id[first][second]

            # A genome missing from the mapping is treated like ST "-".
            st_first = st_by_genome.get(first, "-")
            st_second = st_by_genome.get(second, "-")
            if st_first != st_second:
                verdict = "different"
            elif st_first == "-":
                # Equal and "-": neither genome has an ST.
                verdict = "unknown"
            else:
                verdict = "same"

            rows.append([f"{first}-{second}", phylo_dist, kimura_dist, verdict])

    table = pd.DataFrame(
        rows, columns=["pair", "dist_iqtree", "dist_emboss", "ST_relation"]
    )

    # Persist the comparison table.
    table.to_csv("distances_comparison_SspBCDE.csv", index=False)

# Run the comparison only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()