# Script to build a presence/absence matrix using prophage clusters
import pandas as pd
import numpy as np
import sys
import re

# Input/output configuration
# Each dataset pairs a clustering table with the matrix file produced from it.
DATASETS = {
    "1prcst": (
        "/home/user/cluster_ab_phigaro_90_ale_2step_def_1prcst_cl.txt",
        "/home/user/matrix_90_ab_1prcst_cl.tsv",
    ),
    "freqmlst8": (
        "/home/user/cluster_ab_phigaro_90_ale_2step_def_freqmlst8_cl.txt",
        "/home/user/matrix_90_ab_freqmlst8.tsv",
    ),
}
ACTIVE_DATASET = "1prcst"      # Switch to "freqmlst8" to process the other clustering
strains_path = "/home/user/strains_nored100.ab"

colnames = ['Clusters', 'Prophages', 'Identity', 'Types']      # Column names
STRAIN_ID_RE = re.compile(r"ab[0-9]{5}")      # Genome ID embedded in each prophage name


def read_strains(path):
    """Return the strain IDs listed one per line in *path*.

    Blank lines (including the one produced by a trailing newline) are
    skipped so they do not become an empty row label in the matrix.
    """
    with open(path, 'r') as f:
        return [line.strip() for line in f if line.strip()]


def extract_strain_id(prophage):
    """Return the genome ID (``ab`` followed by five digits) found in *prophage*.

    Raises:
        ValueError: if *prophage* does not contain a genome ID.
    """
    match = STRAIN_ID_RE.search(str(prophage))
    if match is None:
        raise ValueError(f"No genome ID (abNNNNN) found in prophage name: {prophage!r}")
    return match.group()


def build_presence_matrix(clustering, strains):
    """Build the strain x cluster presence/absence matrix.

    Args:
        clustering: DataFrame with at least the columns ``Clusters`` and
            ``Prophages`` (one row per prophage).
        strains: iterable of strain IDs used as the rows of the matrix.

    Returns:
        An integer DataFrame sorted by row label, with one column per cluster
        that holds more than one prophage (column labels are the cluster IDs
        as strings, in order of first appearance). A cell is 1 when the strain
        carries a prophage of that cluster and 0 otherwise.

    Raises:
        ValueError: if a prophage name of a retained cluster has no genome ID.
    """
    # Filter out all the single prophage clusters
    cluster_sizes = clustering['Clusters'].value_counts()
    shared = clustering[clustering['Clusters'].map(cluster_sizes) > 1]

    columns = list(dict.fromkeys(str(cluster) for cluster in shared['Clusters']))
    genome_ids = [extract_strain_id(prophage) for prophage in shared['Prophages']]

    # Genomes that occur in the clustering but not in the strain list still get
    # a row, matching what label-based assignment on a missing row would do.
    strains = list(strains)
    listed = set(strains)
    unlisted = [genome for genome in dict.fromkeys(genome_ids) if genome not in listed]

    matrix = pd.DataFrame(0, index=strains + unlisted, columns=columns, dtype=int)
    for genome, cluster in zip(genome_ids, shared['Clusters']):
        matrix.loc[genome, str(cluster)] = 1      # Indicate presence by a 1 in the matrix

    return matrix.sort_index()


def drop_rare_clusters(matrix, min_fraction=0.01):
    """Return *matrix* without the clusters present in fewer than
    ``min_fraction`` of the rows (strains)."""
    threshold = len(matrix.index) * min_fraction
    return matrix.loc[:, matrix.sum() >= threshold]


def sort_clusters_by_total(matrix):
    """Return *matrix* with its columns ordered by ascending number of
    strains carrying each cluster."""
    return matrix[matrix.sum().sort_values().index]


def main():
    """Read the active dataset, build the matrix and write it as TSV."""
    path, output_path = DATASETS[ACTIVE_DATASET]

    clustering = pd.read_csv(path, sep='\t', index_col=False, names=colnames)
    strains = read_strains(strains_path)

    sort_df = build_presence_matrix(clustering, strains)

    # Optional post-processing, disabled by default:
    # sort_df = drop_rare_clusters(sort_df, min_fraction=0.01)   # Filter clusters by frequency
    # sort_df = sort_clusters_by_total(sort_df)                  # Order clusters by total presence

    # Save output file
    sort_df.to_csv(output_path, sep='\t')


if __name__ == "__main__":
    main()
