import os

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
# Default file locations, used when the caller does not pass explicit paths.
_DEFAULT_STREAM_A_CSV = r"C:\Users\141798\Desktop\Voynich_Base\Stream_A_Dense_56D.csv"
_DEFAULT_STREAM_B_CSV = r"C:\Users\141798\Desktop\Voynich_Base\Stream_B_Dense_56D.csv"
_DEFAULT_ANCHORS_CSV = r"C:\Users\141798\Desktop\Voynich_Base\Voynich_Absolute_Anchors_Strict.csv"
_DEFAULT_OUTPUT_CSV = r"C:\Users\141798\Desktop\Voynich_Base\Voynich_Absolute_Anchors_Phase4.csv"


def _load_stream(csv_path):
    """Read a stream CSV and return ``(labels, matrix)``.

    The first column is taken as the row labels (converted to ``str``); every
    remaining column becomes one column of the returned matrix.
    """
    frame = pd.read_csv(csv_path)
    labels = frame.iloc[:, 0].astype(str).to_numpy()
    matrix = frame.iloc[:, 1:].to_numpy()
    return labels, matrix


def _row_value(row, primary, legacy):
    """Return ``row[primary]``, falling back to the older column name ``legacy``.

    Raises:
        KeyError: if the row has neither column. Failing loudly here avoids
            silently recording the string ``"None"`` as an anchor label.
    """
    if primary in row:
        return row[primary]
    if legacy in row:
        return row[legacy]
    raise KeyError(
        f"anchor CSV has neither a {primary!r} nor a {legacy!r} column"
    )


def _load_initial_anchors(anchors_csv):
    """Read the initial anchor CSV into a list of cycle-0 anchor records."""
    frame = pd.read_csv(anchors_csv)
    anchors = []
    for _, row in frame.iterrows():
        anchors.append({
            'Loop_Cycle': 0,
            # Column names changed over time; accept both generations.
            'Target_Symbol': str(_row_value(row, 'Target_Symbol', 'EVA_Symbol')),
            'Keyword': str(_row_value(row, 'Keyword', 'Latin_Word')),
            'Cos_Sim': float(_row_value(row, 'Cos_Sim', 'Similarity')),
            'Gap': float(row['Gap']),
            'Z_Score': float(row['Z_Score']),
        })
    return anchors


def _drop_elements(labels, matrix, drop_set):
    """Return ``labels`` and ``matrix`` without the rows whose label is in ``drop_set``."""
    keep = np.fromiter(
        (lbl not in drop_set for lbl in labels), dtype=bool, count=len(labels)
    )
    return labels[keep], matrix[keep]


def _find_new_anchors(sim_matrix, labels_a, labels_b, loop_cycle, z_threshold):
    """Return anchor records for mutual nearest neighbours with an outlying gap.

    A pair (a, b) qualifies when b is a's best match, a is b's best match, and
    the z-score of a's gap (best similarity minus second-best similarity,
    standardised over all rows of this cycle) is at least ``z_threshold``.
    """
    n_rows, n_cols = sim_matrix.shape
    best_b_for_a = np.argmax(sim_matrix, axis=1)
    best_a_for_b = np.argmax(sim_matrix, axis=0)

    if n_cols > 1:
        # Only the two largest values per row are needed, so a partial sort is enough.
        top_two = np.partition(sim_matrix, -2, axis=1)[:, -2:]
        gap_a = top_two[:, 1] - top_two[:, 0]
    else:
        gap_a = np.zeros(n_rows)

    mean_gap = np.mean(gap_a)
    std_gap = np.std(gap_a)
    # With no spread in the gaps every z-score is defined as 0 (nothing qualifies).
    z_scores = (gap_a - mean_gap) / std_gap if std_gap > 0 else np.zeros(n_rows)

    is_mutual = best_a_for_b[best_b_for_a] == np.arange(n_rows)
    hits = np.flatnonzero(is_mutual & (z_scores >= z_threshold))

    return [
        {
            'Loop_Cycle': loop_cycle,
            'Target_Symbol': labels_a[a_idx],
            'Keyword': labels_b[best_b_for_a[a_idx]],
            'Cos_Sim': float(sim_matrix[a_idx, best_b_for_a[a_idx]]),
            'Gap': float(gap_a[a_idx]),
            'Z_Score': round(float(z_scores[a_idx]), 4),
        }
        for a_idx in hits
    ]


def compiler_lexical_isomorphism_purge(
    stream_a_csv=_DEFAULT_STREAM_A_CSV,
    stream_b_csv=_DEFAULT_STREAM_B_CSV,
    anchors_csv=_DEFAULT_ANCHORS_CSV,
    output_csv=_DEFAULT_OUTPUT_CSV,
    *,
    z_threshold=2.0,
):
    """Iteratively pair Stream A labels with Stream B labels and write the result.

    Starting from the anchors in ``anchors_csv``, the already-anchored labels
    are removed from both streams. Each cycle then computes the cosine
    similarity between all remaining rows, accepts every mutual nearest
    neighbour pair whose gap z-score is at least ``z_threshold``, removes the
    accepted labels from both streams and repeats. The loop stops when a cycle
    yields no new pair or when either stream runs out of rows. All anchors
    (initial and discovered) are written to ``output_csv``.

    Args:
        stream_a_csv: CSV whose first column holds labels and whose remaining
            columns hold the vectors of stream A.
        stream_b_csv: Same layout, for stream B.
        anchors_csv: CSV of initial anchors. Needs ``Gap`` and ``Z_Score``
            columns plus ``Target_Symbol``/``Keyword``/``Cos_Sim`` (or the
            legacy names ``EVA_Symbol``/``Latin_Word``/``Similarity``).
        output_csv: Destination path for the combined anchor table.
        z_threshold: Minimum gap z-score for a pair to be accepted.

    Returns:
        None. If any input file is missing, an error line is printed and
        nothing is written.

    Raises:
        KeyError: if the anchor CSV has rows but lacks a required column.
    """
    print("連鎖的数独パージ (Lexical Isomorphism) コンパイラ 起動...\n")

    if not all(os.path.exists(p) for p in (stream_a_csv, stream_b_csv, anchors_csv)):
        print("!!! 致命的エラー: 必要なファイルが存在しません。")
        return

    # =====================================================================
    # Phase 1: load the initial scaffold and cut anchored labels out of both spaces
    # =====================================================================
    print("■ Phase 1: 初期足場のロードと空間パージ")

    labels_a, matrix_a = _load_stream(stream_a_csv)
    labels_b, matrix_b = _load_stream(stream_b_csv)

    all_anchors = _load_initial_anchors(anchors_csv)
    print(f" -> 初期アンカー {len(all_anchors)} 件をロック。")

    labels_a, matrix_a = _drop_elements(
        labels_a, matrix_a, {anc['Target_Symbol'] for anc in all_anchors}
    )
    labels_b, matrix_b = _drop_elements(
        labels_b, matrix_b, {anc['Keyword'] for anc in all_anchors}
    )

    print(f" -> 空間の再定義完了: Stream A ({len(labels_a)} 変数), Stream B ({len(labels_b)} 語彙)\n")

    # =====================================================================
    # Phase 2 & 3: chained matching loop
    # =====================================================================
    print("■ Phase 2 & 3: 連鎖的数独パージ ループ起動")
    loop_cycle = 1

    while True:
        if len(labels_a) == 0 or len(labels_b) == 0:
            print(f"  [Cycle {loop_cycle}] 空間が枯渇。")
            break

        print(f"\n--- [Cycle {loop_cycle}] 同型衝突開始 ---")
        print(f" -> 現在の空間サイズ: Stream A={len(labels_a)} | Stream B={len(labels_b)}")

        # All-pairs similarity between the remaining rows of both streams.
        sim_matrix = cosine_similarity(matrix_a, matrix_b)
        new_anchors_this_cycle = _find_new_anchors(
            sim_matrix, labels_a, labels_b, loop_cycle, z_threshold
        )

        if not new_anchors_this_cycle:
            print(" -> [停止] 新規アンカーの現像数: 0件。空間の特異点が消失。")
            break

        print(f" -> 【現像成功】 新規アンカーを {len(new_anchors_this_cycle)} 件抽出！")
        for anc in new_anchors_this_cycle:
            print(f"    [Z:{anc['Z_Score']:.2f}] EVA: '{anc['Target_Symbol']}' <===> Latin: '{anc['Keyword']}'")

        # Record the new anchors and remove their labels before the next cycle.
        all_anchors.extend(new_anchors_this_cycle)
        labels_a, matrix_a = _drop_elements(
            labels_a, matrix_a, {anc['Target_Symbol'] for anc in new_anchors_this_cycle}
        )
        labels_b, matrix_b = _drop_elements(
            labels_b, matrix_b, {anc['Keyword'] for anc in new_anchors_this_cycle}
        )

        loop_cycle += 1

    # =====================================================================
    # Phase 4: termination and output of the final anchor table
    # =====================================================================
    print("\n■ Phase 4: 連鎖停止と最終マトリクスの出力")
    df_final = pd.DataFrame(all_anchors)
    df_final.to_csv(output_csv, index=False, encoding="utf-8")

    print("=" * 80)
    # loop_cycle is only advanced after a productive cycle, so this counts those.
    print(f"👑 【数独パージ完了】 全 {loop_cycle - 1} サイクルの連鎖が終了。")
    print(f" -> 最終確定アンカー総数: {len(all_anchors)} 件")
    print(f" -> 出力ファイル: {output_csv}")
    print(f" -> 空間に残存した {len(labels_a)} 個のEVA記号は、[未定義_XXX]として保留されます。")
    print("=" * 80)
# Run the pipeline only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    compiler_lexical_isomorphism_purge()