import os
import time

import numpy as np
import pandas as pd
def compiler_lexical_isomorphism_128d_purge_chunked_v3(): print("128D 連鎖的数独パージ起動...\n")
path_a = r"C:\Users\141798\Desktop\Voynich_Base\Stream_A_ContextDense_128D_v5.csv"
path_b = r"C:\Users\141798\Desktop\Voynich_Base\Stream_B_ContextDense_128D_v5.csv"
path_init = r"C:\Users\141798\Desktop\Voynich_Base\Voynich_Absolute_Anchors_Phase4.csv"
output_csv = r"C:\Users\141798\Desktop\Voynich_Base\Voynich_Absolute_Anchors_Phase5_128D.csv"

if not all(os.path.exists(p) for p in [path_a, path_b, path_init]):
    print("!!! 致命的エラー: ファイルが見つかりません。")
    return

# Phase 1: ロード
df_a = pd.read_csv(path_a, keep_default_na=False)
df_b = pd.read_csv(path_b, keep_default_na=False)
df_init = pd.read_csv(path_init, keep_default_na=False)

labels_a = df_a.iloc[:, 0].astype(str).to_numpy()
matrix_a = df_a.iloc[:, 1:].astype(float).to_numpy()
labels_b = df_b.iloc[:, 0].astype(str).to_numpy()
matrix_b = df_b.iloc[:, 1:].astype(float).to_numpy()
TOTAL_A_ORIGINAL = len(labels_a)

all_anchors = []
targets_a_to_drop = set()
targets_b_to_drop = set()
for _, row in df_init.iterrows():
    sym = str(row.get('Target_Symbol', row.get('EVA_Symbol')))
    kwd = str(row.get('Keyword', row.get('Latin_Word')))
    all_anchors.append({
        'Loop_Cycle': 0, 'Target_Symbol': sym, 'Keyword': kwd,
        'Cos_Sim': float(row.get('Cos_Sim', row.get('Similarity', 0))),
        'Gap': float(row.get('Gap', 0)), 'Z_Score': float(row.get('Z_Score', 0))
    })
    targets_a_to_drop.add(sym)
    targets_b_to_drop.add(kwd)

def drop_elements(labels, matrix, drop_set):
    keep_idx = [i for i, lbl in enumerate(labels) if lbl not in drop_set]
    return labels[keep_idx], matrix[keep_idx]

labels_a, matrix_a = drop_elements(labels_a, matrix_a, targets_a_to_drop)
labels_b, matrix_b = drop_elements(labels_b, matrix_b, targets_b_to_drop)

print(f" -> 初期アンカーパージ完了。空間再定義: A({len(labels_a)}), B({len(labels_b)})")

# ループ開始
loop_cycle = 1
while True:
    if len(labels_a) == 0 or len(labels_b) == 0: break

    progress = ((TOTAL_A_ORIGINAL - len(labels_a)) / TOTAL_A_ORIGINAL) * 100
    print(f"\n{'='*75}\n🔄 [Cycle {loop_cycle}] 進捗: {progress:.2f}% | 残存 A:{len(labels_a)} B:{len(labels_b)}\n{'-'*75}")

    # --- チャンク処理による類似度・MNN演算 ---
    num_a = len(labels_a)
    num_b = len(labels_b)
   
    # グローバル追跡用
    best_sim_for_a = np.zeros(num_a)
    best_idx_b_for_a = np.zeros(num_a, dtype=int)
    gap_a = np.zeros(num_a)
   
    # B側から見た「現時点での」最高Simと対応するAのIndex
    best_sim_for_b = np.full(num_b, -1.0)
    best_idx_a_for_b = np.full(num_b, -1, dtype=int)

    # ベクトルの正規化（一度だけ行う）
    norm_a = np.linalg.norm(matrix_a, axis=1, keepdims=True)
    norm_a[norm_a == 0] = 1e-9
    ma_n = matrix_a / norm_a
    norm_b = np.linalg.norm(matrix_b, axis=1, keepdims=True)
    norm_b[norm_b == 0] = 1e-9
    mb_n = matrix_b / norm_b

    chunk_size = 500  # メモリ安全圏
    print(f" [チャンク演算中] 全 {num_a} 件を {chunk_size} 件ずつ処理...")
   
    start_t = time.time()
    for i in range(0, num_a, chunk_size):
        end_i = min(i + chunk_size, num_a)
        # 部分行列計算 (i〜end_i の A vs 全ての B)
        # メモリ消費: 500 * 173848 * 8 bytes ≈ 695 MB (安全)
        chunk_sim = np.dot(ma_n[i:end_i], mb_n.T)

        # A側: 各行のTop2とGapを取得
        # argpartitionは全ソートより遥かに速い
        top2_indices = np.argpartition(chunk_sim, -2, axis=1)[:, -2:]
        top2_values = np.take_along_axis(chunk_sim, top2_indices, axis=1)
        top2_values.sort(axis=1) # [2位, 1位]
       
        best_sim_for_a[i:end_i] = top2_values[:, 1]
        best_idx_b_for_a[i:end_i] = top2_indices[np.arange(len(top2_values)), 1]
        gap_a[i:end_i] = top2_values[:, 1] - top2_values[:, 0]

        # B側: このチャンク内での最大値をグローバルと比較・更新
        chunk_best_sim_b = np.max(chunk_sim, axis=0)
        chunk_best_idx_a_b = np.argmax(chunk_sim, axis=0) + i
       
        # ベクトル演算で一括更新
        better_mask = chunk_best_sim_b > best_sim_for_b
        best_sim_for_b[better_mask] = chunk_best_sim_b[better_mask]
        best_idx_a_for_b[better_mask] = chunk_best_idx_a_b[better_mask]
       
        if (i // chunk_size) % 5 == 0:
            print(f"  -> {end_i}/{num_a} 完了...")

    print(f" [完了] 総演算時間: {time.time() - start_t:.2f}秒")

    # --- 統計と判定 ---
    mean_gap = np.mean(gap_a)
    std_gap = np.std(gap_a)
    new_anchors = []
    a_purge_set, b_purge_set = set(), set()
   
    for a_idx in range(num_a):
        b_idx = best_idx_b_for_a[a_idx]
        # MNN条件: Aが選んだBにとって、最高なのがこのAであるか
        if best_idx_a_for_b[b_idx] == a_idx:
            z_score = (gap_a[a_idx] - mean_gap) / std_gap if std_gap > 0 else 0
            if z_score >= 2.0:
                new_anchors.append({
                    'Loop_Cycle': loop_cycle, 'Target_Symbol': labels_a[a_idx],
                    'Keyword': labels_b[b_idx], 'Cos_Sim': best_sim_for_a[a_idx],
                    'Gap': gap_a[a_idx], 'Z_Score': round(z_score, 4)
                })
                a_purge_set.add(labels_a[a_idx])
                b_purge_set.add(labels_b[b_idx])

    if not new_anchors:
        print(" -> [停止] ")
        break

    print(f" -> 【現像】 新規アンカー {len(new_anchors)} 件確定。")
    for anc in sorted(new_anchors, key=lambda x: x['Z_Score'], reverse=True)[:5]:
        print(f"    [Z:{anc['Z_Score']:.2f}] '{anc['Target_Symbol']}' <==> '{anc['Keyword']}'")

    all_anchors.extend(new_anchors)
    labels_a, matrix_a = drop_elements(labels_a, matrix_a, a_purge_set)
    labels_b, matrix_b = drop_elements(labels_b, matrix_b, b_purge_set)
    loop_cycle += 1

# 保存
pd.DataFrame(all_anchors).to_csv(output_csv, index=False, encoding="utf-8")
print(f"\n👑 【完了】 総アンカー数: {len(all_anchors)} 件 -> {output_csv}")
# Run the purge only when executed as a script, not when imported.
# (The dunder underscores had been stripped: `name == "main"` raised NameError.)
if __name__ == "__main__":
    compiler_lexical_isomorphism_128d_purge_chunked_v3()