import os
import re
from collections import defaultdict

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import silhouette_score
# Default input/output locations (unchanged from the original script).
_DEFAULT_RECORD_TXT = r"C:\Users\141798\Desktop\Voynich_Base\Voynich_Decompiled_Record.txt"
_DEFAULT_OUTPUT_CSV = r"C:\Users\141798\Desktop\Voynich_Base\Voynich_Undefined_Variable_Types.csv"

# The four positional phases, in line order. The order also serves as the
# deterministic tie-breaker when picking a symbol's dominant phase.
_PHASES = ("Boot", "Set", "Transition", "Termination")

# Patterns are compiled once at import time instead of once per line/token.
_TAG_PATTERN = re.compile(r'<[^>]+>')                # <...> metadata markers
_DELIMITER_PATTERN = re.compile(r'[\.\, \- =]+')     # dot, comma, space, hyphen, equals
_VARIABLE_PATTERN = re.compile(r'\[([^\]]+)\]')      # bracketed token, e.g. [fa19s]
# Leading prefix of a symbol: digit+letter, letter+digit, or 1-2 letters.
_SUBSTRATE_PATTERN = re.compile(r'^([0-9][a-z]|[a-z][0-9]|[a-z]{1,2})')

_PREV_PREFIX = 'gravity_prev_'
_NEXT_PREFIX = 'gravity_next_'


def _phase_for_position(index, total):
    """Return the phase name for token ``index`` in a line of ``total`` tokens.

    The relative position ``index / total`` is bucketed into quarters.
    """
    rel_pos = index / total
    if rel_pos < 0.25:
        return "Boot"
    if rel_pos < 0.50:
        return "Set"
    if rel_pos < 0.75:
        return "Transition"
    return "Termination"


def _extract_variable_features(lines):
    """Count per-symbol features over an iterable of text lines.

    For every token that starts with a bracketed symbol ``[...]`` the
    following counters are incremented for that symbol:

    * ``phase_<name>``       - quarter of the line the token sits in,
    * ``substrate_<prefix>`` - leading prefix matched by ``_SUBSTRATE_PATTERN``,
    * ``gravity_prev_<tok>`` / ``gravity_next_<tok>`` - the adjacent token,
      only when that neighbour does not itself start with ``[``.

    Returns:
        A mapping ``symbol -> {feature_name: count}`` in first-seen order.
    """
    var_features = defaultdict(lambda: defaultdict(int))

    for line in lines:
        # 1. Strip <...> metadata markers.
        clean_line = _TAG_PATTERN.sub('', line).strip()
        if not clean_line:
            continue

        # 2. Tokenise strictly on dot, comma, hyphen, equals and space,
        #    e.g. "[fa19s].[9],[hae]" -> ['[fa19s]', '[9]', '[hae]'].
        tokens = [t.strip() for t in _DELIMITER_PATTERN.split(clean_line) if t.strip()]
        total_tokens = len(tokens)

        for i, token in enumerate(tokens):
            match = _VARIABLE_PATTERN.match(token)
            if not match:
                continue
            features = var_features[match.group(1)]

            # Feature 1: positional phase within the line.
            features[f"phase_{_phase_for_position(i, total_tokens)}"] += 1

            # Feature 2: symbol prefix ("substrate").
            sub_match = _SUBSTRATE_PATTERN.match(match.group(1))
            if sub_match:
                features[f"substrate_{sub_match.group(1)}"] += 1

            # Feature 3: non-bracketed neighbours at t-1 and t+1.
            if i > 0 and not tokens[i - 1].startswith('['):
                features[f"{_PREV_PREFIX}{tokens[i - 1]}"] += 1
            if i < total_tokens - 1 and not tokens[i + 1].startswith('['):
                features[f"{_NEXT_PREFIX}{tokens[i + 1]}"] += 1

    return var_features


def _cluster_variables(feature_dicts):
    """Cluster feature dicts with TF-IDF -> TruncatedSVD -> KMeans.

    k is searched over ``2 .. min(10, n_samples - 1)`` and the k with the
    highest silhouette score wins.

    Returns:
        ``(labels, best_k, best_score)``. ``best_score`` stays at ``-1`` when
        no candidate k could be scored (fewer than three samples, or KMeans
        never produced two distinct labels).
    """
    n_samples = len(feature_dicts)
    if n_samples < 2:
        # Nothing to separate: put the lone sample in a single cluster
        # instead of asking KMeans for more clusters than samples.
        return [0] * n_samples, 1, -1

    x_counts = DictVectorizer(sparse=True).fit_transform(feature_dicts)
    x_tfidf = TfidfTransformer().fit_transform(x_counts)

    # Project to at most 56 dimensions. With a single feature column
    # n_features - 1 would be 0 components, so skip the projection.
    n_features = x_tfidf.shape[1]
    if n_features > 1:
        n_comp = min(56, n_features - 1)
        x_reduced = TruncatedSVD(n_components=n_comp, random_state=42).fit_transform(x_tfidf)
    else:
        x_reduced = x_tfidf.toarray()

    best_k, best_score, best_labels = 2, -1, None
    for k in range(2, min(11, n_samples)):
        labels = KMeans(n_clusters=k, random_state=42, n_init=10).fit_predict(x_reduced)
        # The silhouette score needs at least two distinct labels; KMeans can
        # return fewer when samples coincide.
        if len(set(labels.tolist())) < 2:
            continue
        score = silhouette_score(x_reduced, labels)
        if score > best_score:
            best_k, best_score, best_labels = k, score, labels

    if best_labels is None:
        # No k could be scored; fall back to the default k (capped at n_samples).
        best_k = min(2, n_samples)
        best_labels = KMeans(n_clusters=best_k, random_state=42, n_init=10).fit_predict(x_reduced)

    return best_labels, best_k, best_score


def _anchor_label(feature_name):
    """Turn a ``gravity_prev_X`` / ``gravity_next_X`` key into ``t-1:X`` / ``t+1:X``."""
    # Only the leading prefix is rewritten, so the token text is never altered.
    if feature_name.startswith(_PREV_PREFIX):
        return 't-1:' + feature_name[len(_PREV_PREFIX):]
    return 't+1:' + feature_name[len(_NEXT_PREFIX):]


def _build_result_row(eva, features, cluster_id):
    """Build one output row (dict) for symbol ``eva``."""
    # Ties resolve to the earliest phase in _PHASES, so output is reproducible.
    dominant_phase = max(_PHASES, key=lambda p: features.get(f"phase_{p}", 0))

    # Two most frequent adjacent anchors; the stable sort keeps first-seen
    # order among equal counts.
    gravity_features = [name for name in features if name.startswith('gravity_')]
    top_anchors = sorted(gravity_features, key=lambda name: features[name], reverse=True)[:2]

    return {
        'Target_Symbol': eva,
        'Assigned_Cluster': f"Type_{cluster_id}",
        'Dominant_Phase': dominant_phase,
        'Top_Adjacent_Anchors': ", ".join(_anchor_label(a) for a in top_anchors),
    }


def compiler_undefined_variable_structure_reverse_analysis(
    record_txt=_DEFAULT_RECORD_TXT,
    output_csv=_DEFAULT_OUTPUT_CSV,
):
    """Cluster bracketed ``[symbol]`` tokens of a record file into types.

    Reads ``record_txt`` (UTF-8), extracts per-symbol features, clusters the
    symbols and writes one row per symbol to ``output_csv`` with the columns
    ``Target_Symbol``, ``Assigned_Cluster``, ``Dominant_Phase`` and
    ``Top_Adjacent_Anchors``. Progress is printed to stdout.

    Args:
        record_txt: Path of the input record file.
        output_csv: Path of the CSV file to write.

    Returns:
        None. Returns early, without writing anything, when the input file
        does not exist or contains no bracketed symbol.
    """
    print("未定義スロット・クラスタリングコンパイラ 起動...\n")

    if not os.path.exists(record_txt):
        print("!!! 致命的エラー: デコンパイル済レコードが見つかりません。")
        return

    # ------------------------------------------------------------------
    # Phase 1 & 2: feature extraction over every token of every line.
    # ------------------------------------------------------------------
    print("■ Phase 1 & 2: 物理パラメータの深層抽出 (全スロット走査)")

    with open(record_txt, "r", encoding="utf-8") as f:
        var_features = _extract_variable_features(f)

    vocab = list(var_features)
    if len(vocab) < 2:
        print(f" -> 警告: 検出された変数が少なすぎます ({len(vocab)}件)。パース条件を再確認してください。")
        if not vocab:
            return

    print(f" -> 検出されたユニークな未定義変数: {len(vocab)} 件")

    # ------------------------------------------------------------------
    # Phase 3: clustering with automatic selection of k.
    # ------------------------------------------------------------------
    print("■ Phase 3: 動的クラスタリング (数理的評価による最適kの決定)")

    final_labels, best_k, max_score = _cluster_variables([var_features[v] for v in vocab])

    print(f" -> 【結果】 最適な型(Type)の数を決定: k={best_k} (純度: {max_score:.4f})")

    # ------------------------------------------------------------------
    # Phase 4: per-symbol summary and CSV output.
    # ------------------------------------------------------------------
    print("■ Phase 4: 型(Type)のマトリクス現像と出力")

    results = [
        _build_result_row(eva, var_features[eva], cluster_id)
        for eva, cluster_id in zip(vocab, final_labels)
    ]

    pd.DataFrame(results).to_csv(output_csv, index=False, encoding="utf-8")

    # Group members once rather than rescanning all rows for every cluster.
    members_by_cluster = defaultdict(list)
    for row in results:
        members_by_cluster[row['Assigned_Cluster']].append(row['Target_Symbol'])

    print("\n" + "="*80)
    print(f"👑 【現像完了】 未定義変数を {best_k} 個の物理的役割（型）に分類。")
    for c in range(best_k):
        cluster_members = members_by_cluster.get(f"Type_{c}", [])
        print(f" -> [Type_{c}] 構成数: {len(cluster_members)} | 例: {', '.join(cluster_members[:3])}")
    print(f"\n -> 物理レコード現像完了: {output_csv}")
    print("="*80)
# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    compiler_undefined_variable_structure_reverse_analysis()