import os
import re
from collections import defaultdict

import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
# Default input/output locations; overridable through the entry point's
# keyword arguments.
_DEFAULT_STREAM_A_RAW = r"C:\Users\141798\Desktop\Voynich_Base\voynich.txt"
_DEFAULT_STREAM_B_DIR = r"C:\Users\141798\Desktop\Voynich_Base\Final_Refined_Library"
# The "_v5" suffix distinguishes this version's output files.
_DEFAULT_OUT_A_CSV = r"C:\Users\141798\Desktop\Voynich_Base\Stream_A_ContextDense_128D_v5.csv"
_DEFAULT_OUT_B_CSV = r"C:\Users\141798\Desktop\Voynich_Base\Stream_B_ContextDense_128D_v5.csv"

# Patterns are compiled once at import time instead of once per line/file.
_TAG_PATTERN = re.compile(r'<[^>]+>')
_SEPARATOR_PATTERN = re.compile(r'[.,\-=\s]+')
_NON_ALPHA_PATTERN = re.compile(r'[^a-zA-Z\s]')


def _read_stream_a_blocks(path):
    """Return one word list per line of *path* that has at least two words."""
    blocks = []
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        for line in f:
            # Replace <...> tags with a space, then treat '.', ',', '-', '='
            # and whitespace runs as word separators.
            clean_line = _TAG_PATTERN.sub(' ', line)
            words = _SEPARATOR_PATTERN.sub(' ', clean_line).split()
            if len(words) >= 2:
                blocks.append(words)
    return blocks


def _read_stream_b_blocks(directory, window_size):
    """Return fixed-width word chunks from every ``.txt`` file under *directory*.

    Each file is lower-cased, stripped of everything except ASCII letters and
    whitespace, and cut into consecutive chunks of *window_size* words. A
    trailing chunk with fewer than two words is dropped. Files that cannot be
    read are reported and skipped.
    """
    blocks = []
    for root, dirnames, filenames in os.walk(directory):
        # Sort in place so the traversal (and therefore the vocabulary order
        # of the output) does not depend on filesystem enumeration order.
        dirnames.sort()
        for filename in sorted(filenames):
            if not filename.lower().endswith(".txt"):
                continue
            filepath = os.path.join(root, filename)
            try:
                with open(filepath, "r", encoding="utf-8", errors="ignore") as file:
                    raw_text = file.read().lower()
            except OSError as e:
                # Best-effort: keep going, but do not hide the failure.
                print(f" -> [警告] {filepath} の読み込みに失敗したためスキップします: {e}")
                continue

            words = _NON_ALPHA_PATTERN.sub(' ', raw_text).split()
            for i in range(0, len(words), window_size):
                block_words = words[i:i + window_size]
                if len(block_words) >= 2:
                    blocks.append(block_words)
    return blocks


def _extract_and_project(text_blocks, stream_name, target_dimensions):
    """Build per-word context features and project them with TruncatedSVD.

    For every word occurrence the following counters are incremented:
    its position in the block (``pos_START`` / ``pos_MID`` / ``pos_END``),
    the preceding word (``prev_<word>``, or ``prev_START`` at the block start)
    and the following word (``next_<word>``, or ``next_END`` at the block end).
    The counts are vectorized, TF-IDF weighted and reduced by SVD.

    Returns:
        ``(vocab, dense)`` where ``vocab`` lists the words in first-seen order
        and ``dense`` holds one projected row per word.

    Raises:
        ValueError: if fewer than one SVD component can be requested.
    """
    print(f"■ Phase 2 & 3: [{stream_name}] 特徴量抽出と{target_dimensions}D射影")
    word_features = defaultdict(lambda: defaultdict(int))

    for words in text_blocks:
        length = len(words)
        last = length - 1
        for i, w in enumerate(words):
            if not w:
                continue

            # Feature 1: position of the word inside its block.
            if i == 0:
                pos = 'START'
            elif i == last:
                pos = 'END'
            else:
                pos = 'MID'
            features = word_features[w]
            features[f'pos_{pos}'] += 1

            # Feature 2: immediate neighbours (first-order transitions).
            prev_w = words[i - 1] if i > 0 else 'START'
            next_w = words[i + 1] if i < last else 'END'
            features[f'prev_{prev_w}'] += 1
            features[f'next_{next_w}'] += 1

    vocab = list(word_features)
    feature_dicts = [word_features[w] for w in vocab]

    print(f" -> [{stream_name}] DictVectorizerによる疎行列化...")
    vectorizer = DictVectorizer(sparse=True)
    X_counts = vectorizer.fit_transform(feature_dicts)

    print(f" -> [{stream_name}] TfidfTransformerによる特異引力計算...")
    tfidf = TfidfTransformer()
    X_tfidf = tfidf.fit_transform(X_counts)

    # Keep the component count strictly below the number of features.
    actual_dim = min(target_dimensions, X_tfidf.shape[1] - 1)
    if actual_dim < 1:
        raise ValueError(
            f"[{stream_name}] cannot project {X_tfidf.shape[1]} feature(s) "
            f"to {target_dimensions} dimension(s)"
        )

    print(f" -> [{stream_name}] TruncatedSVDによる {actual_dim}Dへの射影...")
    svd = TruncatedSVD(n_components=actual_dim, random_state=42)
    X_dense = svd.fit_transform(X_tfidf)
    var_ratio = svd.explained_variance_ratio_.sum() * 100

    print(f"    * 対象変数(語彙数): {len(vocab)}")
    print(f"    * SVD分散説明率: {var_ratio:.2f}%\n")

    return vocab, X_dense


def compiler_context_topology_rebuild_v5(
    stream_a_raw: str = _DEFAULT_STREAM_A_RAW,
    stream_b_dir: str = _DEFAULT_STREAM_B_DIR,
    out_a_csv: str = _DEFAULT_OUT_A_CSV,
    out_b_csv: str = _DEFAULT_OUT_B_CSV,
    target_dimensions: int = 128,
) -> None:
    """Build context-feature embeddings for two corpora and write them as CSV.

    Stream A is a single text file parsed line by line; Stream B is a directory
    tree of ``.txt`` files cut into chunks whose width equals the average
    number of words per Stream A line (minimum 2). Both streams go through the
    same feature extraction and SVD projection, are truncated to a common
    number of columns, and are written to *out_a_csv* / *out_b_csv*.

    Called without arguments it behaves like the original script. If an input
    is missing or yields no usable text, an error is printed and the function
    returns without writing anything.

    Args:
        stream_a_raw: Path of the Stream A text file.
        stream_b_dir: Root directory searched recursively for ``.txt`` files.
        out_a_csv: Output CSV for Stream A (first column ``Target_Symbol``).
        out_b_csv: Output CSV for Stream B (first column ``Keyword``).
        target_dimensions: Upper bound on the number of SVD components.

    Raises:
        ValueError: if a stream has too few features to be projected.
    """
    print("空間パース完全修復コンパイラ (動的スケール同期版) 起動...\n")

    # -----------------------------------------------------------------
    # Phase 1-A: parse Stream A
    # -----------------------------------------------------------------
    print("■ Phase 1-A: Stream A (手稿) の絶対パースとスケール測定")

    if not os.path.isfile(stream_a_raw):
        print(f"!!! 致命的エラー: {stream_a_raw} が見つかりません。")
        return

    a_texts = _read_stream_a_blocks(stream_a_raw)
    print(f" -> [Stream A] 抽出テキストブロック数(行): {len(a_texts)}")

    if not a_texts:
        # Without this guard the average below divides by zero.
        print(f"!!! 致命的エラー: {stream_a_raw} から2単語以上の行を抽出できませんでした。")
        return

    # Chunk width for Stream B: average words per Stream A line, floored,
    # and never smaller than 2.
    total_words_a = sum(len(words) for words in a_texts)
    dynamic_window_size = max(2, total_words_a // len(a_texts))
    print(f" -> [空間同期] Stream Aの平均スケールを算出。動的スライス幅を【 {dynamic_window_size} 単語 】にロックしました。\n")

    # -----------------------------------------------------------------
    # Phase 1-B: parse Stream B with the window width derived above
    # -----------------------------------------------------------------
    print("■ Phase 1-B: Stream B (ラテン語) の動的スライスパース")

    if not os.path.isdir(stream_b_dir):
        print(f"!!! 致命的エラー: {stream_b_dir} が見つかりません。")
        return

    b_texts = _read_stream_b_blocks(stream_b_dir, dynamic_window_size)
    print(f" -> [Stream B] 抽出テキストブロック数(動的チャンク): {len(b_texts)}\n")

    if not b_texts:
        print(f"!!! 致命的エラー: {stream_b_dir} から有効なテキストブロックを抽出できませんでした。")
        return

    # -----------------------------------------------------------------
    # Phase 2 & 3: context feature extraction and SVD projection
    # -----------------------------------------------------------------
    vocab_a, dense_a = _extract_and_project(a_texts, "Stream A", target_dimensions)
    vocab_b, dense_b = _extract_and_project(b_texts, "Stream B", target_dimensions)

    # -----------------------------------------------------------------
    # Phase 4: write both matrices as CSV
    # -----------------------------------------------------------------
    print(f"■ Phase 4: {target_dimensions}Dマトリクスの現像（CSV出力）")

    # Use the actual array widths so both outputs share the same columns.
    final_dim = min(dense_a.shape[1], dense_b.shape[1])
    dense_a = dense_a[:, :final_dim]
    dense_b = dense_b[:, :final_dim]

    dim_cols = [f"Dim_{i+1}" for i in range(final_dim)]

    df_a = pd.DataFrame(dense_a, columns=dim_cols)
    df_a.insert(0, 'Target_Symbol', vocab_a)
    df_a.to_csv(out_a_csv, index=False, encoding='utf-8')
    print(f" -> [出力完了] {out_a_csv}")

    df_b = pd.DataFrame(dense_b, columns=dim_cols)
    df_b.insert(0, 'Keyword', vocab_b)
    df_b.to_csv(out_b_csv, index=False, encoding='utf-8')
    print(f" -> [出力完了] {out_b_csv}")

    print("\n" + "="*80)
    print("👑 【トポロジー完全修復・現像完了】")
    print(f" -> 動的スケール同期により、Stream B はStream Aと同じ『{dynamic_window_size}単語の空間幅』で完全に切断・同期。")
    print("="*80)
# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    compiler_context_topology_rebuild_v5()