"""
===============================================================================
OI-2026 Protocol Phase II: Endurance Test 1 - Tri-Collision Domain Comparison
===============================================================================

Purpose:
    Verify that the alchemy domain (Stream B) maintains statistical superiority
    against theology (Stream C) AND practical literature with similar
    "recipe-like structure" (Stream D: Apicius cookery, Vitruvius architecture,
    Columella agriculture).

Hypothesis under test:
    If the alchemy match is merely an artifact of "recipe-like writing style"
    rather than actual alchemical content, then practical Latin literature
    (cookbooks, agricultural manuals) should score comparably to alchemy.

Expected outcome (publication result):
    Voynich     :  40,762 words
    Alchemy (B) : 300,000 words
    Theology(C) : 300,000 words
    Practical(D): 180,646 words

    B (Alchemy)  : avg cos = 0.8032 | win rate = 92.7%
    C (Theology) : avg cos = 0.6626 | win rate =  5.5%
    D (Practical): avg cos = 0.6647 | win rate =  1.7%
    Kruskal-Wallis p ~ 0

Author: Keishi Oi (ORCID: 0009-0006-7040-8353)
License: CC BY-NC 4.0
Reference: Zenodo DOI 10.5281/zenodo.20071552
===============================================================================
"""

import os
import glob
import re
import gc
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import kruskal


# =============================================================================
# Configuration: paths to input data
# =============================================================================
# Inputs and output that live next to the script or in the lab directory.
PATH_VOYNICH = './voynich.txt'
PATH_SCHEMA = './Voynich_Schema_Mapped_Translation.txt'
PATH_OUT = 'Domain_Tri_Collision_Result.csv'
PATH_STREAM_B = r'C:\Voynich_Lab\TRUE_Latin_Included\**\*.txt'

# Root of the CLTK Latin Library checkout shared by Streams C and D, and the
# glob suffix that selects every .txt file below a directory.
_LATIN_LIBRARY_ROOT = r'C:\Users\keish\cltk_data\lat\text\lat_text_latin_library'
_RECURSIVE_TXT = r'\**\*.txt'

# Stream C: Theology corpus (Bible + Christian writings)
PATH_STREAM_C_DIRS = [
    _LATIN_LIBRARY_ROOT + '\\' + topic + _RECURSIVE_TXT
    for topic in ('bible', 'christian')
]

# Stream D: Practical literature corpus (cookery / architecture / agriculture).
# Each author contributes two patterns: loose files in the library root whose
# name starts with the author, then everything under the author's directory.
PATH_STREAM_D_DIRS = [
    pattern
    for author in ('apicius', 'vitruvius', 'columella')
    for pattern in (
        _LATIN_LIBRARY_ROOT + '\\' + author + '*.txt',
        _LATIN_LIBRARY_ROOT + '\\' + author + _RECURSIVE_TXT,
    )
]


# =============================================================================
# Phase 0: CLTK corpus auto-download
# =============================================================================
def download_cltk_corpora():
    """Fetch the Latin Library corpus through CLTK on a best-effort basis.

    This step never aborts the run. If CLTK cannot be imported, or importing
    the corpus raises for any reason (for example when the machine is
    offline), a one-line notice is printed and the function returns so that
    corpora already present on disk can still be used.

    Returns:
        None.
    """
    print('[0/4] Checking CLTK Latin Library...')
    try:
        from cltk.data.fetch import FetchCorpus
    except ImportError as exc:
        print(f'  [WARN] CLTK is not available ({exc}); skipping corpus download.')
        return

    try:
        FetchCorpus(language='lat').import_corpus('lat_text_latin_library')
    except Exception as exc:
        # Deliberately broad: the download is optional, but the reason it was
        # skipped should be visible instead of silently discarded.
        print(
            '  [WARN] CLTK corpus download skipped '
            f'({type(exc).__name__}: {exc}).'
        )


# =============================================================================
# Domain-specific text cleaners
# =============================================================================
# Runs of ASCII digits, brackets, parentheses and sentence punctuation.
_LATIN_NOISE = re.compile(r'[0-9\[\]\(\)\.\,;:!\?]+')


def clean_latin(text: str) -> str:
    """
    Normalise a classical Latin text for tokenisation.

    The text is lower-cased, then every run of ASCII digits (chapter
    numbers, etc.), square brackets, parentheses and the punctuation
    marks . , ; : ! ? is replaced by a single space. All other characters
    are left untouched.
    """
    lowered = text.lower()
    return _LATIN_NOISE.sub(' ', lowered)


# Angle-bracketed locator tags such as <1r.1>.
_VOYNICH_TAG = re.compile(r'<[^>]+>')
# Same punctuation set as the Latin cleaner, but without the digits.
_VOYNICH_PUNCT = re.compile(r'[\[\]\(\)\.\,;:!\?]+')


def clean_voynich(text: str) -> str:
    """
    Normalise a Voynich Manuscript transcription (EVA) for tokenisation.

    Locator tags are blanked out first, then the remainder is lower-cased
    and runs of brackets/punctuation become a single space. Digits are
    kept on purpose: EVA uses 0-9 as actual letter symbols.
    """
    untagged = _VOYNICH_TAG.sub(' ', text)
    return _VOYNICH_PUNCT.sub(' ', untagged.lower())


# =============================================================================
# Corpus loaders
# =============================================================================
def load_text(path: str) -> str:
    """Return the entire contents of the text file at *path*.

    The file is decoded as UTF-8; byte sequences that cannot be decoded are
    dropped instead of raising.
    """
    with open(path, mode='r', encoding='utf-8', errors='ignore') as handle:
        contents = handle.read()
    return contents


def load_multi_corpus(paths, max_words: int = 300_000):
    """Load Latin text files matched by glob patterns, capped at a word count.

    Args:
        paths: Iterable of glob patterns. ``**`` is expanded recursively.
            Patterns are processed in the order given.
        max_words: Maximum number of words to return.

    Returns:
        A list of at most ``max_words`` whitespace-separated tokens, obtained
        by passing each file through ``clean_latin``. Reading stops as soon
        as the cap is reached.
    """
    words = []
    seen_files = set()
    for pattern in paths:
        # glob returns matches in arbitrary, filesystem-dependent order.
        # Because the corpus is truncated at max_words, the order decides
        # which files are sampled, so sort to make the sample reproducible.
        for file_path in sorted(glob.glob(pattern, recursive=True)):
            # A file matched by more than one pattern must only count once.
            file_key = os.path.normcase(os.path.abspath(file_path))
            if file_key in seen_files:
                continue
            seen_files.add(file_key)

            with open(file_path, 'r', encoding='utf-8', errors='ignore') as handle:
                words.extend(clean_latin(handle.read()).split())
            if len(words) >= max_words:
                return words[:max_words]
    return words[:max_words]


def load_schema(path: str) -> pd.DataFrame:
    """Load the schema-mapped translation file and extract (POS, target word) pairs.

    The file is scanned for tags of the form ``[type_<n>:<pos>:<x>_<word>]``.
    For each tag the word is stripped and lower-cased, and the POS label is
    stripped and upper-cased.

    Args:
        path: Path to the schema-mapped translation text file.

    Returns:
        A DataFrame with columns ``Target_Word`` and ``Assigned_POS``, holding
        one row per distinct target word (the first occurrence wins). When the
        file contains no matching tag the frame is empty but still has both
        columns, so callers can index them safely.
    """
    columns = ['Target_Word', 'Assigned_POS']
    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
        content = f.read()

    pattern = re.compile(r'\[type_\d+:([a-z]+):[^_]+_([^\]]+)\]', re.IGNORECASE)
    rows = [
        {
            'Target_Word': word.strip().lower(),
            'Assigned_POS': pos.strip().upper(),
        }
        for pos, word in pattern.findall(content)
    ]
    # Passing the columns explicitly keeps drop_duplicates from raising
    # KeyError on a column-less frame when no tag was found.
    return pd.DataFrame(rows, columns=columns).drop_duplicates(subset=['Target_Word'])


# =============================================================================
# Main: Tri-collision test (B vs C vs D)
# =============================================================================
def _context_windows(words, radius: int = 3):
    """Return one space-joined context string per word (the word plus up to *radius* neighbours on each side)."""
    return [
        ' '.join(words[max(0, i - radius):i + radius + 1])
        for i in range(len(words))
    ]


def run_tri_collision():
    """Run the B vs C vs D domain comparison and print a verdict.

    Steps:
        1. Load the Voynich text and the three Latin streams (B, C, D).
        2. Load the schema and keep the target words that occur in the
           Voynich text.
        3. Embed every word's +/-3-word context window from all four corpora
           in one shared space (uni/bi-gram counts, 40,000 features, reduced
           to 128 dimensions with truncated SVD).
        4. For each matched word, take the context of its first occurrence in
           the Voynich text and record the maximum cosine similarity against
           each stream; the stream with the highest score is the winner.

    Per-word results are written to ``PATH_OUT`` as CSV, and a summary
    (averages, win rates, Kruskal-Wallis p-value, verdict) is printed.

    Returns:
        None. The function also returns early, after printing a message,
        when a corpus is empty, when Stream D is below 5,000 words, or when
        no schema word occurs in the Voynich text.
    """
    download_cltk_corpora()

    print('\n[1/4] Loading corpora (A, B, C, D)...')
    words_v = clean_voynich(load_text(PATH_VOYNICH)).split()
    words_b = load_multi_corpus([PATH_STREAM_B])
    words_c = load_multi_corpus(PATH_STREAM_C_DIRS)
    words_d = load_multi_corpus(PATH_STREAM_D_DIRS)

    print(
        f'  Voynich: {len(words_v):,} | '
        f'Alchemy(B): {len(words_b):,} | '
        f'Theology(C): {len(words_c):,} | '
        f'Practical(D): {len(words_d):,}'
    )

    # An empty stream would otherwise only fail after the expensive SVD step,
    # when cosine_similarity rejects a matrix with zero rows.
    required = (
        ('Voynich (A)', words_v),
        ('Stream B (alchemy)', words_b),
        ('Stream C (theology)', words_c),
    )
    for label, corpus in required:
        if not corpus:
            print(f'\n[FATAL] {label} is empty; check its configured input path.')
            return

    if len(words_d) < 5_000:
        print('\n[FATAL] Stream D (practical literature) has insufficient data.')
        return

    print('\n[2/4] Loading schema...')
    df_schema = load_schema(PATH_SCHEMA)
    matched = set(df_schema['Target_Word']) & set(words_v)
    print(f'  Matched: {len(matched)} / {len(df_schema)} words')
    if not matched:
        print('\n[FATAL] No schema target words occur in the Voynich text; nothing to compare.')
        return

    print('\n[3/4] Building shared 128D SVD space...')
    all_ctx = (
        _context_windows(words_v)
        + _context_windows(words_b)
        + _context_windows(words_c)
        + _context_windows(words_d)
    )

    vec = CountVectorizer(ngram_range=(1, 2), max_features=40_000)
    X = vec.fit_transform(all_ctx)
    del all_ctx
    gc.collect()

    svd = TruncatedSVD(n_components=128, random_state=42)
    M = svd.fit_transform(X)

    # Rows of M follow the concatenation order V, B, C, D (one row per word).
    lv, lb, lc = len(words_v), len(words_b), len(words_c)
    M_v = M[:lv]
    M_b = M[lv:lv + lb]
    M_c = M[lv + lb:lv + lb + lc]
    M_d = M[lv + lb + lc:]

    # Position of the first occurrence of every Voynich token; built once
    # instead of scanning the whole corpus again for each target word.
    first_index = {}
    for position, token in enumerate(words_v):
        first_index.setdefault(token, position)

    print('\n[4/4] Computing Tri-Collision (B vs C vs D)...')
    results = []
    df_matched = df_schema[df_schema['Target_Word'].isin(matched)]
    total = len(df_matched)

    pairs = zip(df_matched['Target_Word'], df_matched['Assigned_POS'])
    for i, (word, pos_tag) in enumerate(pairs, 1):
        vec_v = M_v[first_index[word]].reshape(1, -1)

        score_b = float(cosine_similarity(vec_v, M_b).max())
        score_c = float(cosine_similarity(vec_v, M_c).max())
        score_d = float(cosine_similarity(vec_v, M_d).max())

        # NOTE: on an exact tie max() returns the first key in insertion
        # order, so ties resolve B, then C, then D.
        scores = {'B': score_b, 'C': score_c, 'D': score_d}
        winner = max(scores, key=scores.get)

        results.append({
            'Target_Word': word,
            'POS': pos_tag,
            'B_CosSim': round(score_b, 4),
            'C_CosSim': round(score_c, 4),
            'D_CosSim': round(score_d, 4),
            'Winner': winner,
        })

        if i % 500 == 0 or i == total:
            print(f'  {i}/{total} done')

    df_out = pd.DataFrame(results)
    df_out.to_csv(PATH_OUT, index=False)

    # Averages are taken over the rounded (4-decimal) per-word scores.
    avg_b = df_out['B_CosSim'].mean()
    avg_c = df_out['C_CosSim'].mean()
    avg_d = df_out['D_CosSim'].mean()

    win_counts = df_out['Winner'].value_counts()
    win_rates = {k: (v / len(results)) * 100 for k, v in win_counts.items()}

    _, pval = kruskal(
        df_out['B_CosSim'], df_out['C_CosSim'], df_out['D_CosSim']
    )

    print(f"\n{'=' * 55}")
    print(f"Processed words            : {len(results)}")
    print(f"B (Alchemy)  avg CosSim    : {avg_b:.4f} (win rate: {win_rates.get('B', 0):.1f}%)")
    print(f"C (Theology) avg CosSim    : {avg_c:.4f} (win rate: {win_rates.get('C', 0):.1f}%)")
    print(f"D (Practical) avg CosSim   : {avg_d:.4f} (win rate: {win_rates.get('D', 0):.1f}%)")
    print(f"Kruskal-Wallis p-value     : {pval:.4e}")
    print(f"{'=' * 55}")

    b_win = win_rates.get('B', 0)
    if b_win > max(win_rates.get('C', 0), win_rates.get('D', 0)) and pval < 0.05:
        print('Verdict: B (Alchemy) is statistically superior to all other domains.')
        print('         => Not merely "recipe-like structure"; alchemical content is supported.')
    else:
        print('Verdict: B (Alchemy) superiority is uncertain. Recipe-style bias may be present.')


# Run the full tri-collision comparison only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    run_tri_collision()