"""
===============================================================================
OI-2026 Protocol Phase II: Endurance Test 2 - Binary Domain Collision (B vs C)
===============================================================================

Purpose:
    Cross-validate Test 1 with a smaller, independent sample. Compare the
    alchemy domain (Stream B) directly against theology (Stream C) using
    Mann-Whitney U with the alternative hypothesis "B > C".

Hypothesis under test:
    If alchemy is the correct target domain, B should statistically dominate
    C across thousands of independent word-level comparisons.

Expected outcome (publication result):
    Voynich   :  45,223 words
    Alchemy(B): 300,000 words
    Theology(C): 300,000 words
    Schema-matched: 6,157 / 8,763 (70.3%)

    B (Alchemy)  : avg cos = 0.7988
    C (Theology) : avg cos = 0.6630
    Mean diff B - C: 0.1358
    B win rate     : 92.7%
    Mann-Whitney p ~ 0

Note on tokenization:
    Because EVA (which uses 0-9 as letter symbols) is mixed with Latin in
    the same SVD space, we use word-level n-grams with a permissive token
    pattern (token_pattern=r'[^\s]+') rather than character n-grams.

Author: Keishi Oi (ORCID: 0009-0006-7040-8353)
License: CC BY-NC 4.0
Reference: Zenodo DOI 10.5281/zenodo.20071552
===============================================================================
"""

import glob
import re
import gc
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import mannwhitneyu


# =============================================================================
# Configuration: paths to input data
# =============================================================================
PATH_VOYNICH = './voynich.txt'
PATH_STREAM_B = r'C:\Voynich_Lab\TRUE_Latin_Included\**\*.txt'
PATH_SCHEMA = './Voynich_Schema_Mapped_Translation.txt'
PATH_OUT = 'Domain_Comparison_Fixed2.csv'

PATH_STREAM_C_DIRS = [
    r'C:\Users\keish\cltk_data\lat\text\lat_text_latin_library\bible\**\*.txt',
    r'C:\Users\keish\cltk_data\lat\text\lat_text_latin_library\christian\**\*.txt',
]


# =============================================================================
# Domain-specific text cleaners
# =============================================================================
def clean_latin(text: str) -> str:
    """Cleaner for classical Latin: strips digits and punctuation."""
    return re.sub(r'[0-9\[\]\(\)\.\,;:!\?]+', ' ', text.lower())


def clean_voynich(text: str) -> str:
    """
    Cleaner for the Voynich Manuscript (EVA transcription).
    Preserves digits and '!' '+' (these are EVA letter symbols).
    Removes only brackets and basic punctuation.
    """
    text = text.lower()
    text = re.sub(r'[\[\]\(\)\.\,;:]', ' ', text)
    return text


# =============================================================================
# Corpus loaders
# =============================================================================
def load_text(path: str) -> str:
    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
        return f.read()


def load_corpus(glob_path: str, max_words: int = 300_000):
    words = []
    for f in glob.glob(glob_path, recursive=True):
        with open(f, 'r', encoding='utf-8', errors='ignore') as file:
            words.extend(clean_latin(file.read()).split())
            if len(words) > max_words:
                break
    if not words:
        raise FileNotFoundError(f'Missing: {glob_path}')
    return words[:max_words]


def load_multi_corpus(paths, max_words: int = 300_000):
    words = []
    for path in paths:
        for f in glob.glob(path, recursive=True):
            with open(f, 'r', encoding='utf-8', errors='ignore') as file:
                words.extend(clean_latin(file.read()).split())
                if len(words) > max_words:
                    return words[:max_words]
    if not words:
        raise FileNotFoundError(f'Missing: {paths}')
    return words[:max_words]


def load_schema(path: str) -> pd.DataFrame:
    rows = []
    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
        content = f.read()
    pattern = re.compile(r'\[type_\d+:([a-z]+):[^_]+_([^\]]+)\]', re.IGNORECASE)
    for pos, word in pattern.findall(content):
        w = word.strip().lower()
        if w:
            rows.append({'Target_Word': w, 'Assigned_POS': pos.strip().upper()})
    if not rows:
        raise ValueError('Failed to extract schema.')
    df = pd.DataFrame(rows).drop_duplicates(subset=['Target_Word'])
    print(f'  Schema: {len(df)} words')
    return df


# =============================================================================
# Main: Binary collision test (B vs C)
# =============================================================================
def run_binary_collision():
    print('[1/4] Loading corpora...')

    # Voynich is read with clean_voynich (EVA-preserving)
    words_v = clean_voynich(load_text(PATH_VOYNICH)).split()
    words_b = load_corpus(PATH_STREAM_B)
    words_c = load_multi_corpus(PATH_STREAM_C_DIRS)

    print(
        f'  Voynich : {len(words_v):,} / '
        f'Alchemy: {len(words_b):,} / '
        f'Theology: {len(words_c):,}'
    )

    print('\n[2/4] Loading schema...')
    df_schema = load_schema(PATH_SCHEMA)

    set_v = set(words_v)
    matched = set(df_schema['Target_Word']) & set_v
    unmatched_sample = list(set(df_schema['Target_Word']) - set_v)[:10]

    print(f'  Matched     : {len(matched)} / {len(df_schema)}')
    print(f'  Match rate  : {len(matched) / len(df_schema) * 100:.1f}%')
    print(f'  Unmatched ex: {unmatched_sample}')

    if not matched:
        print('FATAL: no matches. Check voynich.txt format.')
        return

    print('\n[3/4] Building shared 128D SVD space...')
    mk_ctx = lambda ws: [
        ' '.join(ws[max(0, i - 3):i + 4]) for i in range(len(ws))
    ]
    all_ctx = mk_ctx(words_v) + mk_ctx(words_b) + mk_ctx(words_c)

    # Word-level n-grams (EVA + Latin coexist; permissive token pattern)
    vec = CountVectorizer(
        ngram_range=(1, 2),
        max_features=30_000,
        analyzer='word',
        token_pattern=r'[^\s]+',
    )
    X = vec.fit_transform(all_ctx)
    del all_ctx
    gc.collect()

    svd = TruncatedSVD(n_components=128, random_state=42)
    M = svd.fit_transform(X)

    lv, lb = len(words_v), len(words_b)
    M_v = M[:lv]
    M_b = M[lv:lv + lb]
    M_c = M[lv + lb:]

    arr_v = np.array(words_v)
    arr_b = np.array(words_b)
    arr_c = np.array(words_c)

    print('\n[4/4] B (Alchemy) vs C (Theology) collision...')
    results = []
    df_matched = df_schema[df_schema['Target_Word'].isin(matched)]

    for i, (_, row) in enumerate(df_matched.iterrows(), 1):
        word = row['Target_Word']
        idx_v = np.where(arr_v == word)[0][0]
        vec_v = M_v[idx_v].reshape(1, -1)

        sim_b = cosine_similarity(vec_v, M_b).flatten()
        sim_c = cosine_similarity(vec_v, M_c).flatten()
        score_b = float(sim_b.max())
        score_c = float(sim_c.max())

        results.append({
            'Target_Word': word,
            'POS': row['Assigned_POS'],
            'B_Top_Word': arr_b[sim_b.argmax()],
            'B_CosSim': round(score_b, 4),
            'C_Top_Word': arr_c[sim_c.argmax()],
            'C_CosSim': round(score_c, 4),
            'B_minus_C': round(score_b - score_c, 4),
            'B_wins': score_b > score_c,
        })

        if i % 200 == 0 or i == len(df_matched):
            print(f'  {i}/{len(df_matched)} done')

    df_out = pd.DataFrame(results)
    df_out.to_csv(PATH_OUT, index=False)

    b_win = df_out['B_wins'].mean() * 100
    avg_b = df_out['B_CosSim'].mean()
    avg_c = df_out['C_CosSim'].mean()
    diff = df_out['B_minus_C'].mean()
    _, pval = mannwhitneyu(
        df_out['B_CosSim'], df_out['C_CosSim'], alternative='greater'
    )

    print(f"\n{'=' * 55}")
    print(f"Processed words            : {len(results)}")
    print(f"B (Alchemy)  avg CosSim    : {avg_b:.4f}")
    print(f"C (Theology) avg CosSim    : {avg_c:.4f}")
    print(f"Mean diff B - C            : {diff:.4f}")
    print(f"B win rate (CosSim)        : {b_win:.1f}%")
    print(f"Mann-Whitney p-value       : {pval:.4e}")
    print(f"{'=' * 55}")

    if b_win >= 70 and pval < 0.01:
        print('Verdict: B (Alchemy) is statistically significant superior.')
        print('         => Strongly supports domain selection.')
    elif b_win >= 60 and pval < 0.05:
        print('Verdict: B (Alchemy) marginally superior (5% level).')
    else:
        print('Verdict: Small difference; further verification needed.')


if __name__ == '__main__':
    run_binary_collision()