"""
COLLABORATIVE TRANSLATION OUTPUT
=================================
Shows confirmed matches normally.
Shows UNMATCHED/MECHANICAL words in *italics* for human review.

This allows humans to see what's translated vs. what needs work.
"""

import re
import os

# ============================================================
# CONFIRMED MATCHES (target: 90%+ confidence, strict phonetic match;
# each entry carries its own score, and several are rated below 90)
# ============================================================

# Lexicon of accepted readings, keyed by upper-case transcription token.
# Each value is a 3-tuple: (proposed reading, English gloss, confidence score).
# translate_word() in this file renders only the first element; the gloss and
# score are carried along for reference.
# The score looks like a percentage (see the banner above) - TODO confirm.
CONFIRMED = {
    # Prefixes
    "QO": ("QUO", "by which", 95),
    "D": ("DI/DA", "of/from", 95),
    "S": ("SI/SU", "self/on", 90),
    "L": ("LOCO", "place", 95),
    "P": ("PORTA", "carry", 95),
    "T": ("TEMPO", "time", 90),
    "O": ("OLIM/ORA", "state/now", 80),  # Less certain
    
    # Roots - Materials
    "OR": ("ORCIO", "jar", 95),
    # NOTE(review): "OL" is defined a second time under "Suffixes" below. In a
    # dict literal the later value wins, so this OLLA/"pot" reading is never
    # visible at runtime. Decide which reading should survive.
    "OL": ("OLLA", "pot", 95),
    "AR": ("ARIA", "air", 98),
    "AIR": ("AIRE", "air", 98),
    "CHOL": ("CHOLERA", "fluid", 85),
    "CHOR": ("CHORE", "essence", 80),
    "CHAR": ("CHIARO", "clear", 85),
    "DAL": ("DATO-L", "salt?", 70),
    
    # Roots - Actions
    "DAIIN": ("DARE-IN", "add inside", 95),
    "DAM": ("DATUM", "done", 90),
    "TAL": ("TAGLIA", "cut", 95),
    "TOL": ("TOGLI", "remove", 95),
    "CH": ("CHIUSA", "channel", 82),
    "CTH": ("CONDOTTO", "conduit", 80),
    
    # Roots - Heat (Medieval K-spelling)
    "KAL": ("KALDA", "hot", 85),
    "K": ("K(ALDO)", "heat", 85),
    
    # Suffixes
    "DY": ("-DI", "of/measure", 95),
    "AL": ("-ALE", "final", 95),
    "IN": ("-IN", "inside", 95),
    "AM": ("-AMMA", "batch", 80),
    # NOTE(review): duplicate key - this value replaces the "OL" root entry
    # defined under "Roots - Materials" above.
    "OL": ("-OL(TO)", "done", 85),
    "R": ("-R(OU)", "out", 75),
    "Y": ("(-Y)", "active", 70),
    "EY": ("(-EY)", "ongoing", 70),
}

# ============================================================
# UNCONFIRMED/MECHANICAL (shown in italics)
# These need human review!
# ============================================================

# Tokens without an accepted reading, keyed by upper-case transcription token.
# The value is the placeholder that translate_word() wraps in asterisks
# (Markdown italics); every entry is also listed at the end of each generated
# report with a request for suggestions.
UNCONFIRMED = {
    # SH- cluster (rare in Romance)
    "SH": "?SH?",
    "SHE": "?SHE?",
    "SHEDY": "?SHEDY?",
    "SHEK": "?SHEK?",
    "SHEOR": "?SHEOR?",
    
    # AIN cluster (may be mechanical)
    "AIN": "?AIN?",
    "AIIN": "?AIIN?",
    "RAIN": "?RAIN?",
    
    # KEE intensifier (unclear)
    "KEE": "?KEE?",
    "KED": "?KED?",
    "KECH": "?KECH?",
    
    # Other problematic
    "PCH": "?PCH?",
    "OK": "?OK?",
    # "OT" is also tried as a word prefix by translate_word(), which therefore
    # leaves it out of the root search.
    "OT": "?OT?",
    "SAM": "?SAM?",
    "QOL": "?QOL?",
}


# Affixes in the order they are tried: "OT" precedes "O" and "EY" precedes
# "Y" so that the longer spelling is preferred.
_PREFIXES = ("QO", "OT", "D", "S", "L", "P", "T", "O")
_SUFFIXES = ("DY", "AL", "IN", "AM", "EY", "OL", "R", "Y")


def _render(key):
    """Return ``(display_text, needs_review)`` for a lexicon key."""
    if key in CONFIRMED:
        return CONFIRMED[key][0], False
    if key in UNCONFIRMED:
        return f"*{UNCONFIRMED[key]}*", True
    return f"*?{key}?*", True


def _first_root_in(text, root_keys):
    """Return the first key of *root_keys* that occurs anywhere in *text*, else None."""
    return next((key for key in root_keys if key in text), None)


def translate_word(word):
    """
    Translate a word showing:
    - CONFIRMED: normal text
    - UNCONFIRMED: in *asterisks* (markdown italics)

    The word is upper-cased, stripped of transcription punctuation and digits,
    and then read as ``[prefix] [root] [suffix]``.

    A prefix is only stripped when doing so does not destroy a longer lexicon
    root. Previously the prefix was always removed first, so entries that
    begin with a prefix letter could never match at the start of a word:
    "DAIIN" (confirmed) came out as ``DI/DA *?AIIN?* -IN`` and "SHEDY"
    (listed for review) came out as ``SI/SU -DI`` marked as confirmed.

    Args:
        word: Raw token from the transcription.

    Returns:
        ``None`` when fewer than two characters remain after cleaning;
        otherwise a dict with keys ``"voynich"`` (upper-cased, stripped input,
        punctuation retained), ``"translation"`` (space-joined readings) and
        ``"unconfirmed"`` (True when any part needs human review).
    """
    original = word.upper().strip()
    word = re.sub(r'[{}\[\]<>%$#@,;.\d]', '', original)

    if len(word) < 2:
        return None

    # Every lexicon key that is not an affix is a root candidate. The sort is
    # stable, so equally long keys keep their table order.
    affixes = set(_PREFIXES + _SUFFIXES)
    root_keys = sorted(
        (key for key in list(CONFIRMED) + list(UNCONFIRMED) if key not in affixes),
        key=len,
        reverse=True,
    )

    def root_length(prefix):
        """Length of the root that is still found once *prefix* is removed."""
        root = _first_root_in(word[len(prefix):], root_keys)
        return len(root) if root else 0

    # Candidate segmentations in order of preference: each matching prefix,
    # then "no prefix". max() returns the first maximal item, so the original
    # choice is kept whenever it does not cost a longer root.
    candidates = [pre for pre in _PREFIXES if word.startswith(pre)]
    candidates.append("")
    prefix = max(candidates, key=root_length)

    parts = []
    has_unconfirmed = False

    if prefix:
        text, needs_review = _render(prefix)
        parts.append(text)
        has_unconfirmed = has_unconfirmed or needs_review
    remaining = word[len(prefix):]

    root = _first_root_in(remaining, root_keys)
    if root is None:
        # Unconfirmed keys excluded from root_keys as affixes (currently only
        # "OT") may still occur inside the word; flag them as well.
        root = next((key for key in UNCONFIRMED if key in remaining), None)
    if root is not None:
        text, needs_review = _render(root)
        parts.append(text)
        has_unconfirmed = has_unconfirmed or needs_review

    # NOTE: the suffix is matched against the whole cleaned word, so it may
    # re-read letters that already belong to the prefix or root.
    for suf in _SUFFIXES:
        if word.endswith(suf) and suf in CONFIRMED:
            parts.append(CONFIRMED[suf][0])
            break

    if not parts:
        return {"voynich": original, "translation": f"*?{original}?*", "unconfirmed": True}

    return {
        "voynich": original,
        "translation": " ".join(parts),
        "unconfirmed": has_unconfirmed,
    }


def extract_folio_content(content, folio_id):
    """Collect the text lines that belong to one folio.

    Scanning starts after a line beginning with ``<folio_id>`` and stops at
    the next line that looks like another folio header (``<f`` + digits +
    ``r``/``v`` + ``>``). In between, every line beginning with ``<folio_id.``
    contributes the text that follows its first ``>``.

    Returns:
        The collected texts, stripped, in file order (empty list if the folio
        header never appears).
    """
    header_tag = f'<{folio_id}>'
    locus_tag = f'<{folio_id}.'

    collected = []
    inside = False
    for raw in content.split('\n'):
        stripped = raw.strip()

        # The folio's own header must be tested first: it also matches the
        # "next folio" pattern below.
        if stripped.startswith(header_tag):
            inside = True
            continue
        if not inside:
            continue
        if re.match(r'^<f\d+[rv]>', stripped):
            break
        if not stripped.startswith(locus_tag):
            continue

        payload = re.search(r'>\s*(.+)$', raw)
        if payload:
            collected.append(payload.group(1).strip())
    return collected


def extract_words(line):
    """Split a transcription line into word tokens.

    ``<...>`` and ``{...}`` spans are removed, the rest is split on runs of
    dots and whitespace, and tokens shorter than two characters are dropped.
    """
    without_tags = re.sub(r'<[^>]+>', '', line)
    without_braces = re.sub(r'\{[^}]+\}', '', without_tags)

    tokens = []
    for piece in re.split(r'[.\s]+', without_braces):
        if len(piece) < 2:
            continue
        cleaned = piece.strip()
        if cleaned:
            tokens.append(cleaned)
    return tokens


def generate_collaborative_output(folio_id):
    """Build the Markdown review report for one folio.

    Reads ``voynich_ZL3b.txt`` from the current working directory, extracts
    the folio's lines, and renders one table per non-empty line followed by
    summary statistics and the list of UNCONFIRMED patterns.

    Returns:
        The report as a single newline-joined string, or ``None`` when no
        lines were found for *folio_id*.
    """
    with open('voynich_ZL3b.txt', 'r', encoding='utf-8', errors='ignore') as f:
        content = f.read()

    folio_lines = extract_folio_content(content, folio_id)
    if not folio_lines:
        return None

    report = [
        f"# FOLIO {folio_id.upper()} - COLLABORATIVE TRANSLATION",
        "",
        "Legend:",
        "- Normal text = CONFIRMED translation (90%+ match)",
        "- *Italics* = NEEDS HUMAN REVIEW (unconfirmed/mechanical)",
        "",
        "---",
        "",
    ]

    confirmed_count = 0
    unconfirmed_count = 0

    for line_num, line in enumerate(folio_lines, 1):
        words = extract_words(line)
        if not words:
            continue

        report.extend([
            f"## Line {line_num}",
            "",
            f"**Original:** `{' '.join(words)}`",
            "",
            "| Voynich | Translation |",
            "|---------|-------------|",
        ])

        for result in map(translate_word, words):
            if not result:
                # Tokens too short to translate produce no table row.
                continue
            report.append(f"| {result['voynich']} | {result['translation']} |")
            if not result['unconfirmed']:
                confirmed_count += 1
            else:
                unconfirmed_count += 1

        report.append("")

    # Every rendered row was counted in exactly one of the two tallies.
    total_words = confirmed_count + unconfirmed_count

    def percent(count):
        """Whole-number share of *count* in the total (0 when nothing was rendered)."""
        return int(100*count/total_words) if total_words else 0

    # Summary
    report.extend([
        "---",
        "",
        "## Statistics",
        "",
        f"- Total words: {total_words}",
        f"- Confirmed: {confirmed_count} ({percent(confirmed_count)}%)",
        f"- **Needs Review: {unconfirmed_count}** ({percent(unconfirmed_count)}%)",
        "",
        "## Words Needing Human Input",
        "",
        "The following patterns appear frequently but lack confirmed translations:",
        "",
    ])
    report.extend(
        f"- **{k}**: {v} ← PLEASE SUGGEST MEANING" for k, v in UNCONFIRMED.items()
    )

    return "\n".join(report)


def main():
    """Write reports for the sample folios and print a preview of f78r.

    Reports go to ``collaborative/<folio>_COLLABORATIVE.md``; folios for which
    no report could be generated are skipped silently.
    """
    os.makedirs("collaborative", exist_ok=True)

    for folio_id in ["f78r", "f88r", "f4r"]:
        rendered = generate_collaborative_output(folio_id)
        if not rendered:
            continue
        filepath = f"collaborative/{folio_id}_COLLABORATIVE.md"
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(rendered)
        print(f"✓ Created: {filepath}")

    # Console preview: the first 60 lines of the f78r report.
    banner = "=" * 60
    print("\n" + banner)
    print("SAMPLE OUTPUT (f78r, first 10 lines):")
    print(banner)

    preview = generate_collaborative_output("f78r")
    if preview:
        head = preview.split('\n')[:60]
        print('\n'.join(head))
        print("\n[...continued in file...]")


# Run the sample export only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()
