# -*- coding: utf-8 -*-
"""
analysis_code.py

Reproducibility script for:
Global Evidence Mapping of Food Sensory Evaluation:
A Scoping Review of Web of Science Literature from 2009 to 2025

Purpose:
This script reproduces the main descriptive statistics, cross-tabulations,
chi-square tests, and Cramer's V results from the final coded dataset.

Important:
This script does NOT reproduce the manual screening and coding decisions.
Those decisions were made through the coder-checker procedure described in
the manuscript and are stored in coded_dataset.xlsx.

Required packages:
pandas, numpy, scipy, openpyxl

Run:
python analysis_code.py
"""

from pathlib import Path
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency


# ============================================================
# 1. File settings
# ============================================================

INPUT_FILE = Path("coded_dataset.xlsx")
if not INPUT_FILE.exists():
    # This fallback is only for local testing before renaming the file.
    fallback = Path("英文文献最终版2.0.xlsx")
    if fallback.exists():
        INPUT_FILE = fallback

OUTPUT_FILE = Path("analysis_outputs.xlsx")
MASTER_SHEET = "MASTER_双口径_含中文"


# ============================================================
# 2. Column names in the final coded dataset
# ============================================================

COL_TITLE = "题名"
COL_YEAR = "年份"
COL_INCLUDED = "全文纳入"
COL_EXCLUSION = "全文排除理由"

COL_COUNTRY = "研究国家(自动-待核实)"
COL_CONTINENT = "洲(自动)"
COL_FOOD = "食品类别(自动)"
COL_PURPOSE = "研究目的(自动)"
COL_METHOD = "方法家族(自动)"
COL_OUTCOME = "Outcome focus/指标侧重"


REQUIRED_COLUMNS = [
    COL_TITLE,
    COL_YEAR,
    COL_INCLUDED,
    COL_EXCLUSION,
    COL_COUNTRY,
    COL_CONTINENT,
    COL_FOOD,
    COL_PURPOSE,
    COL_METHOD,
    COL_OUTCOME,
]


# ============================================================
# 3. Helper functions
# ============================================================

def clean_label(x):
    """Standardize missing and text labels."""
    if pd.isna(x):
        return "Unknown"
    text = str(x).strip()
    if text == "" or text.lower() in {"nan", "none"}:
        return "Unknown"
    return text


def normalize_include_flag(x):
    """Normalize inclusion label."""
    if pd.isna(x):
        return ""
    return str(x).strip().upper()


def frequency_table(data, column):
    """Generate frequency and percentage table."""
    out = (
        data[column]
        .apply(clean_label)
        .value_counts()
        .reset_index()
    )
    out.columns = [column, "n"]
    out["percentage"] = out["n"] / len(data) * 100
    return out


def count_crosstab(data, row_var, col_var):
    """Generate count cross-tabulation."""
    return pd.crosstab(
        data[row_var].apply(clean_label),
        data[col_var].apply(clean_label),
    )


def row_percent_crosstab(data, row_var, col_var):
    """Generate row-percentage cross-tabulation."""
    tab = count_crosstab(data, row_var, col_var)
    return tab.div(tab.sum(axis=1), axis=0) * 100


def cramers_v_from_table(contingency_table):
    """Calculate chi-square test and Cramer's V from a contingency table."""
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    n = contingency_table.to_numpy().sum()
    rows, cols = contingency_table.shape
    denominator = n * min(rows - 1, cols - 1)

    if denominator == 0:
        v = np.nan
    else:
        v = np.sqrt(chi2 / denominator)

    return chi2, dof, p, v


def association_test(data, row_var, col_var, relationship_name):
    """Run chi-square test and Cramer's V for two categorical variables."""
    table = count_crosstab(data, row_var, col_var)
    chi2, dof, p, v = cramers_v_from_table(table)

    return {
        "Relationship": relationship_name,
        "n": int(table.to_numpy().sum()),
        "Chi-square": chi2,
        "df": dof,
        "p-value": p,
        "Cramer's V": v,
    }


def safe_sheet_name(name):
    """Excel sheet names must be <=31 characters."""
    return name[:31]


# ============================================================
# 4. Load data and validate structure
# ============================================================

df = pd.read_excel(INPUT_FILE, sheet_name=MASTER_SHEET)
df.columns = [str(c).strip() for c in df.columns]

missing_columns = [c for c in REQUIRED_COLUMNS if c not in df.columns]
if missing_columns:
    raise ValueError(
        "The following required columns are missing from the dataset: "
        + ", ".join(missing_columns)
    )


# ============================================================
# 5. Define records, included studies, and excluded records
# ============================================================

# Records entering eligibility assessment
records = df[df[COL_TITLE].notna()].copy()

# Final included studies
included = records[
    records[COL_INCLUDED].apply(normalize_include_flag).isin(["Y", "YES", "1", "TRUE"])
].copy()

# Excluded records after eligibility assessment
excluded = records[
    records[COL_INCLUDED].apply(normalize_include_flag).isin(["N", "NO", "0", "FALSE"])
].copy()


# ============================================================
# 6. Validation checks for key manuscript counts
# ============================================================

EXPECTED_RECORDS = 1006
EXPECTED_INCLUDED = 954
EXPECTED_EXCLUDED = 52

if len(records) != EXPECTED_RECORDS:
    raise ValueError(
        f"Validation failed: expected {EXPECTED_RECORDS} records entering eligibility "
        f"assessment, but found {len(records)}."
    )

if len(included) != EXPECTED_INCLUDED:
    raise ValueError(
        f"Validation failed: expected {EXPECTED_INCLUDED} final included studies, "
        f"but found {len(included)}."
    )

if len(excluded) != EXPECTED_EXCLUDED:
    raise ValueError(
        f"Validation failed: expected {EXPECTED_EXCLUDED} excluded records after "
        f"eligibility assessment, but found {len(excluded)}."
    )


# ============================================================
# 7. Overview statistics
# ============================================================

overview = pd.DataFrame({
    "Metric": [
        "Records entering eligibility assessment",
        "Final included studies",
        "Excluded records after eligibility assessment",
        "Minimum publication year",
        "Maximum publication year",
        "Number of research countries",
        "Number of continents",
        "Number of food categories",
        "Number of research purpose categories",
        "Number of sensory method families",
        "Number of outcome focus categories",
    ],
    "Value": [
        len(records),
        len(included),
        len(excluded),
        int(included[COL_YEAR].min()),
        int(included[COL_YEAR].max()),
        included[COL_COUNTRY].apply(clean_label).nunique(),
        included[COL_CONTINENT].apply(clean_label).nunique(),
        included[COL_FOOD].apply(clean_label).nunique(),
        included[COL_PURPOSE].apply(clean_label).nunique(),
        included[COL_METHOD].apply(clean_label).nunique(),
        included[COL_OUTCOME].apply(clean_label).nunique(),
    ],
})


# ============================================================
# 8. Single-variable summaries
# ============================================================

year_summary = (
    included[COL_YEAR]
    .astype(int)
    .value_counts()
    .sort_index()
    .reset_index()
)
year_summary.columns = ["Year", "n"]
year_summary["percentage"] = year_summary["n"] / len(included) * 100

country_summary = frequency_table(included, COL_COUNTRY)
continent_summary = frequency_table(included, COL_CONTINENT)
food_summary = frequency_table(included, COL_FOOD)
purpose_summary = frequency_table(included, COL_PURPOSE)
method_summary = frequency_table(included, COL_METHOD)
outcome_summary = frequency_table(included, COL_OUTCOME)

excluded_summary = frequency_table(excluded, COL_EXCLUSION)


# ============================================================
# 9. Cross-tabulations
# ============================================================

cross_tabs = {
    "Continent_Food_count": count_crosstab(included, COL_CONTINENT, COL_FOOD),
    "Continent_Food_percent": row_percent_crosstab(included, COL_CONTINENT, COL_FOOD),

    "Continent_Method_count": count_crosstab(included, COL_CONTINENT, COL_METHOD),
    "Continent_Method_percent": row_percent_crosstab(included, COL_CONTINENT, COL_METHOD),

    "Continent_Outcome_count": count_crosstab(included, COL_CONTINENT, COL_OUTCOME),
    "Continent_Outcome_percent": row_percent_crosstab(included, COL_CONTINENT, COL_OUTCOME),

    "Food_Method_count": count_crosstab(included, COL_FOOD, COL_METHOD),
    "Food_Method_percent": row_percent_crosstab(included, COL_FOOD, COL_METHOD),

    "Food_Outcome_count": count_crosstab(included, COL_FOOD, COL_OUTCOME),
    "Food_Outcome_percent": row_percent_crosstab(included, COL_FOOD, COL_OUTCOME),

    "Method_Outcome_count": count_crosstab(included, COL_METHOD, COL_OUTCOME),
    "Method_Outcome_percent": row_percent_crosstab(included, COL_METHOD, COL_OUTCOME),
}


# ============================================================
# 10. Chi-square tests and Cramer's V
# ============================================================

test_pairs = [
    (COL_CONTINENT, COL_FOOD, "Continent × Food category"),
    (COL_CONTINENT, COL_METHOD, "Continent × Method family"),
    (COL_CONTINENT, COL_OUTCOME, "Continent × Outcome focus"),
    (COL_FOOD, COL_METHOD, "Food category × Method family"),
    (COL_FOOD, COL_OUTCOME, "Food category × Outcome focus"),
    (COL_METHOD, COL_OUTCOME, "Method family × Outcome focus"),
    (COL_FOOD, COL_PURPOSE, "Food category × Research purpose"),
    (COL_PURPOSE, COL_METHOD, "Research purpose × Method family"),
    (COL_PURPOSE, COL_OUTCOME, "Research purpose × Outcome focus"),
]

association_results = pd.DataFrame([
    association_test(included, row_var, col_var, name)
    for row_var, col_var, name in test_pairs
])


# ============================================================
# 11. Export outputs
# ============================================================

with pd.ExcelWriter(OUTPUT_FILE, engine="openpyxl") as writer:
    overview.to_excel(writer, sheet_name="Overview", index=False)
    year_summary.to_excel(writer, sheet_name="Year_summary", index=False)
    country_summary.to_excel(writer, sheet_name="Country_summary", index=False)
    continent_summary.to_excel(writer, sheet_name="Continent_summary", index=False)
    food_summary.to_excel(writer, sheet_name="Food_summary", index=False)
    purpose_summary.to_excel(writer, sheet_name="Purpose_summary", index=False)
    method_summary.to_excel(writer, sheet_name="Method_summary", index=False)
    outcome_summary.to_excel(writer, sheet_name="Outcome_summary", index=False)
    excluded_summary.to_excel(writer, sheet_name="Excluded_summary", index=False)
    association_results.to_excel(writer, sheet_name="Association_tests", index=False)

    for name, table in cross_tabs.items():
        table.to_excel(writer, sheet_name=safe_sheet_name(name))


print("Analysis completed successfully.")
print(f"Input file: {INPUT_FILE}")
print(f"Records entering eligibility assessment: {len(records)}")
print(f"Final included studies: {len(included)}")
print(f"Excluded records after eligibility assessment: {len(excluded)}")
print(f"Output file saved as: {OUTPUT_FILE}")