import os
import sys
import logging
import json
from pathlib import Path
from typing import Dict, Set, Any
import numpy as np
import pandas as pd
from rapidfuzz import process, fuzz
import re


######################################################################################################
# 0. Paths & Logging
######################################################################################################

# Project root: the parent of the directory that contains this file.
BASE_DIR = Path(__file__).resolve().parents[1]
# Folder scanned for raw .xlsx inputs and used as the destination for cleaned CSVs.
RAW_DATA_DIR = BASE_DIR / "raw_data"
# Folder holding the JSON configuration (name mappings and feature bounds).
METADATA_DIR = BASE_DIR / "metadata"
# Folder that receives this module's log file.
LOGS_DIR = BASE_DIR / "code" / "logs"

# Import-time side effect: make sure the log folder exists before logging is set up.
LOGS_DIR.mkdir(parents=True, exist_ok=True)


def setup_logging(log_dir: Path, log_filename: str) -> Path:
    """
    Configure root logging so records go to a log file (and stdout on first setup).

    If the root logger has no handlers yet, it is configured at INFO level with
    a file handler and a stdout stream handler. If logging was already
    configured elsewhere, only a file handler for the requested file is added,
    and only when no existing file handler already targets that file.

    Args:
        log_dir (Path): Directory that holds the log file; created if missing.
        log_filename (str): Name of the log file inside ``log_dir``.

    Returns:
        Path: ``log_dir / log_filename``.
    """
    log_dir.mkdir(parents=True, exist_ok=True)
    log_path = log_dir / log_filename
    log_format = "%(asctime)s - %(levelname)s - %(message)s"
    root_logger = logging.getLogger()

    # First configuration in this process: install both handlers in one go.
    if not root_logger.handlers:
        logging.basicConfig(
            level=logging.INFO,
            format=log_format,
            handlers=[
                logging.FileHandler(log_path),
                logging.StreamHandler(sys.stdout),
            ],
        )
        return log_path

    # Logging is already configured: attach a file handler only if none of the
    # existing ones writes to the same file. FileHandler stores its target as an
    # absolute path, so the comparison must use the absolute form as well;
    # otherwise a relative log_dir would add a duplicate handler on every call.
    target = os.path.abspath(log_path)
    attached_files = {
        getattr(handler, "baseFilename", None)
        for handler in root_logger.handlers
        if isinstance(handler, logging.FileHandler)
    }
    if target not in attached_files:
        file_handler = logging.FileHandler(log_path)
        # Use the same layout as the first-time configuration above.
        file_handler.setFormatter(logging.Formatter(log_format))
        root_logger.addHandler(file_handler)

    return log_path


# Import-time side effect: configure logging. The returned path is what
# run_pipeline later hands to process_all_files to detect already-processed files.
LOG_FILE_PATH = setup_logging(LOGS_DIR, "raw_data_checker.log")
# Module-wide running total of modifications (column renames and replaced cell
# values); incremented by rename_columns and validate_feature_values.
change_counter = 0


######################################################################################################
# 1. Load Files and Configurations
######################################################################################################

def load_file(file_path: Path) -> pd.DataFrame:
    """
    Read a tabular file into a DataFrame, choosing the reader by file extension.

    The extension is compared case-insensitively; ``.xlsx`` is read with
    ``pd.read_excel`` and ``.csv`` with ``pd.read_csv``.

    Args:
        file_path (Path): Location of the file to read.

    Returns:
        pd.DataFrame: The parsed contents of the file.

    Raises:
        ValueError: If the extension is neither ``.xlsx`` nor ``.csv``.
        Exception: Any error raised while reading is logged and re-raised.
    """
    readers = {".xlsx": pd.read_excel, ".csv": pd.read_csv}
    try:
        reader = readers.get(file_path.suffix.lower())
        if reader is None:
            raise ValueError(f"Unsupported file format: {file_path}")
        return reader(file_path)
    except Exception as exc:
        # Record the failure for the log file, then let the caller decide.
        logging.error(f"Error loading file {file_path}: {exc}")
        raise


def load_reference_data(file_path: Path) -> Set[str]:
    """
    Load a reference file and return the set of its column names.

    The file is read through ``load_file``, so the same extensions are
    supported and the same loading errors can propagate.

    Args:
        file_path (Path): The path to the reference file to be loaded.

    Returns:
        Set[str]: The column labels of the loaded reference file.

    Raises:
        FileNotFoundError: If the reference file does not exist.
        ValueError: If the loaded object does not expose a ``columns``
            attribute (or the file format is unsupported).
        Exception: Any other loading error, logged and re-raised.
    """
    try:
        df_ref = load_file(file_path)
        if df_ref is None or not hasattr(df_ref, "columns"):
            # Not logged here: the ValueError handler below records the
            # message, so the failure appears in the log exactly once.
            raise ValueError(
                "Loaded reference data is not a valid DataFrame or contains no columns."
            )
        return set(df_ref.columns)
    except FileNotFoundError as e:
        logging.error(f"Reference file not found: {e}")
        raise
    except ValueError as e:
        logging.error(f"Reference data error: {e}")
        raise
    except Exception as e:
        logging.error(f"Unexpected error when loading reference data: {e}")
        raise


def load_feature_bounds(file_path: Path) -> Dict[str, Any]:
    """
    Load feature bounds from a JSON file.

    A missing file is tolerated: a warning is logged and an empty dict is
    returned, which means no bounds are enforced.

    Args:
        file_path (Path): The path to the feature bounds JSON file.

    Returns:
        Dict[str, Any]: Mapping of feature name to its bounds specification,
            or ``{}`` when the file does not exist.

    Raises:
        ValueError: If the JSON document is not an object at the top level.
        Exception: Any other read/parse error, logged and re-raised.
    """
    try:
        # Decode explicitly as UTF-8 (tolerating a BOM) instead of the platform
        # default, so non-ASCII feature names load the same on every OS.
        with open(file_path, "r", encoding="utf-8-sig") as f:
            bounds = json.load(f)
    except FileNotFoundError:
        logging.warning(
            f"Feature bounds file not found at {file_path}. "
            "No numeric/categorical bounds will be enforced."
        )
        return {}
    except Exception as e:
        logging.error(f"Error loading feature bounds from {file_path}: {e}")
        raise

    # Callers use dict methods (.keys(), .items()); fail here with a clear
    # message rather than later with an AttributeError.
    if not isinstance(bounds, dict):
        message = (
            f"Feature bounds file {file_path} must contain a JSON object, "
            f"got {type(bounds).__name__}."
        )
        logging.error(message)
        raise ValueError(message)
    return bounds


def load_name_mappings(file_path: Path) -> Dict[str, str]:
    """
    Load feature (column) name mappings from JSON.

    A missing file is tolerated: a warning is logged and an empty dict is
    returned, which means no custom renames are applied.

    Args:
        file_path (Path): The path to the feature name mapping JSON file.

    Returns:
        Dict[str, str]: A dictionary {old_name: new_name}, or ``{}`` when the
            file does not exist.

    Raises:
        ValueError: If the JSON document is not an object at the top level.
        Exception: Any other read/parse error, logged and re-raised.
    """
    try:
        # Decode explicitly as UTF-8 (tolerating a BOM) instead of the platform
        # default, so non-ASCII column names load the same on every OS.
        with open(file_path, "r", encoding="utf-8-sig") as f:
            mappings = json.load(f)
    except FileNotFoundError:
        logging.warning(
            f"Feature name mapping file not found at {file_path}. "
            "No custom renames will be applied."
        )
        return {}
    except Exception as e:
        logging.error(f"Error loading feature name mappings from {file_path}: {e}")
        raise

    # Callers use dict methods (.items(), .values()); fail here with a clear
    # message rather than later with an AttributeError.
    if not isinstance(mappings, dict):
        message = (
            f"Feature name mapping file {file_path} must contain a JSON object, "
            f"got {type(mappings).__name__}."
        )
        logging.error(message)
        raise ValueError(message)
    return mappings


# Import-time side effect: load the rename map and the feature bounds from metadata/.
# NOTE(review): the file names loaded below ("features_names_mappings.json",
# "features_bounds.json") differ from the names this comment previously cited
# from README.md ("feature_name_mappings.json", "feature_bounds.json").
# Confirm which spelling actually exists in metadata/: a missing file only logs
# a warning and yields {}, which disables custom renames / bounds checks.
CUSTOM_RENAMES = load_name_mappings(METADATA_DIR / "features_names_mappings.json")
FEATURE_BOUNDS = load_feature_bounds(METADATA_DIR / "features_bounds.json")

# Reasonable default "reference columns" for fuzzy matching:
# union of all known feature-bound keys and all "canonical" names in mappings.
# Empty when both files are missing, in which case fuzzy matching is skipped.
REQUIRED_COLUMNS: Set[str] = set(FEATURE_BOUNDS.keys()) | set(CUSTOM_RENAMES.values())


######################################################################################################
# 2. Column Processing Functions
######################################################################################################

def find_fuzzy_column_matches(
    df_new: pd.DataFrame, score_cutoff: float = 85
) -> Dict[str, str]:
    """
    Find columns whose names fuzzily match a canonical name and need renaming.

    Each string column label that is not already canonical is scored against
    REQUIRED_COLUMNS with ``fuzz.partial_ratio``; the best candidate is
    accepted when its score reaches ``score_cutoff``.

    A proposed rename is dropped (with a warning) when its target is already a
    column of ``df_new`` or was already claimed by an earlier column, because
    applying it would create duplicate column labels.

    Args:
        df_new (pd.DataFrame): The DataFrame whose column labels are examined.
        score_cutoff (float): Minimum score for a match to be accepted.
            Defaults to 85.

    Returns:
        Dict[str, str]: {column_to_rename: best_match_target_name}
    """
    if not REQUIRED_COLUMNS:
        return {}

    # Sorted list instead of the raw set so that equally scored candidates are
    # considered in the same order on every run.
    candidates = sorted(name for name in REQUIRED_COLUMNS if isinstance(name, str))
    existing = set(df_new.columns)
    claimed: Set[str] = set()

    matches: Dict[str, str] = {}
    for col in df_new.columns:
        # Only string labels can be scored, and a label that is already
        # canonical must never be re-pointed at a similar canonical name.
        if not isinstance(col, str) or col in REQUIRED_COLUMNS:
            continue

        best_match = process.extractOne(col, candidates, scorer=fuzz.partial_ratio)
        if not best_match or best_match[1] < score_cutoff:
            continue

        target = best_match[0]
        if target in existing or target in claimed:
            logging.warning(
                f"Ignoring fuzzy match '{col}' -> '{target}': "
                f"a column named '{target}' is already present or already matched."
            )
            continue

        matches[col] = target
        claimed.add(target)
    return matches


def _rename_if_target_free(df: pd.DataFrame, old: Any, new: Any) -> bool:
    """Rename column ``old`` to ``new`` in place unless ``new`` already exists."""
    if new in df.columns:
        # Renaming onto an existing label would produce duplicate columns,
        # which breaks later per-column selection and assignment.
        logging.warning(
            f"Not renaming '{old}' to '{new}': a column named '{new}' already exists."
        )
        return False
    df.rename(columns={old: new}, inplace=True)
    return True


def rename_columns(df_new: pd.DataFrame) -> None:
    """
    Standardize the column names of ``df_new`` in place.

    Three passes are applied in order:
    1. explicit renames from CUSTOM_RENAMES,
    2. fuzzy matches against REQUIRED_COLUMNS,
    3. case-insensitive exact matches against REQUIRED_COLUMNS.

    A rename is skipped (with a warning) when the target name is already a
    column of the frame. Every rename performed increments the module-level
    ``change_counter``.

    Args:
        df_new (pd.DataFrame): The DataFrame whose columns are renamed in place.
    """
    global change_counter

    # 1) Custom renames based on metadata mappings
    for col, new_name in CUSTOM_RENAMES.items():
        if new_name == col or col not in df_new.columns:
            continue
        if _rename_if_target_free(df_new, col, new_name):
            logging.warning(
                f"Renamed '{col}' to '{new_name}' based on custom renaming rules."
            )
            change_counter += 1

    # 2) Fuzzy matches to known REQUIRED_COLUMNS
    for old_col, new_col in find_fuzzy_column_matches(df_new).items():
        if old_col == new_col or old_col not in df_new.columns:
            continue
        if _rename_if_target_free(df_new, old_col, new_col):
            logging.warning(
                f"Renaming '{old_col}' to '{new_col}' based on fuzzy matching."
            )
            change_counter += 1

    # 3) Case-insensitive exact matches to canonical names
    #    (e.g., "size" -> "SIZE")
    # One lookup table built up front; sorted so the winner is stable when two
    # canonical names differ only by case.
    canonical_by_lower = {
        ref.lower(): ref
        for ref in sorted(r for r in REQUIRED_COLUMNS if isinstance(r, str))
    }
    for col in list(df_new.columns):
        # Non-string labels (e.g. numeric Excel headers) have no .lower();
        # labels that are already canonical need no case fix.
        if not isinstance(col, str) or col in REQUIRED_COLUMNS:
            continue
        ref_col = canonical_by_lower.get(col.lower())
        if ref_col is None:
            continue
        if _rename_if_target_free(df_new, col, ref_col):
            logging.info(f"Standardizing column '{col}' -> '{ref_col}' (case fix).")
            change_counter += 1


######################################################################################################
# 3. Data Validation Functions
######################################################################################################

def _replace_out_of_range(
    df: pd.DataFrame, feature: str, min_val: float, max_val: float
) -> int:
    """Set values of ``feature`` outside [min_val, max_val] to NaN; return the count."""
    column = df[feature]
    out_of_bounds = (column < min_val) | (column > max_val)
    n_replaced = int(out_of_bounds.sum())
    if n_replaced == 0:
        return 0

    logging.warning(
        f"{feature}: Replacing out-of-bounds values with NaN "
        f"(allowed range: [{min_val}, {max_val}])."
    )
    # A NumPy integer column cannot hold NaN. Convert it to float explicitly
    # rather than relying on implicit upcasting during assignment, which
    # pandas has deprecated.
    if isinstance(column.dtype, np.dtype) and column.dtype.kind in "iu":
        df[feature] = column.astype("float64")
    df.loc[out_of_bounds, feature] = np.nan
    return n_replaced


def _fix_invalid_categories(df: pd.DataFrame, feature: str, valid_values: list) -> int:
    """Fuzzy-correct or blank out values of ``feature`` not in ``valid_values``; return the count."""
    # Snapshot of the column so the masks below are unaffected by the writes.
    column = df[feature].copy()
    invalid = column.notna() & ~column.isin(valid_values)

    n_changed = 0
    # One fuzzy lookup per distinct invalid value rather than one per row.
    for value in column[invalid].unique():
        match = process.extractOne(
            str(value), valid_values, scorer=fuzz.partial_ratio
        )
        if match and match[1] >= 85:
            replacement = match[0]
            logging.warning(
                f"{feature}: Replacing invalid '{value}' "
                f"with '{replacement}' (fuzzy match)."
            )
        else:
            replacement = np.nan
            logging.warning(
                f"{feature}: '{value}' is invalid and cannot be "
                f"matched; replacing with NaN."
            )

        # Boolean-mask assignment touches exactly the offending rows, even if
        # the index contains repeated labels.
        rows = invalid & (column == value)
        df.loc[rows, feature] = replacement
        n_changed += int(rows.sum())
    return n_changed


def validate_feature_values(df_new: pd.DataFrame) -> None:
    """
    Apply numeric and categorical bounds from FEATURE_BOUNDS to ``df_new`` in place.

    Numeric: [min, max] -> out of bounds -> NaN
    Categorical: list[str] -> fuzzy-correct invalid values or set to NaN.

    Features absent from the frame and bounds in any other shape are skipped.
    Each modified cell increments the module-level ``change_counter``.

    NOTE(review): the numeric check compares the column directly with the
    bounds, so it presumably expects numeric data; confirm how columns mixing
    text and numbers should be handled.

    Args:
        df_new (pd.DataFrame): The DataFrame to validate; modified in place.
    """
    global change_counter

    for feature, bounds in FEATURE_BOUNDS.items():
        if feature not in df_new.columns or not isinstance(bounds, list):
            continue

        # Numeric feature bounds: [min, max]
        if len(bounds) == 2 and all(isinstance(x, (int, float)) for x in bounds):
            min_val, max_val = bounds
            change_counter += _replace_out_of_range(df_new, feature, min_val, max_val)

        # Categorical feature bounds: list[str]
        elif all(isinstance(x, str) for x in bounds):
            change_counter += _fix_invalid_categories(df_new, feature, bounds)


######################################################################################################
# 4. File Processing Functions
######################################################################################################

def read_log_file(file_path: Path) -> str:
    """
    Return the full text of the log file, or an empty string if it is absent.

    Args:
        file_path (Path): Location of the log file.

    Returns:
        str: Everything currently written to the log file; ``""`` when the
            file does not exist yet.
    """
    try:
        handle = open(file_path, "r")
    except FileNotFoundError:
        # Nothing has been logged yet (e.g. very first run of the script).
        return ""
    with handle:
        return handle.read()


def get_processed_files(file_path: Path) -> Set[str]:
    """
    Extract filenames that have already been processed from the log file.

    A file counts as processed when the log contains its
    "---> Processing <name>..." marker line.

    Args:
        file_path (Path): The path to the log file.

    Returns:
        Set[str]: A set of filenames (e.g., 'initial_formulations_raw.xlsx')
                  that have already been processed, spelled as they were logged.
    """
    log_content = read_log_file(file_path)
    # Look for lines like: "---> Processing initial_formulations_raw.xlsx..."
    # The extension is matched case-insensitively so a file logged as
    # "NAME.XLSX" is recognised too instead of being re-processed on every run.
    pattern = r"---> Processing (.*?\.(?i:xlsx))\.\.\."
    return set(re.findall(pattern, log_content))


def process_file(file_path: Path) -> None:
    """
    Clean one raw file and write the result as CSV.

    The file is loaded, its columns are standardized, its values are checked
    against the feature bounds, and the result is written to
    ``RAW_DATA_DIR / "<stem>.csv"`` (always RAW_DATA_DIR, whatever folder
    ``file_path`` lives in). Files that load as empty are skipped with an
    error log entry.

    Args:
        file_path (Path): The path to the file to be processed.
    """
    frame = load_file(file_path)
    unusable = frame is None or frame.empty
    if unusable:
        logging.error(f"Skipping {file_path} due to loading failure or empty content.")
        return

    # This marker line is what get_processed_files later searches the log for.
    logging.info(f"---> Processing {file_path.name}...")
    for cleaning_step in (rename_columns, validate_feature_values):
        cleaning_step(frame)

    # Same base name as the input, with a .csv extension.
    destination = RAW_DATA_DIR / f"{file_path.stem}.csv"
    frame.to_csv(destination, index=False)
    logging.info(f"Processed data saved to {destination}")


def process_all_files(folder_path: Path, log_file_path: Path) -> None:
    """
    Process all .xlsx files in the folder, avoiding reprocessing files already processed.

    Files are handled in sorted order. Excel's temporary lock files (names
    starting with "~$") are ignored. After the last file, the running total of
    changes is logged.

    Args:
        folder_path (Path): The folder path where the files are located.
        log_file_path (Path): The path to the log file to check already processed files.
    """
    # Which files have already been processed according to the log?
    processed_files = get_processed_files(log_file_path)

    # All .xlsx raw files in the folder that still need processing.
    # "~$name.xlsx" files are lock files Excel creates while a workbook is
    # open; they match the glob but are not readable workbooks.
    # Sorted so the processing (and log) order does not depend on the filesystem.
    files_to_process = sorted(
        f
        for f in folder_path.glob("*.xlsx")
        if not f.name.startswith("~$") and f.name not in processed_files
    )

    if not files_to_process:
        logging.info("No new .xlsx raw files to process.")
        return

    for file in files_to_process:
        process_file(file)

    logging.info(f"Total changes made across all files: {change_counter}")


######################################################################################################
# 5. Pipeline Execution
######################################################################################################

def run_pipeline() -> None:
    """
    Main function to execute the data-checking pipeline.

    Logs a start banner, runs process_all_files on RAW_DATA_DIR using
    LOG_FILE_PATH to decide which .xlsx files are new, then logs an end banner.
    Column name standardization, feature-bound checks and the CSV export
    happen inside that call.
    """
    logging.info("CHECKING RAW DATA...")
    process_all_files(RAW_DATA_DIR, LOG_FILE_PATH)
    # Trailing blank lines visually separate consecutive runs in the log.
    logging.info("...RAW DATA CHECK DONE!\n\n")


# Script entry point: run the pipeline only when executed directly, not on import.
if __name__ == "__main__":
    try:
        run_pipeline()
    except Exception as e:
        # Record the failure in the log, then re-raise so the process still
        # terminates with the original exception.
        logging.error(f"An unexpected error occurred in raw_data_checker: {e}")
        raise
