import logging
import os
import sys
from pathlib import Path
from typing import Optional, List

import pandas as pd


######################################################################################################
# 0. Paths & Logging
######################################################################################################

# Base directory: the parent of the folder that contains this file
# (presumably the repository root, with this script living in `code/` — confirm).
BASE_DIR = Path(__file__).resolve().parents[1]
# Folder searched by _load_csv_or_excel() for the raw .csv / .xlsx inputs.
RAW_DATA_DIR = BASE_DIR / "raw_data"
# Folder that receives the generated CSV datasets.
DATA_DIR = BASE_DIR / "data"
# Folder that receives this module's log file.
LOGS_DIR = BASE_DIR / "code" / "logs"

# Import-time side effect: make sure the log folder exists before logging is set up.
LOGS_DIR.mkdir(parents=True, exist_ok=True)


def setup_logging(log_dir: Path, log_filename: str) -> Path:
    """
    Basic logging setup: writes both to a log file and to stdout.

    If the root logger has no handlers yet, it is configured at INFO level
    with a file handler for ``log_dir / log_filename`` and a stdout stream
    handler. If the root logger is already configured, only a file handler
    for the requested log file is attached, and only when no handler for
    that file exists yet. In that case the root logger's level and the
    other handlers are left untouched.

    Args:
        log_dir: Directory for the log file; created if missing.
        log_filename: Name of the log file inside ``log_dir``.

    Returns:
        ``log_dir / log_filename``, exactly as composed (not made absolute).
    """
    log_dir.mkdir(parents=True, exist_ok=True)
    log_path = log_dir / log_filename
    log_format = "%(asctime)s - %(levelname)s - %(message)s"
    root_logger = logging.getLogger()

    # Avoid duplicate handlers if this module is imported multiple times
    if not root_logger.handlers:
        logging.basicConfig(
            level=logging.INFO,
            format=log_format,
            handlers=[
                logging.FileHandler(log_path),
                logging.StreamHandler(sys.stdout),
            ],
        )
        return log_path

    # FileHandler.baseFilename is always stored as an absolute path, so the
    # requested path must be normalised the same way before comparing;
    # otherwise a relative log_dir never matches and every call would
    # attach one more handler (duplicated log lines).
    target = os.path.abspath(os.fspath(log_path))
    already_attached = any(
        isinstance(handler, logging.FileHandler)
        and getattr(handler, "baseFilename", None) == target
        for handler in root_logger.handlers
    )
    if not already_attached:
        file_handler = logging.FileHandler(log_path)
        # Use the same line format as the basicConfig branch so the file
        # content looks the same whichever branch created the handler.
        file_handler.setFormatter(logging.Formatter(log_format))
        root_logger.addHandler(file_handler)

    return log_path


# Import-time side effect: configure logging for this module and remember
# the path of its log file (the value returned by setup_logging).
LOG_FILE_PATH = setup_logging(LOGS_DIR, "raw_data_slicer.log")


######################################################################################################
# 1. Helper Functions
######################################################################################################

def _load_csv_or_excel(basename: str) -> Optional[pd.DataFrame]:
    """
    Read the dataset called `basename` from RAW_DATA_DIR.

    Lookup order:
    1. raw_data/<basename>.csv   (read with pandas.read_csv)
    2. raw_data/<basename>.xlsx  (read with pandas.read_excel)

    The CSV wins whenever it exists. When neither file is present a warning
    is logged and None is returned.
    """
    cleaned_csv = RAW_DATA_DIR / f"{basename}.csv"
    if cleaned_csv.exists():
        logging.info(f"Loading cleaned CSV: {cleaned_csv}")
        return pd.read_csv(cleaned_csv)

    workbook = RAW_DATA_DIR / f"{basename}.xlsx"
    if not workbook.exists():
        # Neither candidate is on disk: report it and let the caller decide.
        logging.warning(
            f"No file found for base name '{basename}' "
            f"(.csv or .xlsx) in {RAW_DATA_DIR}"
        )
        return None

    logging.info(f"Loading XLSX (no cleaned CSV found yet): {workbook}")
    return pd.read_excel(workbook)


def _clean_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Apply simple cleaning rules consistent with the README:
    - Keep only Micromixer rows in CHIP column.
    - Drop redundant columns: ID, ML, CHIP, OUTPUT (if present).
    - Drop rows that are completely empty.
    """
    df = df.copy()

    # Drop rows that are completely empty
    df.dropna(how="all", inplace=True)

    # Filter for Micromixer chip (if CHIP exists)
    if "CHIP" in df.columns:
        before = len(df)
        df = df[df["CHIP"].astype(str).str.lower() == "micromixer"].copy()
        after = len(df)
        logging.info(
            f"Filtering CHIP == 'Micromixer': kept {after} of {before} rows."
        )

    # Drop redundant columns if present
    redundant_cols = ["ID", "ML", "CHIP", "OUTPUT"]
    to_drop = [c for c in redundant_cols if c in df.columns]
    if to_drop:
        df.drop(columns=to_drop, inplace=True)
        logging.info(f"Dropped redundant columns: {to_drop}")

    return df


######################################################################################################
# 2. Core Processing Functions
######################################################################################################

def build_formulations_dataset() -> None:
    """
    Create `data/formulations.csv` from the raw formulation tables.

    Steps:
    - Load initial_formulations_raw and new_formulations_raw
      (a table that cannot be found is skipped)
    - Clean every loaded table with _clean_dataframe
    - Stack the cleaned tables row-wise (fresh 0..n-1 index) and save them

    When neither table could be loaded, an error is logged and no file is
    written. The output directory is created in either case.
    """
    DATA_DIR.mkdir(parents=True, exist_ok=True)

    source_names: List[str] = [
        "initial_formulations_raw",
        "new_formulations_raw",
    ]
    cleaned_frames = []

    for source_name in source_names:
        raw = _load_csv_or_excel(source_name)
        if raw is not None:
            n_rows, n_cols = raw.shape
            logging.info(
                f"Loaded {source_name} with {n_rows} rows and {n_cols} columns."
            )
            tidy = _clean_dataframe(raw)
            n_rows, n_cols = tidy.shape
            logging.info(
                f"After cleaning {source_name}: {n_rows} rows, "
                f"{n_cols} columns."
            )
            cleaned_frames.append(tidy)

    if cleaned_frames:
        combined = pd.concat(cleaned_frames, ignore_index=True)
        target = DATA_DIR / "formulations.csv"
        combined.to_csv(target, index=False)
        total_rows, total_cols = combined.shape
        logging.info(
            f"Saved combined formulations dataset to {target} "
            f"({total_rows} rows, {total_cols} columns)."
        )
    else:
        # Nothing to concatenate: report it instead of writing an empty file.
        logging.error(
            "No seed/extension datasets were loaded. "
            "Cannot build data/formulations.csv."
        )


def build_validation_dataset() -> None:
    """
    Create the wet-lab validation dataset `data/wet_lab_validation.csv`.

    Steps:
    - Load wet_lab_validation_raw
    - Clean it with _clean_dataframe
    - Save the result without the index column

    When the raw table cannot be found, a warning is logged and no file is
    written. The output directory is created in either case.
    """
    DATA_DIR.mkdir(parents=True, exist_ok=True)

    source_name = "wet_lab_validation_raw"
    raw = _load_csv_or_excel(source_name)

    if raw is not None:
        n_rows, n_cols = raw.shape
        logging.info(
            f"Loaded {source_name} with {n_rows} rows and {n_cols} columns."
        )
        tidy = _clean_dataframe(raw)
        target = DATA_DIR / "wet_lab_validation.csv"
        tidy.to_csv(target, index=False)
        n_rows, n_cols = tidy.shape
        logging.info(
            f"Saved wet-lab validation dataset to {target} "
            f"({n_rows} rows, {n_cols} columns)."
        )
    else:
        logging.warning(
            "No wet_lab_validation_raw dataset found. "
            "Skipping creation of wet_lab_validation.csv."
        )


######################################################################################################
# 3. Backwards-Compatible API Wrappers
######################################################################################################

def process_data(file_path: str, aggregation_column: str) -> None:
    """
    Backwards-compatible no-op kept so existing external calls do not break.

    The original version grouped by a CHIP-like column and wrote separate
    files. Preprocessing is now centralized in build_formulations_dataset()
    and build_validation_dataset():
      - filtering by CHIP
      - dropping redundant columns
      - consolidating into standard outputs.

    This function does no processing and writes no files: it only logs a
    message pointing callers to those two functions.

    Args:
        file_path: Accepted for compatibility; ignored.
        aggregation_column: Accepted for compatibility; ignored.
    """
    # Both parameters are deliberately unused; they remain in the signature
    # only so that older call sites keep working.
    notice = (
        "process_data() is now a thin wrapper. "
        "Use build_formulations_dataset() and build_validation_dataset() "
        "for the main functionality."
    )
    logging.info(notice)


def process_all_files(
    directory_path: str = str(RAW_DATA_DIR),
    aggregation_column: str = "CHIP",
) -> None:
    """
    Backwards-compatible entry point for the whole pipeline.

    Runs, in this order:
    - build_formulations_dataset()  -> data/formulations.csv
    - build_validation_dataset()    -> data/wet_lab_validation.csv

    Both parameters are accepted only for compatibility with older callers;
    neither is read by this function.
    """
    logging.info(
        "Starting consolidated dataset building: "
        "formulations.csv and wet_lab_validation.csv."
    )
    # Formulations first, validation second.
    for build_step in (build_formulations_dataset, build_validation_dataset):
        build_step()
    logging.info("Consolidated dataset building completed.")


######################################################################################################
# 4. Main
######################################################################################################

if __name__ == "__main__":
    # Script entry point: run the whole pipeline against RAW_DATA_DIR.
    # This is the top-level boundary, so any failure is logged and then
    # re-raised, which keeps the non-zero exit status and stderr traceback.
    try:
        logging.info("Starting data slicing / consolidation pipeline...")
        process_all_files(str(RAW_DATA_DIR), "CHIP")
        logging.info("...DATA SLICING / CONSOLIDATION DONE!\n\n")
    except Exception as e:
        # logging.exception logs at ERROR level and appends the traceback,
        # so the log file keeps the stack information that a plain
        # logging.error(...) call would drop.
        logging.exception(f"An unexpected error occurred in raw_data_slicer: {e}")
        raise
