from pathlib import Path
from datetime import datetime
from typing import Dict, List, Optional, Set, Tuple, Union
import logging

import click
import pandas as pd
import re
import yaml
import json

# Root logging setup: timestamped records, INFO level and above by default.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Named logger used by the validation helpers and the CLI entry point.
logger = logging.getLogger('data_validator')


def load_config(config_path: Path) -> Dict:
    """
    Load configuration from a YAML file.

    The file is decoded as UTF-8 explicitly so that non-ASCII text in the
    configuration is read identically on every platform instead of depending
    on the locale's default encoding.

    Parameters
    ----------
    config_path : Path
        Path to the configuration YAML file

    Returns
    -------
    Dict
        Configuration dictionary; an empty dictionary when the file contains
        no YAML document.
    """
    with open(config_path, 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)

    # safe_load yields None for an empty file; honour the declared return type.
    return {} if config is None else config

def log_validation_issue(errors: List[Dict], row_idx: int, column: str, message: str,
                         severity: str = "ERROR") -> None:
    """
    Record a validation issue and echo it through the module logger.

    Parameters
    ----------
    errors : List[Dict]
        Collector list; a dict with the keys ``row``, ``column``, ``message``
        and ``severity`` is appended to it
    row_idx : int
        Index of the row the issue refers to
    column : str
        Name of the column the issue refers to
    message : str
        Human-readable description of the issue
    severity : str
        Severity level (ERROR, WARNING, INFO)
    """
    errors.append(dict(row=row_idx, column=column, message=message, severity=severity))

    # Pick the logger method named after the severity ("WARNING" -> logger.warning);
    # a severity with no matching logger attribute falls back to logger.error.
    emit = getattr(logger, severity.lower(), logger.error)
    emit(f"Row {row_idx}: {column} - {message}")
    

def validate_lead_level(df: pd.DataFrame, config: Dict, errors: List[Dict]) -> None:
    """
    Validate lead level values.

    The column name is read from ``config['orig_lead_column']``; a literal
    backslash-n in the configured name is converted to a real newline before
    the lookup. For every cell of that column:

    * an empty cell is reported as a WARNING;
    * ``"Not Tested"`` and any value containing ``<`` are accepted as-is;
    * any other value must end with ``ug/ft²`` or ``µg/ft²`` and its numeric
      part (thousands separators allowed) must parse as a float;
    * a negative number is an ERROR, a number above 250 is a WARNING.

    Each message carries the value found in the dataframe's second column for
    the same row, labelled "Orig ID" ("N/A" when the dataframe has fewer than
    two columns).

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe to validate
    config : Dict
        Configuration dictionary
    errors : List[Dict]
        List to append errors to
    """
    lead_col = config['orig_lead_column'].replace('\\n', '\n')
    if lead_col not in df.columns:
        log_validation_issue(errors, -1, lead_col, f"{lead_col} column not found", "WARNING")
        return

    # Both spellings of the unit are accepted; the cleanup below strips both.
    unit_suffixes = ('ug/ft²', 'µg/ft²')
    has_id_column = df.shape[1] > 1

    def orig_id(position: int):
        # iloc is positional, so it must be fed the row position rather than
        # the index label (the two differ for any non-default index).
        return df.iloc[position, 1] if has_id_column else "N/A"

    for position, (idx, value) in enumerate(df[lead_col].items()):
        if pd.isna(value):
            log_validation_issue(errors, idx, lead_col, f"Empty lead level value (Orig ID: {orig_id(position)})", "WARNING")
            continue

        value = str(value).strip()
        if value == "Not Tested" or '<' in value:
            continue

        if not value.endswith(unit_suffixes):
            log_validation_issue(errors, idx, lead_col, f"Invalid lead level format: {value} (Orig ID: {orig_id(position)})")
            continue

        clean_value = value.replace('µg/ft²', '').replace('ug/ft²', '').replace(',', '').strip()
        try:
            lead_value = float(clean_value)
        except ValueError:
            log_validation_issue(errors, idx, lead_col, f"Could not extract numeric value from {value} (Orig ID: {orig_id(position)})")
            continue

        if lead_value < 0:
            log_validation_issue(errors, idx, lead_col, f"Negative lead level value: {lead_value} (Orig ID: {orig_id(position)})", "ERROR")
        elif lead_value > 250:
            log_validation_issue(errors, idx, lead_col, f"Lead level exceeds 250 µg/ft². Consider re-checking the test: {lead_value} (Orig ID: {orig_id(position)})", "WARNING")


def validate_lead_locations(df: pd.DataFrame, config: Dict, errors: List[Dict]) -> None:
    """
    Validate lead detection locations.

    Two columns are checked, named by ``config['lead_locations_column']`` and
    ``config['peak_sample_column']`` (a literal backslash-n in a configured
    name is converted to a real newline). If either column is missing a
    WARNING is recorded and nothing else is checked.

    For every row:

    * the locations cell is split on commas and each entry is stripped; every
      non-empty entry other than ``"Not Tested"`` must be a key of
      ``config['valid_locations_and_epa_limits']``, either as written or with
      its last character removed;
    * the peak cell, stripped of surrounding whitespace, must appear in the
      row's locations list either as written or with an ``s`` appended. The
      check is skipped when the peak cell is empty or ``"Not Tested"``, or
      when the row has no locations.

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe to validate
    config : Dict
        Configuration dictionary
    errors : List[Dict]
        List to append errors to
    """
    locations_col = config['lead_locations_column'].replace('\\n', '\n')
    peak_col = config['peak_sample_column'].replace('\\n', '\n')
    valid_locations = set(config['valid_locations_and_epa_limits'].keys())

    for required_col in (locations_col, peak_col):
        if required_col not in df.columns:
            log_validation_issue(errors, -1, required_col, f"{required_col} column not found", "WARNING")
            return

    # Column-wise iteration keeps each cell's own dtype (iterrows would coerce
    # every row to a single common dtype).
    for idx, raw_locations, raw_peak in zip(df.index, df[locations_col], df[peak_col]):
        if pd.notna(raw_locations):
            locations = [loc.strip() for loc in str(raw_locations).split(',')]
        else:
            locations = []

        # Check if all locations are valid
        for loc in locations:
            if not loc or loc == 'Not Tested':
                continue
            # One trailing character is tolerated (presumably a plural "s").
            if loc not in valid_locations and loc[:-1] not in valid_locations:
                log_validation_issue(errors, idx, locations_col, f"Invalid location: {loc}")

        # Check if peak location is in the list of locations
        if pd.isna(raw_peak) or not raw_peak or not locations:
            continue

        # Normalise like the locations above: text form, surrounding
        # whitespace removed. This also keeps non-string cells from breaking
        # the string concatenation below.
        peak_loc = str(raw_peak).strip()
        if not peak_loc or peak_loc == "Not Tested":
            continue

        if peak_loc not in locations and peak_loc + 's' not in locations:
            log_validation_issue(errors, idx, peak_col,
                                 f"Peak location '{peak_loc}' not in locations list: {locations}")


def validate_asbestos(df: pd.DataFrame, config: Dict, errors: List[Dict]) -> None:
    """
    Check that every asbestos entry names a testing method.

    The column name comes from ``config['asbestos_column']`` (a literal
    backslash-n in it is converted to a real newline). Empty cells and the
    exact text ``"Not Tested"`` are skipped; every other entry must contain
    at least one of ``TEM``, ``PLM`` or ``PCM``.

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe to validate
    config : Dict
        Configuration dictionary
    errors : List[Dict]
        List to append errors to
    """
    asbestos_col = config['asbestos_column'].replace('\\n', '\n')

    if asbestos_col not in df.columns:
        log_validation_issue(errors, -1, asbestos_col, f"{asbestos_col} column not found", "WARNING")
        return

    testing_methods = ("TEM", "PLM", "PCM")

    for idx, cell in df[asbestos_col].items():
        if pd.isna(cell):
            continue

        entry = str(cell)
        if entry == "Not Tested":
            continue

        for method in testing_methods:
            if method in entry:
                break
        else:
            # No method name was found anywhere in the entry.
            log_validation_issue(errors, idx, asbestos_col,
                                 f"Asbestos entry missing required testing method: {entry}")


def validate_wildfire_debris(df: pd.DataFrame, config: Dict, errors: List[Dict]) -> None:
    """
    Check wildfire debris cells against the configured categories.

    The column name comes from ``config['original_wildfire_debris_column']``
    (a literal backslash-n in it is converted to a real newline). Empty cells
    are ignored; every other cell, converted to text and stripped, must be a
    member of ``config['valid_wildfire_debris_categories']``.

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe to validate
    config : Dict
        Configuration dictionary
    errors : List[Dict]
        List to append errors to
    """
    debris_col = config['original_wildfire_debris_column'].replace('\\n', '\n')

    if debris_col not in df.columns:
        log_validation_issue(errors, -1, debris_col, f"{debris_col} column not found", "WARNING")
        return

    allowed = config['valid_wildfire_debris_categories']

    # Missing cells are dropped up front; the remaining labels are unchanged.
    for idx, cell in df[debris_col].dropna().items():
        category = str(cell).strip()
        if category in allowed:
            continue
        log_validation_issue(errors, idx, debris_col, f"Invalid wildfire debris category: {category}")


def validate_metal(df: pd.DataFrame, column: str, config: Dict, errors: List[Dict]) -> None:
    """
    Check the entries of one metal contaminant column.

    Empty cells and values containing ``<`` are skipped. Every other value
    must start with one of ``config['valid_metal_prefixes']``. A value that
    starts with ``DETECTED`` must additionally contain text of the form
    ``DETECTED: <number> ug/ft² (<location>)`` whose location is a key of
    ``config['valid_locations_and_epa_limits']``.

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe to validate
    column : str
        Name of the metal column to validate
    config : Dict
        Configuration dictionary
    errors : List[Dict]
        List to append errors to
    """
    if column not in df.columns:
        log_validation_issue(errors, -1, column, f"{column} column not found", "WARNING")
        return

    known_locations = set(config['valid_locations_and_epa_limits'].keys())
    prefixes = config['valid_metal_prefixes']
    detected_re = re.compile(r"DETECTED: (\d+(?:\.\d+)?) ug/ft² \((.*?)\)")

    for idx, cell in df[column].items():
        if pd.isna(cell):
            continue

        text = str(cell).strip()
        if '<' in text:
            continue

        # Check for valid prefix
        if not any(map(text.startswith, prefixes)):
            log_validation_issue(errors, idx, column, f"Invalid prefix: {text}")
            continue

        if not text.startswith("DETECTED"):
            continue

        found = detected_re.search(text)
        if found is None:
            log_validation_issue(errors, idx, column, f"Invalid DETECTED format: {text}")
            continue

        # Group 2 is the parenthesised location; the numeric group is unused.
        location = found.group(2)
        if location not in known_locations:
            log_validation_issue(errors, idx, column, f"Invalid location: {location}")


def validate_data(df: pd.DataFrame, config: Dict) -> List[Dict]:
    """
    Run every validation check against the dataframe.

    The lead level, lead location, asbestos and wildfire debris checks run
    first, in that order, followed by one metal check per entry of
    ``config['metal_columns_to_validate']`` (a literal backslash-n in a
    configured column name is converted to a real newline).

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe to validate
    config : Dict
        Configuration dictionary

    Returns
    -------
    List[Dict]
        List of validation errors
    """
    errors = []

    fixed_checks = (
        validate_lead_level,
        validate_lead_locations,
        validate_asbestos,
        validate_wildfire_debris,
    )
    for check in fixed_checks:
        check(df, config, errors)

    for configured_name in config['metal_columns_to_validate']:
        validate_metal(df, configured_name.replace('\\n', '\n'), config, errors)

    return errors


def generate_report(df: pd.DataFrame, errors: List[Dict], output_path: Path) -> Path:
    """
    Generate a JSON report of validation errors.

    The report has three sections: ``summary`` (row/column totals, the time
    the report was generated, the number of issues and a count per severity),
    ``column_stats`` (count per severity for each column that has issues) and
    ``errors`` (the issue list as given).

    ERROR, WARNING and INFO always appear in the severity counts, even when
    zero. Any other severity found in ``errors`` is counted under its own key
    instead of aborting the report.

    Parameters
    ----------
    df : pd.DataFrame
        The validated dataframe
    errors : List[Dict]
        List of validation errors; each entry must provide the keys
        ``severity`` and ``column``
    output_path : Path
        Path to save the report

    Returns
    -------
    Path
        Path to the saved report
    """
    standard_severities = ("ERROR", "WARNING", "INFO")

    # Count errors by severity, overall and per column
    severity_counts = dict.fromkeys(standard_severities, 0)
    column_counts: Dict[str, Dict[str, int]] = {}

    for error in errors:
        severity = error["severity"]
        column = error["column"]

        # .get() lets a non-standard severity start its own counter.
        severity_counts[severity] = severity_counts.get(severity, 0) + 1

        per_column = column_counts.setdefault(column, dict.fromkeys(standard_severities, 0))
        per_column[severity] = per_column.get(severity, 0) + 1

    # Create the report
    report = {
        "summary": {
            "total_rows": len(df),
            "total_columns": len(df.columns),
            "validation_time": datetime.now().isoformat(),
            "total_errors": len(errors),
            "error_counts": severity_counts
        },
        "column_stats": column_counts,
        "errors": errors
    }

    # Save the report; the encoding is pinned so the file does not depend on
    # the platform's default.
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2)

    return output_path


@click.command()
@click.option('--input_file', type=click.Path(exists=True), required=True, help='Path to the input XLSX file')
@click.option('--output_dir', type=click.Path(), default='./reports', help='Directory to save the validation report')
@click.option('--config_file', type=click.Path(exists=True), default='config.yaml', help='Path to the configuration YAML file')
@click.option('--sheet_name', default='DATA_PRE_INT', help='Sheet name or index to process')
@click.option('--log_level', type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR']), default='INFO', help='Logging level')
def main(input_file: str, output_dir: str, config_file: str, sheet_name: Union[str, int], log_level: str):
    """
    Validate an XLSX file and generate a report of any issues found.
    """
    # NOTE: the docstring above doubles as the command's --help text, so
    # implementation notes live in comments instead.

    # Set logging level
    logger.setLevel(getattr(logging, log_level))

    # Create output directory if it doesn't exist
    output_dir_path = Path(output_dir)
    output_dir_path.mkdir(parents=True, exist_ok=True)

    # Report file name: <input stem>_validation_<timestamp>.json
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    input_file_name = Path(input_file).stem
    output_file = output_dir_path / f"{input_file_name}_validation_{timestamp}.json"

    logger.info(f"Processing {input_file}, sheet {sheet_name}")

    # Read the input file
    try:
        with pd.ExcelFile(input_file) as workbook:
            target_sheet = sheet_name
            # The option always arrives as text. A sheet with exactly that
            # name wins; otherwise an all-digit value is used as a zero-based
            # sheet index, as the option's help text promises.
            if (isinstance(target_sheet, str)
                    and target_sheet not in workbook.sheet_names
                    and target_sheet.isdecimal()):
                target_sheet = int(target_sheet)
            df = pd.read_excel(workbook, sheet_name=target_sheet)
        logger.info(f"Loaded data with {len(df)} rows and {len(df.columns)} columns")
    except Exception as e:
        # Top-level boundary: report the problem and stop without a traceback.
        logger.error(f"Error reading input file: {e}")
        return

    # Load configuration and run validation
    config = load_config(Path(config_file))
    errors = validate_data(df, config)

    # Generate and save report
    report_path = generate_report(df, errors, output_file)

    # Print summary
    total_errors = len(errors)
    if total_errors > 0:
        logger.warning(f"Found {total_errors} issues. See {report_path} for details.")
    else:
        logger.info("No issues found.")


# Invoke the click command only when this file is executed as a script;
# importing the module does not start the CLI.
if __name__ == "__main__":
    main() 