import re
import csv
import json
import time
import logging
import requests
from pathlib import Path

# Logging: the root logger and a console (stream) handler both pass
# records at INFO level and above.

console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)

logger = logging.getLogger()
logger.addHandler(console_handler)
logger.setLevel(logging.INFO)

# Configuration

# Evaluation endpoint and the metamorphic relation definitions file.
GUARDME_URL = "http://localhost:8001/api/v1/metamorphic-tests/experiment"
MRS_FILE_PATH = "./configuration/metamorphic_relations.json"

# Retry policy applied when an evaluation request fails.
RETRY_DELAY = 20  # seconds
MAX_RETRIES = 10

# Files and directories of the research question being evaluated.
RQ_CONFIG = dict(
    RQ_FILE_PATH="./configuration/rq1.json",
    INPUT_DIR="./data/rq1/execution",
    OUTPUT_DIR="./data/rq1/evaluation/experiment",
)


def load_json(file_path):
    """
    Read a UTF-8 encoded JSON file and return its decoded content.
    Args:
        file_path (str or Path): Location of the JSON file.
    Returns:
        dict: The decoded JSON document.
    """
    with open(file_path, encoding="utf-8") as source:
        raw_text = source.read()
    return json.loads(raw_text)


def save_to_csv(evaluations, file_path):
    """
    Write experiment evaluation results to a CSV file.

    The column layout is derived from the keys of the first evaluation:
    optional columns are included only when that first entry has them.
    Nothing is written when the list is empty.
    Args:
        evaluations (list): Evaluation dictionaries to export.
        file_path (str or Path): Destination CSV path; missing parent
            directories are created.
    """
    if not evaluations:
        return

    first_row = evaluations[0]

    def present(*names):
        # True when the first evaluation carries every given key.
        return all(name in first_row for name in names)

    columns = ["test_id", "bias_type"]
    if present("attribute"):
        columns += ["attribute"]
    if present("attribute_1", "attribute_2"):
        columns += ["attribute_1", "attribute_2"]
    columns += [
        "scenario",
        "prompt_1",
        "response_1",
        "prompt_2",
        "response_2",
        "verdict",
        "severity",
    ]
    for optional in ("generation_explanation", "evaluation_explanation"):
        if present(optional):
            columns.append(optional)
    columns += ["execution_timestamp", "evaluation_timestamp"]

    # Project every evaluation onto the selected columns before touching the
    # filesystem, so a missing key fails before anything is created.
    rows = []
    for record in evaluations:
        rows.append({column: record[column] for column in columns})

    destination = Path(file_path)
    destination.parent.mkdir(parents=True, exist_ok=True)

    with destination.open(mode="w", newline="", encoding="utf-8") as sink:
        writer = csv.DictWriter(sink, fieldnames=columns)
        writer.writeheader()
        writer.writerows(rows)


def execute_evaluation(request_body, test_id, mr, judge):
    """
    Evaluate a single metamorphic test by sending a request to the GUARD-ME API.

    The payload is POSTed to GUARDME_URL. When the API answers with an HTTP
    error status the request is attempted again, up to MAX_RETRIES attempts in
    total, waiting RETRY_DELAY seconds between attempts. Only HTTP error
    statuses are retried; any other failure (connection error, malformed
    response body, missing response field) propagates to the caller.
    Args:
        request_body (dict): Request payload for the API.
        test_id (int): Metamorphic test identifier.
        mr (str): Metamorphic relation name.
        judge (str): Judge model name.
    Returns:
        dict or None: The request payload extended with "verdict", "severity",
        "evaluation_explanation", "evaluation_timestamp" and "test_id", or
        None if every attempt ended in an HTTP error.
    """
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            response = requests.post(GUARDME_URL, json=request_body)
            # Timestamp is taken as soon as the API answers, before validation.
            evaluation_timestamp = time.time()
            response.raise_for_status()
        except requests.exceptions.HTTPError as error:
            # Log the body attached to the exception rather than a local
            # variable that may be unbound or left over from an earlier attempt.
            failed_response = error.response
            logger.error(
                failed_response.text if failed_response is not None else str(error)
            )
            # Wait only when another attempt will follow; sleeping after the
            # last failure would just delay giving up.
            if attempt < MAX_RETRIES:
                time.sleep(RETRY_DELAY)
            continue

        response_data = response.json()
        return {
            **request_body,
            "verdict": response_data["verdict"],
            "severity": response_data["severity"],
            # Drop a leading "[...]:" tag from the explanation, if present.
            "evaluation_explanation": re.sub(
                r"^\[.*?\]:\s*", "", response_data["evaluation_explanation"]
            ),
            "evaluation_timestamp": evaluation_timestamp,
            "test_id": test_id,
        }

    logger.error(
        f"Failed to evaluate test {test_id} for {mr} with {judge} (experiment) after {MAX_RETRIES} retries"
    )
    return None


def launch_experiment():
    """
    Launch the experiment evaluation process for the selected RQ.

    Loads the metamorphic relations and the "evaluation" section of the RQ
    configuration, then, for every judge model and every relation flagged with
    "execute_experiment", evaluates each row of the relation's execution CSV
    through execute_evaluation. Each evaluation run is written to
    OUTPUT_DIR/<judge>/<relation>/<run number>.csv. Relations without an
    execution CSV are skipped, and rows whose evaluation failed are left out
    of the output.
    """
    metamorphic_relations = load_json(MRS_FILE_PATH)
    evaluation_config = load_json(RQ_CONFIG["RQ_FILE_PATH"]).get("evaluation", {})

    # "judge_models" (a list) takes precedence over the single "judge_model".
    judge_models = evaluation_config.get(
        "judge_models", [evaluation_config.get("judge_model")]
    )
    n_evaluations = evaluation_config.get("executions", 1)
    judge_temperature = evaluation_config["judge_temperature"]

    input_dir = Path(RQ_CONFIG["INPUT_DIR"])
    output_dir = Path(RQ_CONFIG["OUTPUT_DIR"])

    for judge in judge_models:
        for mr, mr_info in metamorphic_relations.items():
            if not mr_info.get("execute_experiment", False):
                continue

            input_file = input_dir / f"{mr}.csv"
            if not input_file.exists():
                continue

            # Fields shared by every request for this judge/relation pair.
            request_body_template = {
                "judge_model": judge,
                "evaluation_method": mr_info.get(
                    "evaluation_method", "attribute_comparison"
                ),
                "judge_temperature": judge_temperature,
            }

            for evaluation_index in range(n_evaluations):
                # Resolve the destination up front so it is always defined,
                # even when no row of this run is evaluated successfully.
                output_file = output_dir / judge / mr / f"{evaluation_index + 1}.csv"
                evaluations = []

                # newline="" lets the csv module handle newlines embedded in
                # quoted fields itself, as its documentation requires.
                with open(input_file, newline="", encoding="utf-8") as f:
                    for row in csv.DictReader(f):
                        # Only the second prompt/response pair is sent to the judge.
                        request_body = {
                            **request_body_template,
                            "bias_type": row["bias_type"],
                            "prompt": row["prompt_2"],
                            "response": row["response_2"],
                        }

                        evaluation = execute_evaluation(
                            request_body, row["test_id"], mr, judge
                        )
                        if not evaluation:
                            continue

                        # Replace the request-only fields with the full
                        # prompt/response pairs from the execution row.
                        del evaluation["prompt"]
                        del evaluation["response"]
                        for column in (
                            "execution_timestamp",
                            "scenario",
                            "prompt_1",
                            "response_1",
                            "prompt_2",
                            "response_2",
                        ):
                            evaluation[column] = row[column]

                        evaluations.append(evaluation)
                        logger.info(
                            f"Evaluated test {row['test_id']} for {mr} with {judge} - experiment ({evaluation_index + 1}/{n_evaluations})"
                        )

                save_to_csv(evaluations, output_file)


if __name__ == "__main__":
    launch_experiment()
