"""Metrics utilities for two-agent framework.

Handles best iteration detection with simple scoring logic.
"""
from typing import Dict, Any, List, Optional
from datetime import datetime
import pandas as pd
import numpy as np


# Per-estimator reliability table, keyed by lower-case estimator name.
# Each entry carries the scoring `weight` plus qualitative `variance_level`
# and `bias_risk` labels; the quoted remarks are the SCOPE-RL paper
# statements (Section A.1) the values are based on.
ESTIMATOR_PROPERTIES = {
    # Direct Method: "can produce large bias caused by approximation errors"
    'dm': dict(
        weight=0.5,
        variance_level='low',
        bias_risk='high',
        description='Model-based, low variance but high bias risk',
    ),

    # Importance Sampling: "unbiased but can suffer from high variance"
    'tis': dict(
        weight=0.8,
        variance_level='very_high',
        bias_risk='none',
        description='Trajectory-wise IS, unbiased but extreme variance',
    ),
    'pdis': dict(
        weight=1.0,
        variance_level='high',
        bias_risk='none',
        description='Per-decision IS, unbiased with reduced variance vs TIS',
    ),

    # Doubly Robust: "reduces variance while being unbiased"
    'dr': dict(
        weight=2.0,
        variance_level='medium',
        bias_risk='low',
        description='Combines DM and IS, best bias-variance tradeoff',
    ),

    # Self-Normalized variants: "variance bounded by r²_max"
    'sntis': dict(
        weight=1.2,
        variance_level='high',
        bias_risk='low',
        description='Self-normalized TIS, bounded variance',
    ),
    'snpdis': dict(
        weight=1.5,
        variance_level='medium',
        bias_risk='low',
        description='Self-normalized PDIS, good variance reduction',
    ),
    'sndr': dict(
        weight=2.0,
        variance_level='low',
        bias_risk='low',
        description='Self-normalized DR, excellent properties',
    ),

    # State Marginal family: "beneficial when trajectory length T is large"
    'sm_is': dict(weight=1.3, variance_level='medium', bias_risk='low'),
    'sm_dr': dict(weight=1.8, variance_level='low', bias_risk='low'),
    'sm_snis': dict(weight=1.5, variance_level='medium', bias_risk='low'),
    'sm_sndr': dict(weight=2.0, variance_level='low', bias_risk='low'),

    # State-Action Marginal family (same values as the State Marginal rows)
    'sam_is': dict(weight=1.3, variance_level='medium', bias_risk='low'),
    'sam_dr': dict(weight=1.8, variance_level='low', bias_risk='low'),
    'sam_snis': dict(weight=1.5, variance_level='medium', bias_risk='low'),
    'sam_sndr': dict(weight=2.0, variance_level='low', bias_risk='low'),

    # Fallback entry for estimator names not listed above
    'default': dict(weight=1.0, variance_level='unknown', bias_risk='unknown'),
}


def calculate_robust_score(results_series, metric_name: str = None) -> float:
    """Aggregate a series of results into one score.

    Values that cannot be parsed as numbers are discarded before
    aggregating.

    Args:
        results_series: Pandas series holding the raw results
        metric_name: Metric name; selects the aggregation (see Returns)

    Returns:
        The mean when ``metric_name`` contains 'relative' (case-insensitive),
        otherwise the median. NaN when no numeric value remains.
    """
    # Unparseable entries become NaN and are then dropped
    cleaned = pd.to_numeric(results_series, errors='coerce')
    cleaned = cleaned.dropna()

    if cleaned.empty:
        return np.nan

    use_mean = bool(metric_name) and 'relative' in metric_name.lower()
    return cleaned.mean() if use_mean else cleaned.median()


def calculate_robust_iteration_score(results_df, iter_num, primary_metric, baseline_score):
    """Calculate a reliability-weighted score for one iteration.

    Each estimator's mean result is weighted by its ``ESTIMATOR_PROPERTIES``
    weight. When at least two DR-family estimators are present, their
    agreement is measured and, if their mean deviates from the weighted score
    by more than 15%, the score is pulled towards them (70/30 split).

    Results are coerced to numeric first; estimators whose score cannot be
    computed (no numeric results) are left out instead of turning the whole
    iteration score into NaN.

    Args:
        results_df: DataFrame containing all iteration results; the columns
            'iteration', 'metric', 'estimator' and 'result' are read, and
            'policy' is used when present
        iter_num: Iteration number to score
        primary_metric: Primary metric name
        baseline_score: Baseline score, forwarded to ``determine_confidence``

    Returns:
        Dictionary with 'robust_score', 'estimator_results', 'agreement',
        'confidence' and 'reliable_consensus', or None when the iteration has
        no rows for the metric or no estimator produced a usable score.
    """
    iter_data = results_df[
        (results_df['iteration'] == iter_num) &
        (results_df['metric'] == primary_metric)
    ]

    if iter_data.empty:
        return None

    # Coerce once up front so string-typed or malformed results cannot break
    # (or silently skew) the means below; unparseable values become NaN.
    iter_data = iter_data.assign(
        result=pd.to_numeric(iter_data['result'], errors='coerce')
    )
    has_policy = 'policy' in iter_data.columns

    # Group by estimator and accumulate the weighted score
    estimator_results = {}
    weighted_sum = 0
    weight_sum = 0

    for estimator in iter_data['estimator'].unique():
        est_data = iter_data[iter_data['estimator'] == estimator]

        # str() keeps non-string labels from crashing the lookup
        properties = ESTIMATOR_PROPERTIES.get(
            str(estimator).lower(),
            ESTIMATOR_PROPERTIES['default']
        )
        weight = properties['weight']

        if has_policy and len(est_data['policy'].unique()) > 1:
            # Average within each policy first so every policy counts equally
            score = est_data.groupby('policy')['result'].mean().mean()
        else:
            score = est_data['result'].mean()

        if pd.isna(score):
            # No usable numbers for this estimator: skip it rather than let
            # NaN propagate into the weighted average.
            continue

        estimator_results[estimator] = {
            'score': score,
            'weight': weight,
            'properties': properties
        }

        weighted_sum += score * weight
        weight_sum += weight

    if weight_sum == 0:
        return None

    # Base weighted score
    weighted_score = weighted_sum / weight_sum

    # Consensus among the highly reliable estimators (DR family)
    reliable_estimators = {'dr', 'sndr', 'sm_dr', 'sm_sndr', 'sam_dr', 'sam_sndr'}
    reliable_scores = [
        info['score']
        for est, info in estimator_results.items()
        if str(est).lower() in reliable_estimators
    ]

    agreement = 1.0
    if len(reliable_scores) >= 2:
        # Coefficient of variation among reliable estimators; low CV means
        # high agreement.
        reliable_mean = np.mean(reliable_scores)
        reliable_std = np.std(reliable_scores)
        cv = reliable_std / (abs(reliable_mean) + 1e-8)
        agreement = 1 / (1 + cv)

        # Relative gap > 15%, written multiplicatively so a weighted score of
        # exactly zero cannot trigger a division by zero.
        if abs(reliable_mean - weighted_score) > 0.15 * abs(weighted_score):
            # Trust reliable estimators more (70/30 split)
            weighted_score = 0.7 * reliable_mean + 0.3 * weighted_score

    confidence = determine_confidence(estimator_results, baseline_score, agreement)

    return {
        'robust_score': weighted_score,
        'estimator_results': estimator_results,
        'agreement': agreement,
        'confidence': confidence,
        'reliable_consensus': np.mean(reliable_scores) if reliable_scores else weighted_score
    }


def determine_confidence(estimator_results, baseline_score, agreement):
    """Map estimator agreement and baseline comparisons to a confidence label.

    Args:
        estimator_results: Dict of estimator name -> info dict with a 'score'
            and a 'properties' dict (whose 'variance_level' is consulted)
        baseline_score: Baseline score for comparison, or None
        agreement: Agreement level among estimators

    Returns:
        'HIGH' when agreement exceeds 0.8 and more low/medium-variance
        estimators than other estimators score above the baseline;
        'MEDIUM' when agreement exceeds 0.6; otherwise 'LOW'.
    """
    # Without a baseline nothing can count as "improved", so the HIGH tier is
    # only reachable when one is available.
    if agreement > 0.8 and baseline_score is not None:
        stable_levels = ('low', 'medium')
        stable_wins = 0
        noisy_wins = 0

        for info in estimator_results.values():
            beats_baseline = info['score'] > baseline_score
            if info['properties'].get('variance_level') in stable_levels:
                stable_wins += beats_baseline
            else:
                noisy_wins += beats_baseline

        # The improvement must be carried by the steadier estimators
        if stable_wins > noisy_wins:
            return 'HIGH'

    return 'MEDIUM' if agreement > 0.6 else 'LOW'


def detect_primary_metric_and_goal(available_metrics, override=None):
    """Detect primary metric and optimisation goal.

    The metric is chosen by the first rule that matches:

    1. 'relative_policy_value' is available   -> maximise, 'SCOPE_RL'
    2. a metric containing 'relative' and 'ee' -> minimise, 'OBP'
    3. otherwise the first available metric    -> maximise, 'UNKNOWN'

    When ``override`` is truthy the metric is still chosen by these rules,
    but the goal becomes ``override`` and the framework type 'OVERRIDE'.

    Sets are examined in sorted order so that the result does not depend on
    set iteration order (which varies between runs for strings). Ordered
    inputs such as lists keep the caller's order.

    Args:
        available_metrics: Set of available metric names
        override: Optional override for optimisation goal

    Returns:
        Tuple of (primary_metric, optimisation_goal, framework_type)

    Raises:
        IndexError: If ``available_metrics`` is empty.
    """
    if isinstance(available_metrics, (set, frozenset)):
        # key=str keeps sorting total even if a non-string label sneaks in
        ordered = sorted(available_metrics, key=str)
    else:
        ordered = list(available_metrics)

    relative_ee = next(
        (m for m in ordered
         if isinstance(m, str) and 'relative' in m and 'ee' in m),
        None,
    )

    if 'relative_policy_value' in available_metrics:
        primary_metric = 'relative_policy_value'
        optimisation_goal = 'maximise'
        framework_type = 'SCOPE_RL'
    elif relative_ee is not None:
        primary_metric = relative_ee
        optimisation_goal = 'minimise'
        framework_type = 'OBP'
    else:
        primary_metric = ordered[0]
        optimisation_goal = 'maximise'
        framework_type = 'UNKNOWN'

    if override:
        # The caller dictates the direction; only the metric is inferred
        optimisation_goal = override
        framework_type = 'OVERRIDE'

    return primary_metric, optimisation_goal, framework_type


def calculate_baseline_score(results_df, primary_metric):
    """Score the baseline run (iteration 0) for the given metric.

    Args:
        results_df: DataFrame containing all results
        primary_metric: Primary metric name

    Returns:
        The aggregated baseline score, or None when iteration 0 has no rows
        for ``primary_metric``
    """
    is_baseline = results_df['iteration'] == 0
    is_primary = results_df['metric'] == primary_metric
    baseline_rows = results_df[is_baseline & is_primary]

    if baseline_rows.empty:
        return None

    # Same aggregation helper that the per-iteration legacy scoring uses
    return calculate_robust_score(baseline_rows['result'], primary_metric)


def select_best_with_validation(scores, details, optimisation_goal, baseline_score):
    """Select best iteration with additional validation.

    Picks the iteration with the lowest score when ``optimisation_goal`` is
    'minimise' and the highest otherwise, then prints warnings when the
    choice has 'LOW' confidence or differs from the baseline by less than 1%.

    NaN scores are ignored while picking (they never compare, so including
    them would make the winner depend on dict order). If every score is NaN
    the original mapping is used as-is.

    Args:
        scores: Dict mapping iteration numbers to scores
        details: Dict mapping iteration numbers to detailed scoring info
            (the 'confidence' entry is read)
        optimisation_goal: 'maximise' or 'minimise'
        baseline_score: Baseline score for comparison, or None

    Returns:
        Best iteration number

    Raises:
        ValueError: If ``scores`` is empty.
    """
    candidates = {it: s for it, s in scores.items() if pd.notna(s)} or scores

    pick = min if optimisation_goal == 'minimise' else max
    best_iter = pick(candidates, key=candidates.get)

    # Validate the selection
    best_details = details[best_iter]

    if best_details['confidence'] == 'LOW':
        print("  Warning: Low confidence in selection due to estimator disagreement")

    # Check if the change relative to the baseline is meaningful
    if baseline_score is not None:
        delta = abs(scores[best_iter] - baseline_score)
        scale = abs(baseline_score)
        # Multiplicative form of "delta / scale < 1%": a zero baseline can
        # never satisfy it, so the division below is always safe.
        if delta < 0.01 * scale:
            improvement = delta / scale
            print(f"  Warning: Best iteration shows minimal improvement ({improvement:.2%})")

    return best_iter


def get_all_iteration_results_df(all_iteration_metrics: List[Dict[str, Any]]):
    """Stack the per-iteration result frames into one DataFrame.

    Every entry that carries a non-None 'csv_dataframe' contributes a copy of
    that frame, tagged with the entry's 'iteration' value in an 'iteration'
    column. Entries without a frame are skipped.

    Args:
        all_iteration_metrics: List of metric dictionaries from iterations

    Returns:
        A pandas DataFrame containing all iteration results, or None if no
        entry had a frame or the concatenation failed.
    """
    frames = []

    for entry in all_iteration_metrics:
        frame = entry.get('csv_dataframe')
        if frame is None:
            continue
        # Work on a copy so the caller's frame is not modified
        tagged = frame.copy()
        tagged['iteration'] = entry['iteration']
        frames.append(tagged)

    if not frames:
        print("No iteration results found in structured data")
        return None

    try:
        combined = pd.concat(frames, ignore_index=True)
        print(f"Consolidated {len(combined)} rows from {len(frames)} iterations")
        return combined
    except Exception as e:
        print(f"Error consolidating iteration results: {e}")
        return None


def find_best_iteration(all_iteration_metrics: List[Dict[str, Any]], 
                       input_path: str,
                       docs_dir,
                       optimisation_goal_override: Optional[str] = None,
                       use_robust_scoring: bool = True) -> Optional[int]:
    """Pick the best iteration via robust (SCOPE-RL informed) or legacy scoring.

    Consolidates the iteration results, detects the primary metric and
    optimisation goal, computes a reference baseline score and then hands
    over to the selected scoring routine. Any exception raised along the way
    is printed and turned into a None result.

    Args:
        all_iteration_metrics: List of metric dictionaries from iterations
        input_path: Path to input file (not read by this function)
        docs_dir: Documentation directory path (not read by this function)
        optimisation_goal_override: Override optimisation goal ('maximise' or 'minimise')
        use_robust_scoring: True for the robust scoring routine, False for
            the legacy simple-averaging routine

    Returns:
        Best iteration number or None if cannot determine
    """
    try:
        results_df = get_all_iteration_results_df(all_iteration_metrics)

        if results_df is None or results_df.empty:
            print("No iteration results found to analyse.")
            return None

        # Step 1: which metrics exist, and which one drives the selection?
        metric_names = set(results_df['metric'].unique())
        print(f"Available metrics: {metric_names}")

        primary_metric, goal, framework_type = detect_primary_metric_and_goal(
            metric_names, optimisation_goal_override
        )

        if optimisation_goal_override:
            print(f"Using override optimisation goal: {goal}")
        else:
            print(f"Framework detected: {framework_type}")
        print(f"Primary metric: {primary_metric}")
        print(f"Optimisation goal: {goal}")

        # Step 2: baseline score, reported for reference
        baseline = calculate_baseline_score(results_df, primary_metric)
        if baseline is None:
            print("No baseline data found")
        else:
            print(f"Baseline score (reference): {baseline:.6f}")

        # Step 3: delegate to the requested scoring routine
        if use_robust_scoring:
            print("\n=== Using SCOPE-RL Enhanced Robust Scoring ===")
            scorer = find_best_iteration_robust
        else:
            print("\n=== Using Legacy Simple Averaging ===")
            scorer = find_best_iteration_legacy

        return scorer(results_df, primary_metric, goal, baseline)

    except Exception as e:
        print(f"Error in find_best_iteration: {e}")
        return None


def find_best_iteration_robust(results_df, primary_metric, optimisation_goal, baseline_score):
    """Score every non-baseline iteration robustly and return the winner.

    Prints a per-iteration breakdown (score, agreement, confidence and each
    estimator's contribution) followed by a summary of the selection.

    Args:
        results_df: DataFrame containing all results
        primary_metric: Primary metric name
        optimisation_goal: 'maximise' or 'minimise'
        baseline_score: Baseline score for comparison

    Returns:
        Best iteration number, or None when no iteration could be scored
    """
    scores_by_iter = {}
    info_by_iter = {}

    for iter_num in sorted(results_df['iteration'].unique()):
        # Iteration 0 is the baseline and never a candidate
        if iter_num == 0:
            continue

        info = calculate_robust_iteration_score(
            results_df, iter_num, primary_metric, baseline_score
        )
        if info is None:
            continue

        scores_by_iter[iter_num] = info['robust_score']
        info_by_iter[iter_num] = info

        print(f"\nIteration {iter_num}:")
        print(f"  Robust score: {info['robust_score']:.6f}")
        print(f"  Estimator agreement: {info['agreement']:.2%}")
        print(f"  Confidence: {info['confidence']}")

        # Per-estimator breakdown for transparency
        print("  Estimator details:")
        for est_name, est_info in info['estimator_results'].items():
            print(f"    {est_name}: {est_info['score']:.6f} (weight: {est_info['weight']:.1f})")

    if not scores_by_iter:
        print("\nNo valid iteration scores calculated.")
        return None

    best_iter = select_best_with_validation(
        scores_by_iter, info_by_iter, optimisation_goal, baseline_score
    )
    winning_score = scores_by_iter[best_iter]

    print(f"\n=== Selected Best Iteration: {best_iter} ===")
    print(f"Final robust score: {winning_score:.6f}")
    print(f"Confidence level: {info_by_iter[best_iter]['confidence']}")

    if baseline_score is not None:
        # Orient the difference so that a positive number is always a gain
        improvement = (
            winning_score - baseline_score
            if optimisation_goal == 'maximise'
            else baseline_score - winning_score
        )
        print(f"Improvement over baseline: {improvement:+.6f}")

    return best_iter


def find_best_iteration_legacy(results_df, primary_metric, optimisation_goal, baseline_score):
    """Legacy selection: aggregate each iteration's results and compare.

    When a 'policy' column exists, results are aggregated per policy first
    and the per-policy scores are aggregated again; otherwise all results of
    the iteration are aggregated directly.

    Args:
        results_df: DataFrame containing all results
        primary_metric: Primary metric name
        optimisation_goal: 'maximise' or 'minimise'
        baseline_score: Baseline score for comparison

    Returns:
        Best iteration number, or None when no iteration could be scored
    """
    scores_by_iter = {}
    print("\nCalculating iteration scores (legacy method)...")

    for iter_num in sorted(results_df['iteration'].unique()):
        # Iteration 0 is the baseline and never a candidate
        if iter_num == 0:
            continue

        rows = results_df[
            (results_df['iteration'] == iter_num) &
            (results_df['metric'] == primary_metric)
        ]

        if rows.empty:
            print(f"  Iteration {iter_num}: No data found")
            continue

        if 'policy' in rows.columns:
            # One score per policy, keeping only those that could be computed
            per_policy = (
                calculate_robust_score(group['result'], primary_metric)
                for _, group in rows.groupby('policy')
            )
            usable = [s for s in per_policy if pd.notna(s)]
            if usable:
                iter_score = calculate_robust_score(pd.Series(usable), primary_metric)
            else:
                iter_score = np.nan
        else:
            iter_score = calculate_robust_score(rows['result'], primary_metric)

        if pd.isna(iter_score):
            print(f"  Iteration {iter_num}: Could not calculate score")
            continue

        scores_by_iter[iter_num] = iter_score
        print(f"  Iteration {iter_num}: {iter_score:.6f}")

    print(f"\nIteration scores: {scores_by_iter}")

    if not scores_by_iter:
        print("No valid iteration scores calculated.")
        return None

    chooser = min if optimisation_goal == 'minimise' else max
    best_iteration_num = chooser(scores_by_iter, key=scores_by_iter.get)
    best_score = scores_by_iter[best_iteration_num]
    print(f"\nSelected best iteration: {best_iteration_num} (score: {best_score:.6f})")

    # Informational only: gain relative to the baseline, positive = better
    if baseline_score is not None:
        improvement = (
            best_score - baseline_score
            if optimisation_goal == 'maximise'
            else baseline_score - best_score
        )
        print(f"Improvement over baseline: {improvement:+.6f}")

    return best_iteration_num


def generate_comprehensive_results_table(all_iteration_metrics: List[Dict[str, Any]], 
                                       framework: str, 
                                       model: str) -> str:
    """Generate results summary table with top estimators and best iteration markers.

    The markdown report contains a per-estimator table of the display metric
    for the baseline and every successful iteration, the best iteration per
    estimator, and an overall ranking based on the robust iteration score.
    If anything fails, the error is printed and a short fallback summary is
    returned instead.

    Args:
        all_iteration_metrics: List of metric dictionaries from iterations;
            entries with status 'baseline' / 'success' are used
        framework: Framework type
        model: Model name used

    Returns:
        Formatted table string with iteration comparison and best markers
    """
    try:
        # Get baseline and iteration metrics
        baseline_metrics = next((m for m in all_iteration_metrics if m.get('status') == 'baseline'), {})
        iteration_metrics = [m for m in all_iteration_metrics if m.get('status') == 'success']
        
        # Consolidate all results for analysis
        results_df = get_all_iteration_results_df(all_iteration_metrics)
        
        if results_df is None or results_df.empty:
            return f"""# Two-Agent Framework - Results Summary
Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
Framework: {framework.upper()}
Model: {model}

No valid results found for analysis.
"""
        
        # Detect available estimators and metrics (unique() already dedupes)
        available_estimators = sorted(results_df['estimator'].unique())
        available_metrics = sorted(results_df['metric'].unique())
        
        # Choose primary metric for display
        if 'relative_policy_value' in available_metrics:
            primary_metric = 'relative_policy_value'
        elif any('relative' in m and 'ee' in m for m in available_metrics):
            primary_metric = next(m for m in available_metrics if 'relative' in m and 'ee' in m)
        elif 'policy_value' in available_metrics:
            primary_metric = 'policy_value'
        else:
            primary_metric = available_metrics[0] if available_metrics else 'result'
        
        # Every available estimator gets a column
        key_estimators = available_estimators
        
        # Header
        table = f"""# Two-Agent Framework - Results Summary
Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
Framework: {framework.upper()}
Model: {model}
Total Iterations: {len(iteration_metrics)}
Available Estimators: {', '.join(available_estimators)}
Primary Metric: {primary_metric}

## Performance Summary Table

| Iteration |"""
        for estimator in key_estimators:
            table += f" {estimator.upper()} |"
        table += "\n"
        
        table += "|-----------|"
        for _ in key_estimators:
            table += "------|"
        table += "\n"
        
        # Helper function to extract metric value
        def get_metric_value(iter_num, estimator, metric):
            """Return the first matching result as display text, or 'N/A'."""
            filtered = results_df[
                (results_df['iteration'] == iter_num) & 
                (results_df['estimator'] == estimator) & 
                (results_df['metric'] == metric)
            ]
            if not filtered.empty:
                value = filtered['result'].iloc[0]
                if isinstance(value, (int, float)):
                    return f"{value:.4f}"
                return str(value)
            return 'N/A'
        
        # Find best performers for each estimator
        best_iterations = {}
        higher_is_better = not ('ee' in primary_metric.lower() or 'error' in primary_metric.lower())
        
        for estimator in key_estimators:
            best_value = None
            best_iter = -1
            
            for metrics in iteration_metrics:
                iter_num = metrics['iteration']
                value_str = get_metric_value(iter_num, estimator, primary_metric)
                try:
                    value = float(value_str)
                except ValueError:
                    # 'N/A' or any other non-numeric cell cannot be ranked
                    continue
                
                if pd.isna(value):
                    # NaN never compares; letting it in as the first value
                    # would lock it in as the "best" for this estimator.
                    continue
                
                if (best_value is None
                        or (higher_is_better and value > best_value)
                        or (not higher_is_better and value < best_value)):
                    best_value = value
                    best_iter = iter_num
            
            best_iterations[estimator] = (best_iter, best_value)
        
        # Baseline row
        if baseline_metrics:
            table += "| Baseline  |"
            for estimator in key_estimators:
                value = get_metric_value(0, estimator, primary_metric)
                table += f" {value} |"
            table += "\n"
        
        # Iteration rows
        for metrics in iteration_metrics:
            iter_num = metrics['iteration']
            table += f"| {iter_num}         |"
            
            for estimator in key_estimators:
                value = get_metric_value(iter_num, estimator, primary_metric)
                
                # Mark best value with *BEST*
                best_iter, _ = best_iterations.get(estimator, (-1, None))
                mark = " *BEST*" if iter_num == best_iter else ""
                
                table += f" {value}{mark} |"
            table += "\n"
        
        # Best performers summary
        table += f"""
## Best Performers by Estimator

"""
        for estimator in key_estimators:
            best_iter, best_value = best_iterations.get(estimator, (-1, None))
            if best_iter > 0 and best_value is not None:
                table += f"- **{estimator.upper()}**: Iteration {best_iter} (Value: {best_value:.4f})\n"
        
        # Overall performance summary, scored with the robust iteration score
        table += f"""
## Overall Performance Summary

Primary metric: {primary_metric}
Optimisation direction: {'Higher is better' if higher_is_better else 'Lower is better'}

"""
        
        iteration_scores = {}
        
        # NOTE(review): the metric detected here can differ from the display
        # metric chosen above when neither relative metric is available.
        available_metrics_set = set(results_df['metric'].unique())
        detected_primary_metric, optimisation_goal, framework_type = detect_primary_metric_and_goal(
            available_metrics_set, None
        )
        
        # Calculate baseline score for reference
        baseline_score = calculate_baseline_score(results_df, detected_primary_metric)
        
        for metrics in iteration_metrics:
            iter_num = metrics['iteration']
            
            score_info = calculate_robust_iteration_score(
                results_df, iter_num, detected_primary_metric, baseline_score
            )
            
            if score_info is not None:
                robust_score = score_info['robust_score']
                iteration_scores[iter_num] = robust_score
                confidence = score_info['confidence']
                agreement = score_info['agreement']
                
                table += f"- **Iteration {iter_num}**: {robust_score:.6f} (robust score, {confidence} confidence, {agreement:.1%} agreement)\n"
            else:
                # Fallback to simple calculation if robust scoring fails
                iteration_data = results_df[
                    (results_df['iteration'] == iter_num) & 
                    (results_df['metric'] == detected_primary_metric)
                ]
                if not iteration_data.empty:
                    simple_score = calculate_robust_score(iteration_data['result'], detected_primary_metric)
                    if pd.notna(simple_score):
                        iteration_scores[iter_num] = simple_score
                        table += f"- **Iteration {iter_num}**: {simple_score:.6f} (fallback scoring)\n"
        
        # Identify best overall iteration
        if iteration_scores:
            if optimisation_goal == 'minimise':
                best_overall_iter = min(iteration_scores, key=iteration_scores.get)
            else:  # maximise
                best_overall_iter = max(iteration_scores, key=iteration_scores.get)
            
            best_overall_score = iteration_scores[best_overall_iter]
            table += f"\n**Best Overall Performance**: Iteration {best_overall_iter} (Score: {best_overall_score:.6f})\n"
            table += f"**Scoring Method**: Robust scoring with {framework_type} framework detection\n"
        
        return table
        
    except Exception as e:
        print(f"Error generating comprehensive results table: {e}")
        # Fallback - create a simple summary
        return f"""# Two-Agent Framework - Results Summary
Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
Framework: {framework.upper()}
Model: {model}
Total Iterations: {len([m for m in all_iteration_metrics if m.get('status') == 'success'])}

Note: Detailed results table generation failed. Check individual CSV files for metrics.
Error: {str(e)}
"""


def detect_project_optimisation_goal(input_path: str, docs_dir) -> Optional[str]:
    """Infer the optimisation goal from the input file's name.

    Only the file name (not the directory part) of ``input_path`` is
    inspected, case-insensitively.

    Args:
        input_path: Path to input file
        docs_dir: Documentation directory path (not read by this function)

    Returns:
        'minimise' when the file name contains 'multiclass', otherwise None
        so that the caller falls back to automatic detection
    """
    import os

    input_basename = os.path.basename(input_path).lower()

    # Anything that is not a multiclass project is left to auto-detection
    if 'multiclass' not in input_basename:
        return None

    print(f"Detected accuracy optimisation project from filename: {input_basename}")
    return 'minimise'
