#!/usr/bin/env python3
import os
import json
import glob
import base64
from datetime import datetime
from pathlib import Path
import numpy as np
import argparse

# All input and output locations are resolved relative to the directory that
# contains this script, so the tool works regardless of the current directory.
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))

DATASET_DIR = os.path.join(_SCRIPT_DIR, "dataset")
OUTPUT_DIR = os.path.join(_SCRIPT_DIR, "analysis_results")
REPORT_DIR = os.path.join(_SCRIPT_DIR, "reports")

# Create the report directory at import time; an existing directory is fine.
os.makedirs(REPORT_DIR, exist_ok=True)

# Analysis mode key -> human-readable title. The key is also the file-name
# prefix of the result files; insertion order is the order modes are iterated.
ANALYSIS_MODES = dict(
    standard="Standard Analysis",
    ignore_from_analysis="Ignore From Analysis",
    ignore_from_metadata="Ignore From Metadata",
    analyze_and_ignore="Analyze and Ignore",
)

def find_latest_analysis_files():
    """Locate the most recently modified result file per mode and model.

    Returns:
        A dict keyed by every analysis mode in ``ANALYSIS_MODES``. Each value
        maps a model suffix to the path of the matching file in ``OUTPUT_DIR``
        with the greatest modification time. Model suffixes with no matching
        file are omitted, so a mode's inner dict may be empty.
    """
    model_suffixes = ("gemma3_4b", "gemma3_12b")
    newest_by_mode = {}

    for mode_key in ANALYSIS_MODES:
        newest_for_mode = {}
        for suffix in model_suffixes:
            # Result files are named "<mode>_<model>_<anything>.json".
            candidates = glob.glob(os.path.join(OUTPUT_DIR, f"{mode_key}_{suffix}_*.json"))
            if not candidates:
                continue
            # The first file with the greatest mtime wins.
            newest_for_mode[suffix] = max(candidates, key=os.path.getmtime)
        newest_by_mode[mode_key] = newest_for_mode

    return newest_by_mode

def load_analysis_data(filepath):
    """Load one JSON analysis result file.

    The file is always decoded as UTF-8 so that results containing non-ASCII
    text load identically on every platform.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The decoded JSON content, or an empty dict when the file does not
        exist, cannot be read, or does not contain valid JSON.
    """
    try:
        with open(filepath, 'r', encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        # A missing file is an expected situation and is not reported.
        return {}
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"Error reading analysis file {filepath}: {e}")
    return {}

def load_analyses_by_test_dir():
    """Load the latest result files and regroup them by test directory.

    Returns:
        A nested dict ``{test_dir: {mode: {model: analysis}}}`` built from
        every file reported by ``find_latest_analysis_files``. Files that
        load as empty are skipped.
    """
    grouped = {}
    newest = find_latest_analysis_files()

    print("\nLoading latest analysis files:")
    for mode_key, paths_by_model in newest.items():
        for model_key, path in paths_by_model.items():
            print(f"  - {mode_key} | {model_key}: {os.path.basename(path)}")
            loaded = load_analysis_data(path)
            if loaded:
                for test_name, result in loaded.items():
                    # Create the intermediate levels on first use.
                    grouped.setdefault(test_name, {}).setdefault(mode_key, {})[model_key] = result

    return grouped

def image_to_base64(image_path):
    """Return the content of *image_path* as a base64 text string.

    Any failure while reading or encoding is reported on stdout and results
    in an empty string instead of an exception.
    """
    try:
        with open(image_path, "rb") as handle:
            raw_bytes = handle.read()
        encoded = base64.b64encode(raw_bytes).decode("utf-8")
    except Exception as e:
        print(f"Error encoding image {image_path}: {e}")
        return ""
    return encoded

def format_reason_tag(reason, possible_categories):
    """Render one detected reason as an HTML ``<span>`` tag.

    Reasons prefixed with ``UNKNOWN-`` get the extra ``unknown-reason`` CSS
    class and are shown without the prefix, with underscores replaced by
    spaces and title-cased. The displayed text is HTML-escaped because it
    originates from analysis output rather than from this script.

    Args:
        reason: The reason label; non-string values are converted with ``str``.
        possible_categories: Unused; kept so existing callers keep working.

    Returns:
        The HTML fragment for the tag.
    """
    from html import escape

    unknown_prefix = "UNKNOWN-"
    text = reason if isinstance(reason, str) else str(reason)

    if text.startswith(unknown_prefix):
        class_name = "reason-tag unknown-reason"
        display_reason = text[len(unknown_prefix):].replace("_", " ").title()
    else:
        class_name = "reason-tag"
        display_reason = text

    return f'<span class="{class_name}">{escape(display_reason)}</span>'

def parse_timestamps(timestamp_str):
    """Parse a timestamp string into a ``datetime``.

    ISO parsing is tried first, then the explicit
    ``%Y-%m-%dT%H:%M:%S.%f`` layout. Returns ``None`` when neither accepts
    the value (including non-string input).
    """
    parsers = (
        datetime.fromisoformat,
        lambda text: datetime.strptime(text, "%Y-%m-%dT%H:%M:%S.%f"),
    )
    for parse in parsers:
        try:
            return parse(timestamp_str)
        except (ValueError, TypeError):
            continue
    return None

def _mean_and_std(values):
    """Return ``(np.mean, np.std)`` of *values*, or ``(0, 0)`` for an empty list."""
    if not values:
        return 0, 0
    return np.mean(values), np.std(values)


def _resolve_ground_truth(metadata):
    """Return the set of accepted category labels for one analysis.

    The test-specific ``possible_categories`` list is preferred; when it is
    missing or empty, ``all_possible_categories`` is used. If neither is
    populated a warning is printed and an empty set is returned.
    """
    test_specific = metadata.get("possible_categories", [])
    if test_specific:
        return set(test_specific)

    all_possible = metadata.get("all_possible_categories", [])
    if all_possible:
        return set(all_possible)

    print("  Warning: No ground truth categories ('possible_categories' or 'all_possible_categories') found in metadata.")
    return set()


def calculate_metrics(all_analyses):
    """Aggregate accuracy, difference and timing metrics per model.

    Analyses are grouped by ``_metadata["model"]``; analyses without that
    field only produce a warning. Models are processed in sorted order so the
    result and the log output are reproducible between runs.

    Args:
        all_analyses: List of analysis dicts. Each may carry ``reasons``
            (a list or a single value), ``pixel_diff``, ``semantic_diff`` and
            a ``_metadata`` dict with ``model``, ``model_version``,
            ``possible_categories`` / ``all_possible_categories``,
            ``performance.response_time_seconds`` and
            ``ground_truth.pixel_difference``.

    Returns:
        A dict ``{model: metrics}``, or an empty dict when there is nothing to
        aggregate. Each ``metrics`` dict contains ``total_analyses``, the
        rates ``top1_accuracy``, ``label_accuracy``, ``non_unknown_rate``,
        ``unknown_rate``, ``unknown_density``, ``unknown_to_known_ratio`` and
        ``invalid_category_rate``, mean/std pairs for the ground-truth,
        predicted and error pixel differences and the semantic difference,
        the timing values ``total_time``, ``avg_time``, ``std_time``, and
        ``version`` (taken from the model's first analysis).
    """
    if not all_analyses:
        print("Warning: No analyses provided to calculate_metrics")
        return {}

    model_set = set()
    for a in all_analyses:
        if "_metadata" in a and "model" in a["_metadata"]:
            model_set.add(a["_metadata"]["model"])
        else:
            print(f"Warning: Analysis missing model metadata: {a.get('_metadata', {})}")

    if not model_set:
        print("Warning: No models found in analyses")
        return {}

    # Sorting removes the run-to-run variation of plain set iteration.
    models = sorted(model_set)
    print(f"Found models: {', '.join(models)}")

    metrics = {}
    for model in models:
        print(f"Processing metrics for model: {model}")
        model_analyses = [a for a in all_analyses if a.get("_metadata", {}).get("model") == model]
        total = len(model_analyses)

        if total == 0:
            print(f"Warning: No analyses found for model {model}")
            continue

        accuracy_stats = {
            "total_predictions": 0,
            "correct_predictions": 0,
            "non_unknown_predictions": 0,
            "correct_non_unknown": 0,
            "total_unknown_reasons": 0,
            "total_known_reasons": 0,
            "analyses_with_unknowns": 0,
            "invalid_categories": 0,
        }

        response_times = []
        valid_diffs = []
        semantic_diffs = []
        all_unknown_reasons = set()

        for analysis in model_analyses:
            metadata = analysis.get("_metadata", {})

            # A missing or zero response time is not counted as a sample.
            response_time = metadata.get("performance", {}).get("response_time_seconds", 0)
            if response_time:
                response_times.append(response_time)

            reasons = analysis.get("reasons", [])
            if not isinstance(reasons, list):
                reasons = [reasons] if reasons else []

            # Only non-empty strings can be labels; anything else (numbers,
            # nested objects) would break hashing or the prefix check below.
            predicted_labels = {r for r in reasons if isinstance(r, str) and r}
            ground_truth = _resolve_ground_truth(metadata)

            unknown_labels = {r for r in predicted_labels if r.startswith("UNKNOWN-")}
            valid_labels = predicted_labels & ground_truth
            invalid_labels = predicted_labels - unknown_labels - valid_labels

            if invalid_labels:
                accuracy_stats["invalid_categories"] += len(invalid_labels)
                print(f"Found {len(invalid_labels)} invalid categories: {', '.join(sorted(invalid_labels))}")

            all_unknown_reasons.update(unknown_labels)

            if ground_truth:
                accuracy_stats["total_predictions"] += len(ground_truth)
                accuracy_stats["correct_predictions"] += len(valid_labels)

                # NOTE(review): both counters are always incremented together,
                # so "top1_accuracy" and "non_unknown_rate" are always equal.
                if valid_labels:
                    accuracy_stats["non_unknown_predictions"] += 1
                    accuracy_stats["correct_non_unknown"] += 1

            if unknown_labels:
                accuracy_stats["analyses_with_unknowns"] += 1
            accuracy_stats["total_unknown_reasons"] += len(unknown_labels)
            accuracy_stats["total_known_reasons"] += len(valid_labels)

            pixel_diff = analysis.get("pixel_diff")
            ground_truth_diff = metadata.get("ground_truth", {}).get("pixel_difference")

            # A pair is only usable when both sides exist and are numeric.
            if pixel_diff is not None and ground_truth_diff is not None:
                try:
                    valid_diffs.append((float(pixel_diff), float(ground_truth_diff)))
                except (ValueError, TypeError):
                    print(f"Warning: Invalid pixel diff values: {pixel_diff}, {ground_truth_diff}")

            semantic_diff = analysis.get("semantic_diff")
            if semantic_diff is not None:
                try:
                    semantic_diffs.append(float(semantic_diff))
                except (ValueError, TypeError):
                    print(f"Warning: Invalid semantic diff value: {semantic_diff}")

        if all_unknown_reasons:
            # Show at most ten, in a stable order.
            to_show = sorted(all_unknown_reasons)[:10]
            print(f"Unique unknown reasons for {model} ({len(all_unknown_reasons)} total): {', '.join(to_show)}")

        predicted_diffs = [pred for pred, _ in valid_diffs]
        ground_truth_diffs = [actual for _, actual in valid_diffs]
        diff_errors = [abs(pred - actual) for pred, actual in valid_diffs]

        avg_predicted_diff, std_predicted_diff = _mean_and_std(predicted_diffs)
        avg_ground_truth_diff, std_ground_truth_diff = _mean_and_std(ground_truth_diffs)
        avg_diff_error, std_diff_error = _mean_and_std(diff_errors)
        avg_semantic_diff, std_semantic_diff = _mean_and_std(semantic_diffs)
        avg_time, std_time = _mean_and_std(response_times)
        total_time = sum(response_times) if response_times else 0

        total_predictions = accuracy_stats["total_predictions"]
        total_known = accuracy_stats["total_known_reasons"]

        # `total` is guaranteed to be positive here (see the guard above).
        metrics[model] = {
            "total_analyses": total,
            "top1_accuracy": accuracy_stats["correct_non_unknown"] / total,
            "label_accuracy": accuracy_stats["correct_predictions"] / total_predictions if total_predictions > 0 else 0,
            "non_unknown_rate": accuracy_stats["non_unknown_predictions"] / total,
            "unknown_rate": accuracy_stats["analyses_with_unknowns"] / total,
            "unknown_density": accuracy_stats["total_unknown_reasons"] / total,
            "unknown_to_known_ratio": accuracy_stats["total_unknown_reasons"] / total_known if total_known > 0 else 0,
            "invalid_category_rate": accuracy_stats["invalid_categories"] / total,
            "avg_ground_truth_diff": avg_ground_truth_diff,
            "std_ground_truth_diff": std_ground_truth_diff,
            "avg_predicted_diff": avg_predicted_diff,
            "std_predicted_diff": std_predicted_diff,
            "avg_diff_error": avg_diff_error,
            "std_diff_error": std_diff_error,
            "avg_semantic_diff": avg_semantic_diff,
            "std_semantic_diff": std_semantic_diff,
            "total_time": total_time,
            "avg_time": avg_time,
            "std_time": std_time,
            "version": model_analyses[0].get("_metadata", {}).get("model_version", "unknown"),
        }

        print(f"Calculated metrics for {model}: {len(metrics[model])} values")

    return metrics

def format_metric_value(value):
    """Format a numeric metric with two decimals; anything else becomes "N/A"."""
    if not isinstance(value, (int, float)):
        return "N/A"
    return format(value, ".2f")

def generate_ignore_analysis_section(test_dir, dir_name, original_categories):
    """Build the "Ignore Reason Analyses" HTML block for one test case.

    Reads ``ignore_analysis_results.json`` from ``OUTPUT_DIR``, takes the
    analyses stored under *dir_name*, groups them by the reason that was
    ignored and renders one result card per analysis. Within a group,
    analyses whose model name contains "12b" are listed first.

    All values taken from the results file are HTML-escaped before being
    written into the markup.

    Args:
        test_dir: Unused; kept so existing callers keep working.
        dir_name: Key of the test case inside the results file.
        original_categories: Categories originally attached to the test.
            Reasons found in it are listed as "remaining", all others as
            "new". ``None`` is treated as an empty list.

    Returns:
        The HTML fragment, or an empty string when the results file is
        missing, unreadable, or holds no analyses for *dir_name*.
    """
    # Imported locally; only ``escape`` is bound, so the ``html`` buffer
    # variable used below does not clash with the module name.
    from html import escape

    def esc(value):
        """HTML-escape any value after converting it to text."""
        return escape(str(value))

    def as_list(value):
        """Wrap a single value in a list so it is not iterated per character."""
        if isinstance(value, list):
            return value
        return [value] if value else []

    def render_tags(items, css_class, fallback):
        """Join items as ``<span>`` tags, or return *fallback* when empty."""
        return ' '.join(f'<span class="{css_class}">{esc(item)}</span>' for item in items) or fallback

    ignore_analysis_path = os.path.join(OUTPUT_DIR, "ignore_analysis_results.json")
    if not os.path.exists(ignore_analysis_path):
        return ""

    try:
        with open(ignore_analysis_path, 'r', encoding="utf-8") as f:
            all_ignore_analyses = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers invalid JSON and undecodable bytes.
        print(f"Error reading analysis file {ignore_analysis_path}: {e}")
        return ""

    if not isinstance(all_ignore_analyses, dict):
        return ""

    analyses = all_ignore_analyses.get(dir_name)
    if not analyses:
        return ""

    known_categories = original_categories or []

    html = """
        <div class="analysis-container">
            <div class="analysis-header">
                <h2>Ignore Reason Analyses</h2>
            </div>
    """

    analyses_by_reason = {}
    for analysis in analyses:
        ignored_reason = analysis.get("_metadata", {}).get("ignored_reason", "Unknown")
        analyses_by_reason.setdefault(ignored_reason, []).append(analysis)

    for ignored_reason, reason_analyses in analyses_by_reason.items():
        html += f"""
            <div class="model-analysis">
                <h3 class="model-heading">Ignoring: {esc(ignored_reason)}</h3>
        """

        # Stable sort: "12b" models first, original order otherwise.
        reason_analyses.sort(key=lambda x: 0 if "12b" in x.get("_metadata", {}).get("model", "") else 1)

        for analysis in reason_analyses:
            metadata = analysis.get("_metadata", {})
            model = metadata.get("model", "Unknown model")
            timestamp = metadata.get("timestamp", "Unknown time")
            pixel_diff = analysis.get("pixel_diff", "N/A")
            actual_pixel_diff = metadata.get("ground_truth", {}).get("pixel_difference", "N/A")
            semantic_diff = analysis.get("semantic_diff", "N/A")
            reasons = as_list(analysis.get("reasons", []))
            affected_elements = as_list(analysis.get("affected_elements", []))
            explanation = analysis.get("explanation", "No explanation provided")

            # Unparseable timestamps are shown as stored.
            dt = parse_timestamps(timestamp)
            if dt:
                timestamp = dt.strftime("%Y-%m-%d %H:%M:%S")

            remaining_categories = [r for r in reasons if r in known_categories]
            new_issues = [r for r in reasons if r not in known_categories]

            remaining_html = render_tags(remaining_categories, "reason-tag", '<p>No remaining original issues</p>')
            new_issues_html = render_tags(new_issues, "reason-tag unknown-reason", '<p>No new issues found</p>')
            elements_html = render_tags(affected_elements, "element-tag", '<p>No specific elements identified</p>')

            html += f"""
                <div class="model-result">
                    <h4>{esc(model)} Analysis (Generated: {esc(timestamp)})</h4>

                    <div class="metrics-section">
                        <div class="metric">
                            <span class="metric-label">Predicted Pixel Difference</span>
                            <span class="metric-value">{esc(pixel_diff)}</span>
                        </div>
                        <div class="metric">
                            <span class="metric-label">Actual Pixel Difference</span>
                            <span class="metric-value">{esc(actual_pixel_diff)}</span>
                        </div>
                        <div class="metric">
                            <span class="metric-label">Semantic Difference</span>
                            <span class="metric-value">{esc(semantic_diff)}</span>
                        </div>
                    </div>

                    <div class="reasons">
                        <h5>Remaining Original Issues:</h5>
                        {remaining_html}
                    </div>

                    <div class="reasons">
                        <h5>New Issues Found:</h5>
                        {new_issues_html}
                    </div>

                    <div class="affected-elements">
                        <h5>Affected UI Elements:</h5>
                        {elements_html}
                    </div>

                    <div class="explanation">
                        <h5>Analysis:</h5>
                        <p>{esc(explanation)}</p>
                    </div>
                </div>
            """

        html += "</div>"

    html += "</div>"
    return html

def generate_metrics_section(metrics):
    """Generate HTML for the metrics section.

    Args:
        metrics: Dict ``{model: model_metrics}``. A ``model_metrics`` dict
            containing an ``"error"`` key is rendered as an error card;
            otherwise its timing, accuracy, unknown and difference values are
            rendered. Missing or non-numeric values are shown as "N/A".

    Returns:
        The HTML fragment, or a short "No metrics available." paragraph when
        *metrics* is empty.
    """
    if not metrics:
        return "<p>No metrics available.</p>"

    def fmt(values, key, spec, suffix=""):
        """Format ``values[key]`` with *spec*; fall back to "N/A".

        Applying a numeric format spec to a missing value (or to a string or
        None) raises, so the fallback has to be decided here rather than
        through a ``dict.get`` default.
        """
        try:
            return format(values[key], spec) + suffix
        except (KeyError, TypeError, ValueError):
            return "N/A"

    html = """
        <div class="metrics-container">
            <div class="metrics-grid">
    """

    for model, model_metrics in metrics.items():
        if "error" in model_metrics:
            html += f"""
                <div class="model-metrics">
                    <h3>{model}</h3>
                    <div class="metric-group">
                        <h4>Error</h4>
                        <div class="metric">
                            <span class="metric-value">{model_metrics["error"]}</span>
                        </div>
                    </div>
                </div>
            """
            continue

        html += f"""
            <div class="model-metrics">
                <h3>{model}</h3>
                <div class="metric-group">
                    <h4>Timing Analysis</h4>
                    <div class="metric">
                        <span class="metric-label">Total Time:</span>
                        <span class="metric-value">{fmt(model_metrics, 'total_time', '.1f', 's')}</span>
                    </div>
                    <div class="metric">
                        <span class="metric-label">Per Test:</span>
                        <span class="metric-value">{fmt(model_metrics, 'avg_time', '.2f', 's')} ± {fmt(model_metrics, 'std_time', '.2f', 's')}</span>
                    </div>
                </div>
                <div class="metric-group">
                    <h4>Accuracy Metrics</h4>
                    <div class="metric">
                        <span class="metric-label">Test Case Hit Rate:</span>
                        <span class="metric-value">{fmt(model_metrics, 'top1_accuracy', '.2%')}</span>
                    </div>
                    <div class="metric">
                        <span class="metric-label">Label Accuracy:</span>
                        <span class="metric-value">{fmt(model_metrics, 'label_accuracy', '.2%')}</span>
                    </div>
                </div>
                <div class="metric-group">
                    <h4>Unknown Analysis</h4>
                    <div class="metric">
                        <span class="metric-label">Unknown Rate:</span>
                        <span class="metric-value">{fmt(model_metrics, 'unknown_rate', '.2%')}</span>
                    </div>
                    <div class="metric">
                        <span class="metric-label">Unknown Density:</span>
                        <span class="metric-value">{fmt(model_metrics, 'unknown_density', '.2f')}</span>
                    </div>
                    <div class="metric">
                        <span class="metric-label">Unknown/Known Ratio:</span>
                        <span class="metric-value">{fmt(model_metrics, 'unknown_to_known_ratio', '.2f')}</span>
                    </div>
                </div>
                <div class="metric-group">
                    <h4>Difference Analysis</h4>
                    <div class="metric">
                        <span class="metric-label">Ground Truth:</span>
                        <span class="metric-value">{fmt(model_metrics, 'avg_ground_truth_diff', '.3f')} ± {fmt(model_metrics, 'std_ground_truth_diff', '.3f')}</span>
                    </div>
                    <div class="metric">
                        <span class="metric-label">Predicted:</span>
                        <span class="metric-value">{fmt(model_metrics, 'avg_predicted_diff', '.3f')} ± {fmt(model_metrics, 'std_predicted_diff', '.3f')}</span>
                    </div>
                    <div class="metric">
                        <span class="metric-label">Error:</span>
                        <span class="metric-value">{fmt(model_metrics, 'avg_diff_error', '.3f')} ± {fmt(model_metrics, 'std_diff_error', '.3f')}</span>
                    </div>
                    <div class="metric">
                        <span class="metric-label">Semantic:</span>
                        <span class="metric-value">{fmt(model_metrics, 'avg_semantic_diff', '.3f')} ± {fmt(model_metrics, 'std_semantic_diff', '.3f')}</span>
                    </div>
                </div>
            </div>
        """

    html += """
            </div>
        </div>
    """
    return html

def calculate_metrics_for_mode(analyses, expected_mode):
    """Compute metrics for the analyses that belong to one analysis mode.

    An analysis is kept when its ``_metadata["analysis_mode"]`` equals
    *expected_mode*; for the "standard" mode the value "analyze" is accepted
    as well. Every rejected analysis produces a warning.

    Args:
        analyses: List of analysis dicts.
        expected_mode: The analysis mode the list is supposed to contain.

    Returns:
        The result of ``calculate_metrics`` for the kept analyses, or an empty
        dict when none are left after filtering.
    """
    def fmt(value):
        """Format with two decimals; "N/A" for missing or non-numeric values."""
        try:
            return format(value, ".2f")
        except (TypeError, ValueError):
            return "N/A"

    if expected_mode == "standard":
        accepted_modes = ("standard", "analyze")
    else:
        accepted_modes = (expected_mode,)

    filtered_analyses = []
    for analysis in analyses:
        mode = analysis.get("_metadata", {}).get("analysis_mode", "unknown")
        if mode in accepted_modes:
            filtered_analyses.append(analysis)
        else:
            print(f"Warning: Found analysis with incorrect mode: expected '{expected_mode}', got '{mode}'")

    if len(filtered_analyses) != len(analyses):
        print(f"Warning: Filtered out {len(analyses) - len(filtered_analyses)} analyses with incorrect mode for '{expected_mode}'")

    if not filtered_analyses:
        print(f"Error: No valid analyses for mode '{expected_mode}' after filtering")
        return {}

    metrics = calculate_metrics(filtered_analyses)

    print(f"\nMetrics for {expected_mode}:")
    for model, model_metrics in metrics.items():
        analysis_count = sum(
            1 for a in filtered_analyses if a.get('_metadata', {}).get('model') == model
        )
        print(f"  - {model}: {analysis_count} analyses")
        # The 'N/A' fallback is applied by fmt(); a numeric format spec cannot
        # be applied to a string default directly.
        print(f"    Top metrics: hit_rate={fmt(model_metrics.get('top1_accuracy', 'N/A'))}, "
              f"pixel_diff={fmt(model_metrics.get('avg_predicted_diff', 'N/A'))}")

    return metrics

def generate_report():
    """Generate the HTML report for all test cases"""
    print(f"Generating report...")
    
    parser = argparse.ArgumentParser(description='Generate snapshot analysis report')
    parser.add_argument('--debug', action='store_true', help='Debug mode: process only first 2 test directories')
    parser.add_argument('--verbose', action='store_true', help='Print verbose debug information')
    args = parser.parse_args()
    
    test_dirs = [d for d in glob.glob(os.path.join(DATASET_DIR, "*")) if os.path.isdir(d)]
    
    if not test_dirs:
        print("No test directories found")
        return
        
    if args.debug:
        test_dirs = test_dirs[:2]
        print(f"🐞 DEBUG MODE: Processing only {len(test_dirs)} test directories")
    
    analyses_by_test = load_analyses_by_test_dir()
    
    if args.verbose:
        print("\nAnalyses by Test (first level structure):")
        for test_dir, modes in analyses_by_test.items():
            print(f"  {test_dir}:")
            for mode, models in modes.items():
                print(f"    {mode}: {', '.join(models.keys())}")
    
    all_models = set()
    test_cases_html = []
    
    standard_analyses = []
    ignore_from_analysis_list = []
    ignore_from_metadata_list = []
    analyze_and_ignore_list = []
    
    mode_counts = {
        "standard": 0,
        "ignore_from_analysis": 0,
        "ignore_from_metadata": 0,
        "analyze_and_ignore": 0
    }
    
    unique_tests_by_mode = {
        "standard": set(),
        "ignore_from_analysis": set(),
        "ignore_from_metadata": set(),
        "analyze_and_ignore": set()
    }
    
    for test_dir in test_dirs:
        dir_name = os.path.basename(test_dir)
        print(f"Processing {dir_name}...")
        
        reference_path = os.path.join(test_dir, "reference.png")
        failure_path = os.path.join(test_dir, "failure.png")
        diff_path = os.path.join(test_dir, "diff.png")
        metadata_path = os.path.join(test_dir, "metadata.json")
        
        if not all(os.path.exists(p) for p in [reference_path, failure_path, diff_path, metadata_path]):
            print(f"Skipping {dir_name} - missing required files")
            continue
        
        try:
            with open(metadata_path, 'r') as f:
                metadata = json.load(f)
            flag_name = metadata.get("flagName", "N/A")
            categories = metadata.get("categories", [])
        except Exception as e:
            print(f"Error reading metadata for {dir_name}: {e}")
            flag_name = "Error reading metadata"
            categories = []
        
        categories_html = ""
        for category in categories:
            categories_html += f'<span class="category-tag">{category}</span>'
        
        reference_b64 = image_to_base64(reference_path)
        failure_b64 = image_to_base64(failure_path)
        diff_b64 = image_to_base64(diff_path)
        
        full_analysis_html = ""
        
        if dir_name in analyses_by_test:
            test_analyses = analyses_by_test[dir_name]
            
            for mode, title in ANALYSIS_MODES.items():
                if mode in test_analyses:
                    mode_analyses = test_analyses[mode]
                    
                    models_processed = []
                    
                    analyses = []
                    for model, analysis in mode_analyses.items():
                        models_processed.append(model)
                        analysis["_model"] = model
                        analyses.append(analysis)
                        
                        if mode == "standard":
                            standard_analyses.append(analysis)
                            unique_tests_by_mode["standard"].add(dir_name)
                            mode_counts["standard"] += 1
                        elif mode == "ignore_from_analysis":
                            ignore_from_analysis_list.append(analysis)
                            unique_tests_by_mode["ignore_from_analysis"].add(dir_name)
                            mode_counts["ignore_from_analysis"] += 1 
                        elif mode == "ignore_from_metadata":
                            ignore_from_metadata_list.append(analysis)
                            unique_tests_by_mode["ignore_from_metadata"].add(dir_name)
                            mode_counts["ignore_from_metadata"] += 1
                        elif mode == "analyze_and_ignore":
                            analyze_and_ignore_list.append(analysis)
                            unique_tests_by_mode["analyze_and_ignore"].add(dir_name)
                            mode_counts["analyze_and_ignore"] += 1
                            
                        all_models.add(model)
                    
                    analyses.sort(key=lambda x: 0 if "12b" in x.get("_model", "") else 1)
                    
                    full_analysis_html += f"""
                        <div class="analysis-section">
                            <h3>{title}</h3>
                    """
                    
                    for analysis in analyses:
                        model = analysis.get("_model", "unknown")
                        metadata = analysis.get("_metadata", {})
                        timestamp = metadata.get("timestamp", "")
                        
                        formatted_time = "Unknown time"
                        if timestamp:
                            try:
                                dt = parse_timestamps(timestamp)
                                if dt:
                                    formatted_time = dt.strftime("%Y-%m-%d %H:%M:%S")
                            except Exception:
                                pass
                        
                        reasons = analysis.get("reasons", [])
                        reasons_html = ""
                        for reason in reasons:
                            reasons_html += format_reason_tag(reason, categories) + " "
                        
                        pixel_diff = analysis.get("pixel_diff", "N/A")
                        ground_truth_diff = analysis.get("_metadata", {}).get("ground_truth", {}).get("pixel_difference", "N/A")
                        semantic_diff = analysis.get("semantic_diff", "N/A")
                        explanation = analysis.get("explanation", "No explanation provided")
                        affected_elements = analysis.get("affected_elements", [])
                        affected_html = ""
                        for element in affected_elements:
                            affected_html += f'<span class="element-tag">{element}</span> '
                            
                        ignored_reason_html = ""
                        if mode in ["ignore_from_analysis", "ignore_from_metadata", "analyze_and_ignore"]:
                            ignored_reason = metadata.get("ignored_reason", "Unknown")
                            ignored_reason_html = f"""
                                <div class="ignored-reason">
                                    <span class="metric-label">Ignored:</span>
                                    <span class="metric-value">{ignored_reason}</span>
                                </div>
                            """
                        
                        model_analysis_html = f"""
                            <div class="model-analysis">
                                <h3 class="model-heading">{model.replace('_', ':')} (Generated: {formatted_time})</h3>
                                
                                {ignored_reason_html}
                                
                                <div class="metrics-section">
                                    <div class="metric">
                                        <span class="metric-label">Pixel Difference (Model):</span>
                                        <span class="metric-value">{pixel_diff}</span>
                                    </div>
                                    <div class="metric">
                                        <span class="metric-label">Pixel Difference (Ground Truth):</span>
                                        <span class="metric-value">{ground_truth_diff}</span>
                                    </div>
                                    <div class="metric">
                                        <span class="metric-label">Semantic Difference:</span>
                                        <span class="metric-value">{semantic_diff}</span>
                                    </div>
                                </div>
                                
                                <div class="reasons">
                                    <h4>Detected Issues:</h4>
                                    <div class="tag-container">
                                        {reasons_html or '<p>No issues detected</p>'}
                                    </div>
                                </div>
                                
                                <div class="affected-elements">
                                    <h4>Affected UI Elements:</h4>
                                    <div class="tag-container">
                                        {affected_html or '<p>No specific elements identified</p>'}
                                    </div>
                                </div>
                                
                                <div class="explanation">
                                    <h4>Explanation:</h4>
                                    <p>{explanation}</p>
                                </div>
                            </div>
                        """
                        full_analysis_html += model_analysis_html
                    
                    full_analysis_html += "</div>"
        
        if not full_analysis_html:
            full_analysis_html = """
                <div class="empty-state">
                    <h3>No analysis results available</h3>
                    <p>Run an analysis on this test case to see results here.</p>
                </div>
            """
        
        test_case_html = f"""
            <div class="test-case">
                <div class="test-header">
                    <div class="test-name">{dir_name}</div>
                    <div class="test-meta">{flag_name}</div>
                </div>
                <div class="test-content" style="display: none;">
                    <div class="test-metadata">
                        <h3>Test Metadata</h3>
                        <p><strong>Flag Name:</strong> {flag_name}</p>
                        <div class="categories">
                            <strong>Categories:</strong> {categories_html or '<span>None</span>'}
                        </div>
                    </div>
                    <div class="images-container">
                        <div class="image-box">
                            <h3>Reference</h3>
                            <img src="data:image/png;base64,{reference_b64}" alt="Reference">
                        </div>
                        <div class="image-box">
                            <h3>Failure</h3>
                            <img src="data:image/png;base64,{failure_b64}" alt="Failure">
                        </div>
                        <div class="image-box">
                            <h3>Diff</h3>
                            <img src="data:image/png;base64,{diff_b64}" alt="Diff">
                        </div>
                    </div>
                    
                    <div class="analysis-container">
                        <div class="analysis-header">
                            <h2>Analysis Results</h2>
                        </div>
                        {full_analysis_html}
                    </div>
                </div>
            </div>
        """
        test_cases_html.append(test_case_html)
    
    print("\n--- Analysis Mode Counts ---")
    for mode, count in mode_counts.items():
        print(f"{mode}: {count} analyses")
        
    print("\n--- Unique Test Cases by Mode ---")
    for mode, tests in unique_tests_by_mode.items():
        print(f"{mode}: {len(tests)} test cases")
    
    print("\n--- Flattened Analysis Counts ---")
    print(f"Standard: {len(standard_analyses)}")
    print(f"Ignore from analysis: {len(ignore_from_analysis_list)}")
    print(f"Ignore from metadata: {len(ignore_from_metadata_list)}")
    print(f"Analyze and ignore: {len(analyze_and_ignore_list)}")
    
    print("\n--- Checking for analysis overlaps ---")
    def count_overlaps(list1, list2, name1, name2):
        list1_ids = set()
        list2_ids = set()
        
        for a in list1:
            if "_metadata" in a and "id" in a["_metadata"]:
                list1_ids.add(a["_metadata"]["id"])
            else:
                list1_ids.add(str(a))
        
        for a in list2:
            if "_metadata" in a and "id" in a["_metadata"]:
                list2_ids.add(a["_metadata"]["id"])
            else:
                list2_ids.add(str(a))
        
        overlaps = list1_ids.intersection(list2_ids)
        if overlaps:
            print(f"WARNING: Found {len(overlaps)} overlapping analyses between {name1} and {name2}")
            print(f"First overlap ID: {next(iter(overlaps))}")
    
    count_overlaps(ignore_from_metadata_list, analyze_and_ignore_list, "ignore_from_metadata", "analyze_and_ignore")
    count_overlaps(ignore_from_analysis_list, ignore_from_metadata_list, "ignore_from_analysis", "ignore_from_metadata")
    count_overlaps(ignore_from_analysis_list, analyze_and_ignore_list, "ignore_from_analysis", "analyze_and_ignore")
    
    print("\n=== CALCULATING METRICS ===")
    print(f"Standard analyses: {len(standard_analyses)}")
    print(f"Ignore from analysis: {len(ignore_from_analysis_list)}")
    print(f"Ignore from metadata: {len(ignore_from_metadata_list)}")
    print(f"Analyze and ignore: {len(analyze_and_ignore_list)}")
    
    print("\nCalculating standard metrics...")
    standard_metrics = calculate_metrics_for_mode(standard_analyses, "standard")
    
    print("\nCalculating ignore_from_analysis metrics...")
    ignore_from_analysis_metrics = calculate_metrics_for_mode(ignore_from_analysis_list, "ignore_from_analysis")
    
    print("\nCalculating ignore_from_metadata metrics...")
    ignore_from_metadata_metrics = calculate_metrics_for_mode(ignore_from_metadata_list, "ignore_from_metadata")
    
    print("\nCalculating analyze_and_ignore metrics...")
    analyze_and_ignore_metrics = calculate_metrics_for_mode(analyze_and_ignore_list, "analyze_and_ignore")
    
    print("\n--- Generated Metrics Sections ---")
    print(f"Standard metrics: {'Not empty' if standard_metrics else 'Empty'}")
    print(f"Ignore from analysis metrics: {'Not empty' if ignore_from_analysis_metrics else 'Empty'}")
    print(f"Ignore from metadata metrics: {'Not empty' if ignore_from_metadata_metrics else 'Empty'}")
    print(f"Analyze and ignore metrics: {'Not empty' if analyze_and_ignore_metrics else 'Empty'}")
    
    if not standard_metrics:
        standard_metrics = {"gemma3:4b": {"error": "No metrics available"}}
    if not ignore_from_analysis_metrics:
        ignore_from_analysis_metrics = {"gemma3:4b": {"error": "No metrics available"}}
    if not ignore_from_metadata_metrics:
        ignore_from_metadata_metrics = {"gemma3:4b": {"error": "No metrics available"}}
    if not analyze_and_ignore_metrics:
        analyze_and_ignore_metrics = {"gemma3:4b": {"error": "No metrics available"}}
    
    metrics_grid_html = f"""
        <div class="all-analyses-grid">
            <div class="analysis-box">
                <h3>Standard Analysis Metrics</h3>
                {generate_metrics_section(standard_metrics)}
            </div>
            <div class="analysis-box">
                <h3>Ignore From Analysis Metrics</h3>
                {generate_metrics_section(ignore_from_analysis_metrics)}
            </div>
            <div class="analysis-box">
                <h3>Ignore From Metadata Metrics</h3>
                {generate_metrics_section(ignore_from_metadata_metrics)}
            </div>
            <div class="analysis-box">
                <h3>Analyze and Ignore Metrics</h3>
                {generate_metrics_section(analyze_and_ignore_metrics)}
            </div>
        </div>
    """
    
    all_models = set()
    for analyses in [standard_analyses, ignore_from_analysis_list, ignore_from_metadata_list, analyze_and_ignore_list]:
        for analysis in analyses:
            model = analysis.get("_metadata", {}).get("model", "unknown")
            all_models.add(model)
    
    sorted_models = sorted(all_models, key=lambda x: 0 if "12b" in x else 1)
    model_options_html = "\n".join([f'<option value="{model}">{model}</option>' for model in sorted_models])
    
    tab_layout = f"""
    <div class="tab-container">
        <div class="tab-navigation">
            <button class="tab-button active" data-tab="metrics">Metrics</button>
            <button class="tab-button" data-tab="runs">Test Runs</button>
        </div>
        
        <div id="metrics-tab" class="tab-content active">
            <div class="metrics-grid-container">
                {metrics_grid_html}
            </div>
        </div>
        
        <div id="runs-tab" class="tab-content">
            <div class="filter-container">
                <div>
                    <label for="modelFilter">Filter by Model:</label>
                    <select id="modelFilter">
                        <option value="all">All Models</option>
                        {model_options_html}
                    </select>
                </div>
                <div>
                    <button id="expandAll">Expand All</button>
                    <button id="collapseAll">Collapse All</button>
                </div>
            </div>
            
            <div id="test-cases-container">
                {"".join(test_cases_html)}
            </div>
        </div>
    </div>
    """
    
    html_template = generate_html()
    html_content = html_template.replace("@GENERATION_TIME@", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    html_content = html_content.replace("@TEST_COUNT@", str(len(test_cases_html)))
    html_content = html_content.replace("@TAB_CONTENT@", tab_layout)
    
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    report_path = os.path.join(REPORT_DIR, f"snapshot_report_{timestamp}.html")
    
    with open(report_path, 'w') as f:
        f.write(html_content)
    
    print(f"✅ Report generated successfully at: {report_path}")
    
    try:
        import webbrowser
        print(f"Opening report in your default browser...")
        webbrowser.open(f"file://{os.path.abspath(report_path)}")
    except Exception as e:
        print(f"Could not automatically open the report: {e}")
        print(f"Please open the file manually in your browser.")
    
    return report_path

def get_latest_analysis(analyses):
    """Pick the analysis carrying the greatest ``_metadata.timestamp``.

    Entries without a timestamp are treated as having an empty-string
    timestamp. A candidate replaces the current pick when no truthy
    timestamp has been recorded so far, or when its timestamp compares
    strictly greater than the recorded one (so equal timestamps keep the
    earlier entry).

    Returns ``None`` when ``analyses`` is empty or otherwise falsy.
    """
    newest = None
    newest_stamp = None

    for candidate in analyses or ():
        stamp = candidate.get("_metadata", {}).get("timestamp", "")
        # Skip candidates that do not beat an already-recorded timestamp.
        if newest_stamp and not stamp > newest_stamp:
            continue
        newest, newest_stamp = candidate, stamp

    return newest

def get_latest_analysis_by_mode(analyses, mode):
    """Pick the newest analysis whose ``_metadata.analysis_mode`` equals ``mode``.

    Only entries recorded under ``mode`` are considered. Among those, an
    entry replaces the current pick when no truthy timestamp has been
    recorded so far, or when its ``_metadata.timestamp`` (empty string when
    absent) compares strictly greater than the recorded one.

    Returns ``None`` when ``analyses`` is falsy or nothing matches ``mode``.
    """
    if not analyses:
        return None

    winner = None
    winner_stamp = None

    for entry in analyses:
        meta = entry.get("_metadata", {})
        if meta.get("analysis_mode") != mode:
            continue

        stamp = meta.get("timestamp", "")
        # Equal timestamps keep the earlier entry; only a strictly greater one wins.
        if winner_stamp and not stamp > winner_stamp:
            continue
        winner, winner_stamp = entry, stamp

    return winner

def generate_html():
    """Return the static HTML page template for the snapshot report.

    The template embeds all CSS and JavaScript inline and contains three
    placeholders that the caller is expected to substitute with
    ``str.replace``:

    * ``@GENERATION_TIME@`` - human-readable generation timestamp
    * ``@TEST_COUNT@`` - number of test cases in the report
    * ``@TAB_CONTENT@`` - the tab container markup (metrics + test runs)

    The embedded script wires up tab switching, the model filter,
    expand/collapse-all buttons and per-test-case header toggling.

    Returns:
        The complete HTML document as a string.
    """
    html_content = """<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Snapshot Analysis Report</title>
    <style>
        body {
            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
            line-height: 1.6;
            color: #333;
            max-width: 1200px;
            margin: 0 auto;
            padding: 20px;
            background-color: #f5f5f5;
        }
        
        h1, h2, h3 {
            color: #2c3e50;
            margin-bottom: 15px;
        }
        
        h1 {
            text-align: center;
            border-bottom: 2px solid #3498db;
            padding-bottom: 10px;
        }
        
        .tab-container {
            width: 100%;
            margin-bottom: 30px;
        }
        
        .tab-navigation {
            display: flex;
            background: #f8f9fa;
            border-radius: 8px 8px 0 0;
            overflow: hidden;
            border-bottom: 2px solid #3498db;
        }
        
        .tab-button {
            padding: 15px 30px;
            background: #f8f9fa;
            border: none;
            cursor: pointer;
            font-size: 16px;
            font-weight: 500;
            color: #7f8c8d;
            transition: all 0.3s ease;
        }
        
        .tab-button:hover {
            background: #edf0f2;
            color: #2c3e50;
        }
        
        .tab-button.active {
            background: #3498db;
            color: white;
        }
        
        .tab-content {
            display: none;
            padding: 20px;
            background: white;
            border-radius: 0 0 8px 8px;
            box-shadow: 0 2px 4px rgba(0,0,0,0.05);
        }
        
        .tab-content.active {
            display: block;
        }
        
        .metrics-grid-container {
            width: 100%;
            background: white;
            padding: 20px 0;
            margin-bottom: 20px;
        }
        
        .all-analyses-grid {
            display: grid;
            grid-template-columns: repeat(4, 1fr);
            gap: 20px;
        }
        
        .test-case {
            background-color: white;
            margin-bottom: 25px;
            border-radius: 8px;
            overflow: hidden;
            box-shadow: 0 4px 8px rgba(0,0,0,0.1);
        }
        
        .test-header {
            background-color: #3498db;
            color: white;
            padding: 15px;
            font-size: 18px;
            font-weight: bold;
            display: flex;
            justify-content: space-between;
            align-items: center;
            cursor: pointer;
        }
        
        .test-content {
            padding: 20px;
            display: none;
        }
        
        .test-metadata {
            background-color: #f8f9fa;
            padding: 10px 15px;
            border-bottom: 1px solid #eee;
            margin-bottom: 15px;
        }
        
        .test-metadata h3 {
            margin-top: 0;
            margin-bottom: 10px;
            font-size: 16px;
            color: #2c3e50;
        }
        
        .category-tag {
            display: inline-block;
            background-color: #9b59b6;
            color: white;
            padding: 3px 8px;
            border-radius: 4px;
            margin-right: 5px;
            margin-bottom: 5px;
            font-size: 12px;
        }
        
        .images-container {
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
            gap: 20px;
            margin-bottom: 20px;
        }
        
        .image-box {
            text-align: center;
        }
        
        .image-box img {
            max-width: 100%;
            border: 1px solid #ddd;
            border-radius: 4px;
            box-shadow: 0 2px 4px rgba(0,0,0,0.05);
        }
        
        .filter-container {
            display: flex;
            justify-content: space-between;
            align-items: center;
            margin-bottom: 20px;
            background-color: white;
            padding: 15px;
            border-radius: 8px;
            box-shadow: 0 2px 4px rgba(0,0,0,0.05);
        }
        
        select, button {
            padding: 8px 12px;
            border-radius: 4px;
            border: 1px solid #ddd;
            background-color: white;
            font-family: inherit;
        }
        
        button {
            background-color: #3498db;
            color: white;
            border: none;
            cursor: pointer;
            transition: background-color 0.3s;
        }
        
        button:hover {
            background-color: #2980b9;
        }
        
        .analysis-box {
            background: #f8f9fa;
            padding: 15px;
            border-radius: 6px;
            box-shadow: 0 1px 3px rgba(0,0,0,0.1);
        }
        
        .analysis-box h3 {
            margin-top: 0;
            margin-bottom: 15px;
            padding-bottom: 10px;
            border-bottom: 2px solid #3498db;
            color: #2c3e50;
            font-size: 1.1em;
        }
        
        .analysis-section {
            margin-bottom: 25px;
            padding: 15px;
            background-color: #f8f9fa;
            border-radius: 6px;
        }
        
        .analysis-section h3 {
            margin-top: 0;
            margin-bottom: 15px;
            padding-bottom: 8px;
            border-bottom: 1px solid #ddd;
            color: #2c3e50;
        }
        
        .empty-state {
            padding: 20px;
            text-align: center;
            background-color: #f8f9fa;
            border-radius: 6px;
            color: #7f8c8d;
        }
        
        .ignored-reason {
            margin: 10px 0;
            padding: 8px;
            background-color: #f8d7da;
            border-radius: 4px;
            color: #721c24;
        }
        
        .model-metrics {
            background-color: white;
            padding: 15px;
            border-radius: 8px;
            box-shadow: 0 2px 4px rgba(0,0,0,0.05);
            margin-bottom: 20px;
        }
        
        .metrics-container {
            margin: 0;
            padding: 15px;
            background: #f8f9fa;
            border-radius: 8px;
        }
        
        .metrics-grid {
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
            gap: 20px;
        }
        
        .metric-group {
            margin: 15px 0;
        }
        
        .metric-group h4 {
            color: #555;
            margin-bottom: 10px;
            padding-bottom: 5px;
            border-bottom: 1px solid #eee;
        }
        
        .metric {
            display: flex;
            justify-content: space-between;
            margin: 8px 0;
        }
        
        .metric-label {
            color: #666;
        }
        
        .metric-value {
            font-weight: 500;
            color: #2c3e50;
        }
        
        .reason-tag, .element-tag {
            display: inline-block;
            padding: 4px 8px;
            margin: 2px 4px;
            border-radius: 4px;
            font-size: 13px;
            background: #e9f5fe;
            border: 1px solid #bfe2ff;
            color: #3498db;
        }
        
        .unknown-reason {
            background: #fff4e9;
            border: 1px solid #ffd6a5;
            color: #e67e22;
        }
        
        .tag-container {
            display: flex;
            flex-wrap: wrap;
            gap: 6px;
            margin: 8px 0;
        }
        
        .element-tag {
            background: #f0f8ff;
            border: 1px solid #cfe2f3;
            color: #5a7b9c;
        }
        
        .report-meta {
            text-align: center;
            margin-bottom: 20px;
            color: #7f8c8d;
        }
        
        @media (max-width: 1200px) {
            .all-analyses-grid {
                grid-template-columns: repeat(2, 1fr);
            }
        }
        
        @media (max-width: 768px) {
            .all-analyses-grid {
                grid-template-columns: 1fr;
            }
        }
    </style>
</head>
<body>
    <h1>Snapshot Analysis Report</h1>
    
    <div class="report-meta">
        <p>Generated on: @GENERATION_TIME@</p>
        <p>Total test cases: @TEST_COUNT@</p>
    </div>
    
    @TAB_CONTENT@
    
    <script>
    document.addEventListener('DOMContentLoaded', function() {
        const tabButtons = document.querySelectorAll('.tab-button');
        const tabContents = document.querySelectorAll('.tab-content');
        
        tabButtons.forEach(button => {
            button.addEventListener('click', function() {
                const tabId = this.getAttribute('data-tab');
                
                tabButtons.forEach(btn => btn.classList.remove('active'));
                tabContents.forEach(content => content.classList.remove('active'));
                
                this.classList.add('active');
                document.getElementById(tabId + '-tab').classList.add('active');
            });
        });
        
        const modelFilter = document.getElementById('modelFilter');
        if (modelFilter) {
            modelFilter.addEventListener('change', function() {
                const selectedModel = this.value;
                // Decide each test case's visibility once, after all of its
                // analyses have been updated, so the result does not depend
                // on the order of the analyses inside the test case.
                document.querySelectorAll('.test-case').forEach(testCase => {
                    const analyses = testCase.querySelectorAll('.model-analysis');
                    let anyVisible = false;
                    
                    analyses.forEach(analysis => {
                        const modelHeading = analysis.querySelector('.model-heading');
                        const modelText = modelHeading ? modelHeading.textContent : '';
                        const matches = selectedModel === 'all' || modelText.includes(selectedModel);
                        
                        analysis.style.display = matches ? 'block' : 'none';
                        if (matches) anyVisible = true;
                    });
                    
                    // Test cases without any analyses are never filtered out.
                    if (analyses.length > 0) {
                        testCase.style.display = anyVisible ? 'block' : 'none';
                    }
                });
            });
        }
        
        const expandAll = document.getElementById('expandAll');
        const collapseAll = document.getElementById('collapseAll');
        
        if (expandAll) {
            expandAll.addEventListener('click', function() {
                document.querySelectorAll('.test-content').forEach(content => {
                    content.style.display = 'block';
                });
            });
        }
        
        if (collapseAll) {
            collapseAll.addEventListener('click', function() {
                document.querySelectorAll('.test-content').forEach(content => {
                    content.style.display = 'none';
                });
            });
        }
        
        document.querySelectorAll('.test-header').forEach(header => {
            header.addEventListener('click', function() {
                const content = this.nextElementSibling;
                if (content.style.display === 'none' || content.style.display === '') {
                    content.style.display = 'block';
                } else {
                    content.style.display = 'none';
                }
            });
        });
    });
    </script>
</body>
</html>"""
    return html_content

def generate_standard_analysis_html(analysis):
    """Render an HTML summary fragment for a standard-mode analysis.

    Args:
        analysis: Mapping that may contain ``reasons`` (an iterable of
            values rendered as tags), ``pixel_diff`` and ``semantic_diff``.
            Missing diff values are shown as ``N/A``; a missing or ``None``
            ``reasons`` entry renders no tags.

    Returns:
        An HTML fragment string, or a placeholder paragraph when
        ``analysis`` is falsy.
    """
    # Function-scope import: a bare ``html`` name is easy to shadow with a
    # local variable, so only ``escape`` is brought into scope here.
    from html import escape

    if not analysis:
        return "<p>No standard analysis available</p>"

    def as_text(value):
        # Values are placed in element text, so escaping &, < and > is
        # enough to stop them from being parsed as markup.
        return escape(str(value), quote=False)

    reasons_html = "".join(
        f'<span class="reason-tag">{as_text(reason)}</span>'
        for reason in analysis.get("reasons") or []
    )

    pixel_diff = as_text(analysis.get("pixel_diff", "N/A"))
    semantic_diff = as_text(analysis.get("semantic_diff", "N/A"))

    return f"""
        <div class="analysis-summary">
            <div class="metric">
                <span class="metric-label">Pixel Diff:</span>
                <span class="metric-value">{pixel_diff}</span>
            </div>
            <div class="metric">
                <span class="metric-label">Semantic Diff:</span>
                <span class="metric-value">{semantic_diff}</span>
            </div>
            <div class="reasons">
                <h4>Reasons:</h4>
                {reasons_html}
            </div>
        </div>
    """

def generate_ignore_analysis_html(analysis, type_name):
    """Render an HTML summary fragment for an ignore-style analysis.

    Args:
        analysis: Mapping that may contain ``_metadata.ignored_reason``
            (shown as ``Unknown`` when absent) and ``reasons`` (an iterable
            of values rendered as tags). ``None`` for either ``_metadata``
            or ``reasons`` is treated as absent.
        type_name: Label inserted into the placeholder message returned
            when ``analysis`` is falsy.

    Returns:
        An HTML fragment string. When there are no reasons, a
        "No remaining issues" paragraph is shown instead of tags.
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    if not analysis:
        return f"<p>No {type_name} analysis available</p>"

    def as_text(value):
        # Values are placed in element text, so escaping &, < and > is
        # enough to stop them from being parsed as markup.
        return escape(str(value), quote=False)

    metadata = analysis.get("_metadata") or {}
    ignored_reason = as_text(metadata.get("ignored_reason", "Unknown"))
    remaining_html = "".join(
        f'<span class="reason-tag">{as_text(reason)}</span>'
        for reason in analysis.get("reasons") or []
    )

    return f"""
        <div class="analysis-summary">
            <div class="ignored-reason">
                <span class="metric-label">Ignored:</span>
                <span class="metric-value">{ignored_reason}</span>
            </div>
            <div class="reasons">
                <h4>Remaining Issues:</h4>
                {remaining_html or '<p>No remaining issues</p>'}
            </div>
        </div>
    """

if __name__ == "__main__":
    # Script entry point: build the report, then point the user at the file
    # only when a path was actually returned.
    generated_report = generate_report()
    if generated_report:
        print(f"\nOpen this file in a web browser to view the report:\n{generated_report}")