Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 76.1905 ± 0.0000
total_entries: 126.0000 ± 0.0000
correct_matches: 96.0000 ± 0.0000
average_execution_time: 1.5574 ± 0.1365
metrics_by_complexity:
  simple:
    count: 120.0000 ± 0.0000
    full_matches: 93.0000 ± 0.0000
    pv_exact_matched: 93.0000 ± 0.0000
    pv_exact_mismatched: 27.0000 ± 0.0000
    pv_exact_match_rate: 77.5000 ± 0.0000
    average_pv_match_rate: 77.5000 ± 0.0000
    average_pv_mismatch_rate: 22.5000 ± 0.0000
    timing_matched: 93.0000 ± 0.0000
    timing_mismatched: 27.0000 ± 0.0000
    timing_match_rate: 77.5000 ± 0.0000
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 93.0000 ± 0.0000
    full_mismatched: 27.0000 ± 0.0000
    full_match_rate: 77.5000 ± 0.0000
    accuracy: 77.5000 ± 0.0000
    average_best_codebleu:
      codebleu: 0.5364 ± 0.0000
      ngram_match_score: 0.1529 ± 0.0000
      weighted_ngram_match_score: 0.0726 ± 0.0000
      syntax_match_score: 0.9200 ± 0.0000
      dataflow_match_score: 0.1000 ± 0.0000
    average_best_levenshtein: 3.4917 ± 0.0000
    average_best_normalized_levenshtein: 0.1208 ± 0.0000
    average_execution_time: 1.3770 ± 0.0630
  complex:
    count: 6.0000 ± 0.0000
    full_matches: 3.0000 ± 0.0000
    pv_exact_matched: 3.0000 ± 0.0000
    pv_exact_mismatched: 3.0000 ± 0.0000
    pv_exact_match_rate: 50.0000 ± 0.0000
    average_pv_match_rate: 73.2061 ± 0.0000
    average_pv_mismatch_rate: 26.7939 ± 0.0000
    timing_matched: 3.0000 ± 0.0000
    timing_mismatched: 3.0000 ± 0.0000
    timing_match_rate: 50.0000 ± 0.0000
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 3.0000 ± 0.0000
    full_mismatched: 3.0000 ± 0.0000
    full_match_rate: 50.0000 ± 0.0000
    accuracy: 50.0000 ± 0.0000
    average_best_codebleu:
      codebleu: 0.6893 ± 0.0082
      ngram_match_score: 0.4544 ± 0.0127
      weighted_ngram_match_score: 0.5389 ± 0.0164
      syntax_match_score: 0.8649 ± 0.0024
      dataflow_match_score: 0.8991 ± 0.0012
    average_best_levenshtein: 31.7778 ± 0.1925
    average_best_normalized_levenshtein: 0.1675 ± 0.0018
    average_execution_time: 5.1667 ± 1.6470
==================================================
