Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 88.7255 ± 1.5306
total_entries: 136.0000 ± 0.0000
correct_matches: 120.6667 ± 2.0817
average_execution_time: 5.6023 ± 0.2024
metrics_by_complexity:
  simple:
    count: 116.0000 ± 0.0000
    full_matches: 111.0000 ± 2.0000
    exact_code_matches: 106.3333 ± 2.0817
    exact_code_match_rate: 91.6667 ± 1.7945
    pv_exact_matched: 113.6667 ± 1.5275
    pv_exact_mismatched: 2.3333 ± 1.5275
    pv_exact_match_rate: 97.9885 ± 1.3168
    average_pv_match_rate: 99.0900 ± 0.8419
    average_pv_mismatch_rate: 0.9100 ± 0.8419
    timing_matched: 111.6667 ± 1.1547
    timing_mismatched: 4.3333 ± 1.1547
    timing_match_rate: 96.2644 ± 0.9954
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 111.0000 ± 2.0000
    full_mismatched: 5.0000 ± 2.0000
    full_match_rate: 95.6897 ± 1.7241
    accuracy: 95.6897 ± 1.7241
    average_timing_score: 0.9806 ± 0.0053
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.9888 ± 0.0078
    average_best_codebleu:
      average_CodeBLEU: 0.6320 ± 0.0064
      comb_7_CodeBLEU: 0.8507 ± 0.0042
      dataflow_match_score: 0.3046 ± 0.0100
      ngram_match_score: 0.3120 ± 0.0110
      syntax_match_score: 0.9930 ± 0.0053
      weighted_ngram_match_score: 0.2231 ± 0.0097
    average_best_levenshtein: 1.2328 ± 0.1781
    average_best_normalized_levenshtein: 0.0187 ± 0.0069
    average_inference_time: 4.9973 ± 0.2391
  complex:
    count: 20.0000 ± 0.0000
    full_matches: 9.6667 ± 0.5774
    exact_code_matches: 0.0000 ± 0.0000
    exact_code_match_rate: 0.0000 ± 0.0000
    pv_exact_matched: 11.6667 ± 0.5774
    pv_exact_mismatched: 8.3333 ± 0.5774
    pv_exact_match_rate: 58.3333 ± 2.8868
    average_pv_match_rate: 91.8265 ± 1.3752
    average_pv_mismatch_rate: 8.1735 ± 1.3752
    timing_matched: 16.3333 ± 0.5774
    timing_mismatched: 3.6667 ± 0.5774
    timing_match_rate: 81.6667 ± 2.8868
    temp_matched: 8.3333 ± 0.5774
    temp_mismatched: 0.6667 ± 0.5774
    temp_match_rate: 92.5926 ± 6.4150
    full_matched: 9.6667 ± 0.5774
    full_mismatched: 10.3333 ± 0.5774
    full_match_rate: 48.3333 ± 2.8868
    accuracy: 48.3333 ± 2.8868
    average_timing_score: 0.8555 ± 0.0125
    average_temp_score: 0.9860 ± 0.0115
    average_full_score: 0.9073 ± 0.0131
    average_best_codebleu:
      average_CodeBLEU: 0.4875 ± 0.0116
      comb_7_CodeBLEU: 0.6144 ± 0.0108
      dataflow_match_score: 0.5828 ± 0.0140
      ngram_match_score: 0.2496 ± 0.0160
      syntax_match_score: 0.7150 ± 0.0068
      weighted_ngram_match_score: 0.3028 ± 0.0101
    average_best_levenshtein: 135.7000 ± 14.4074
    average_best_normalized_levenshtein: 0.4077 ± 0.0214
    average_inference_time: 9.1111 ± 0.4007
==================================================
