Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 83.5784 ± 2.9717
total_entries: 136.0000 ± 0.0000
correct_matches: 113.6667 ± 4.0415
average_execution_time: 1.6818 ± 0.0416
metrics_by_complexity:
  simple:
    count: 116.0000 ± 0.0000
    full_matches: 107.0000 ± 2.6458
    exact_code_matches: 97.6667 ± 2.5166
    exact_code_match_rate: 84.1954 ± 2.1695
    pv_exact_matched: 109.0000 ± 2.6458
    pv_exact_mismatched: 7.0000 ± 2.6458
    pv_exact_match_rate: 93.9655 ± 2.2808
    average_pv_match_rate: 95.7328 ± 2.2432
    average_pv_mismatch_rate: 4.2672 ± 2.2432
    timing_matched: 108.3333 ± 2.5166
    timing_mismatched: 7.6667 ± 2.5166
    timing_match_rate: 93.3908 ± 2.1695
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 107.0000 ± 2.6458
    full_mismatched: 9.0000 ± 2.6458
    full_match_rate: 92.2414 ± 2.2808
    accuracy: 92.2414 ± 2.2808
    average_timing_score: 0.9534 ± 0.0217
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.9565 ± 0.0222
    average_best_codebleu:
      average_CodeBLEU: 0.5950 ± 0.0094
      comb_7_CodeBLEU: 0.8216 ± 0.0103
      dataflow_match_score: 0.2759 ± 0.0000
      ngram_match_score: 0.2564 ± 0.0095
      syntax_match_score: 0.9454 ± 0.0219
      weighted_ngram_match_score: 0.1782 ± 0.0071
    average_best_levenshtein: 5.9080 ± 1.3090
    average_best_normalized_levenshtein: 0.0653 ± 0.0175
    average_inference_time: 1.2721 ± 0.0817
  complex:
    count: 20.0000 ± 0.0000
    full_matches: 6.6667 ± 1.5275
    exact_code_matches: 0.0000 ± 0.0000
    exact_code_match_rate: 0.0000 ± 0.0000
    pv_exact_matched: 7.6667 ± 2.3094
    pv_exact_mismatched: 12.3333 ± 2.3094
    pv_exact_match_rate: 38.3333 ± 11.5470
    average_pv_match_rate: 83.6603 ± 4.4532
    average_pv_mismatch_rate: 16.3397 ± 4.4532
    timing_matched: 17.0000 ± 1.0000
    timing_mismatched: 3.0000 ± 1.0000
    timing_match_rate: 85.0000 ± 5.0000
    temp_matched: 8.0000 ± 1.0000
    temp_mismatched: 1.0000 ± 1.0000
    temp_match_rate: 88.8889 ± 11.1111
    full_matched: 6.6667 ± 1.5275
    full_mismatched: 13.3333 ± 1.5275
    full_match_rate: 33.3333 ± 7.6376
    accuracy: 33.3333 ± 7.6376
    average_timing_score: 0.8155 ± 0.0232
    average_temp_score: 0.9831 ± 0.0134
    average_full_score: 0.8386 ± 0.0376
    average_best_codebleu:
      average_CodeBLEU: 0.3742 ± 0.0141
      comb_7_CodeBLEU: 0.4977 ± 0.0126
      dataflow_match_score: 0.4867 ± 0.0317
      ngram_match_score: 0.1460 ± 0.0182
      syntax_match_score: 0.5902 ± 0.0237
      weighted_ngram_match_score: 0.1907 ± 0.0164
    average_best_levenshtein: 220.0333 ± 20.7356
    average_best_normalized_levenshtein: 0.5170 ± 0.0118
    average_inference_time: 4.0582 ± 0.3519
==================================================
