Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 90.4412 ± 2.2059
total_entries: 136.0000 ± 0.0000
correct_matches: 123.0000 ± 3.0000
average_execution_time: 4.4297 ± 1.0961
metrics_by_complexity:
  simple:
    count: 116.0000 ± 0.0000
    full_matches: 111.0000 ± 3.0000
    exact_code_matches: 104.0000 ± 1.0000
    exact_code_match_rate: 89.6552 ± 0.8621
    pv_exact_matched: 112.6667 ± 1.1547
    pv_exact_mismatched: 3.3333 ± 1.1547
    pv_exact_match_rate: 97.1264 ± 0.9954
    average_pv_match_rate: 98.8506 ± 0.5973
    average_pv_mismatch_rate: 1.1494 ± 0.5973
    timing_matched: 113.3333 ± 2.5166
    timing_mismatched: 2.6667 ± 2.5166
    timing_match_rate: 97.7011 ± 2.1695
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 111.0000 ± 3.0000
    full_mismatched: 5.0000 ± 3.0000
    full_match_rate: 95.6897 ± 2.5862
    accuracy: 95.6897 ± 2.5862
    average_timing_score: 0.9860 ± 0.0083
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.9880 ± 0.0063
    average_best_codebleu:
      average_CodeBLEU: 0.6255 ± 0.0022
      comb_7_CodeBLEU: 0.8421 ± 0.0014
      dataflow_match_score: 0.3010 ± 0.0087
      ngram_match_score: 0.3074 ± 0.0032
      syntax_match_score: 0.9735 ± 0.0027
      weighted_ngram_match_score: 0.2218 ± 0.0038
    average_best_levenshtein: 1.6695 ± 0.0199
    average_best_normalized_levenshtein: 0.0250 ± 0.0021
    average_inference_time: 3.4277 ± 1.1654
  complex:
    count: 20.0000 ± 0.0000
    full_matches: 12.0000 ± 0.0000
    exact_code_matches: 0.0000 ± 0.0000
    exact_code_match_rate: 0.0000 ± 0.0000
    pv_exact_matched: 13.6667 ± 0.5774
    pv_exact_mismatched: 6.3333 ± 0.5774
    pv_exact_match_rate: 68.3333 ± 2.8868
    average_pv_match_rate: 90.0467 ± 0.7945
    average_pv_mismatch_rate: 9.9533 ± 0.7945
    timing_matched: 16.0000 ± 0.0000
    timing_mismatched: 4.0000 ± 0.0000
    timing_match_rate: 80.0000 ± 0.0000
    temp_matched: 8.0000 ± 0.0000
    temp_mismatched: 1.0000 ± 0.0000
    temp_match_rate: 88.8889 ± 0.0000
    full_matched: 12.0000 ± 0.0000
    full_mismatched: 8.0000 ± 0.0000
    full_match_rate: 60.0000 ± 0.0000
    accuracy: 60.0000 ± 0.0000
    average_timing_score: 0.8869 ± 0.0126
    average_temp_score: 0.9826 ± 0.0018
    average_full_score: 0.9026 ± 0.0070
    average_best_codebleu:
      average_CodeBLEU: 0.4392 ± 0.0197
      comb_7_CodeBLEU: 0.5739 ± 0.0178
      dataflow_match_score: 0.5403 ± 0.0441
      ngram_match_score: 0.1906 ± 0.0142
      syntax_match_score: 0.6706 ± 0.0248
      weighted_ngram_match_score: 0.2384 ± 0.0324
    average_best_levenshtein: 148.9833 ± 1.9788
    average_best_normalized_levenshtein: 0.4742 ± 0.0097
    average_inference_time: 10.2410 ± 0.7069
==================================================
