archived
-------
Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 0.0000 ± 0.0000
total_entries: 136.0000 ± 0.0000
correct_matches: 0.0000 ± 0.0000
average_execution_time: 24.5999 ± 0.0558
metrics_by_complexity:
  simple:
    count: 116.0000 ± 0.0000
    full_matches: 0.0000 ± 0.0000
    exact_code_matches: 0.0000 ± 0.0000
    exact_code_match_rate: 0.0000 ± 0.0000
    pv_exact_matched: 0.0000 ± 0.0000
    pv_exact_mismatched: 116.0000 ± 0.0000
    pv_exact_match_rate: 0.0000 ± 0.0000
    average_pv_match_rate: 0.0000 ± 0.0000
    average_pv_mismatch_rate: 100.0000 ± 0.0000
    timing_matched: 0.0000 ± 0.0000
    timing_mismatched: 116.0000 ± 0.0000
    timing_match_rate: 0.0000 ± 0.0000
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 0.0000 ± 0.0000
    full_mismatched: 116.0000 ± 0.0000
    full_match_rate: 0.0000 ± 0.0000
    accuracy: 0.0000 ± 0.0000
    average_timing_score: 0.0000 ± 0.0000
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.0000 ± 0.0000
    average_best_codebleu:
      average_CodeBLEU: 0.3052 ± 0.0000
      comb_7_CodeBLEU: 0.4883 ± 0.0000
      dataflow_match_score: 0.2586 ± 0.0000
      ngram_match_score: 0.0000 ± 0.0000
      syntax_match_score: 0.2207 ± 0.0000
      weighted_ngram_match_score: 0.0000 ± 0.0000
    average_best_levenshtein: 1486.4914 ± 0.0000
    average_best_normalized_levenshtein: 0.9729 ± 0.0000
    average_inference_time: 21.4821 ± 0.0673
  complex:
    count: 20.0000 ± 0.0000
    full_matches: 0.0000 ± 0.0000
    exact_code_matches: 0.0000 ± 0.0000
    exact_code_match_rate: 0.0000 ± 0.0000
    pv_exact_matched: 0.0000 ± 0.0000
    pv_exact_mismatched: 20.0000 ± 0.0000
    pv_exact_match_rate: 0.0000 ± 0.0000
    average_pv_match_rate: 0.0000 ± 0.0000
    average_pv_mismatch_rate: 100.0000 ± 0.0000
    timing_matched: 0.0000 ± 0.0000
    timing_mismatched: 20.0000 ± 0.0000
    timing_match_rate: 0.0000 ± 0.0000
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 9.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 0.0000 ± 0.0000
    full_mismatched: 20.0000 ± 0.0000
    full_match_rate: 0.0000 ± 0.0000
    accuracy: 0.0000 ± 0.0000
    average_timing_score: 0.0000 ± 0.0000
    average_temp_score: 0.5500 ± 0.0000
    average_full_score: 0.0000 ± 0.0000
    average_best_codebleu:
      average_CodeBLEU: 0.2349 ± 0.0000
      comb_7_CodeBLEU: 0.3652 ± 0.0000
      dataflow_match_score: 0.6386 ± 0.0000
      ngram_match_score: 0.0024 ± 0.0000
      syntax_match_score: 0.1657 ± 0.0000
      weighted_ngram_match_score: 0.0327 ± 0.0000
    average_best_levenshtein: 1636.0000 ± 0.0000
    average_best_normalized_levenshtein: 0.8852 ± 0.0000
    average_inference_time: 42.6834 ± 0.0289
==================================================
================================================================================

gemini-2.5-pro-native
-------
Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 87.2549 ± 1.6981
average_execution_time: 6.7128 ± 0.1270
correct_matches: 118.6667 ± 2.3094
metrics_by_complexity:
  complex:
    accuracy: 58.3333 ± 2.8868
    average_best_codebleu:
      average_CodeBLEU: 0.4840 ± 0.0196
      comb_7_CodeBLEU: 0.6026 ± 0.0203
      dataflow_match_score: 0.5577 ± 0.0247
      ngram_match_score: 0.2561 ± 0.0174
      syntax_match_score: 0.7057 ± 0.0170
      weighted_ngram_match_score: 0.3166 ± 0.0195
    average_best_levenshtein: 135.1667 ± 13.6520
    average_best_normalized_levenshtein: 0.3965 ± 0.0095
    average_full_score: 0.9031 ± 0.0059
    average_inference_time: 16.2372 ± 0.8937
    average_pv_match_rate: 91.5118 ± 0.6376
    average_pv_mismatch_rate: 8.4882 ± 0.6376
    average_temp_score: 0.9698 ± 0.0001
    average_timing_score: 0.8572 ± 0.0042
    count: 20.0000 ± 0.0000
    exact_code_match_rate: 0.0000 ± 0.0000
    exact_code_matches: 0.0000 ± 0.0000
    full_match_rate: 58.3333 ± 2.8868
    full_matched: 11.6667 ± 0.5774
    full_matches: 11.6667 ± 0.5774
    full_mismatched: 8.3333 ± 0.5774
    pv_exact_match_rate: 63.3333 ± 2.8868
    pv_exact_matched: 12.6667 ± 0.5774
    pv_exact_mismatched: 7.3333 ± 0.5774
    temp_match_rate: 77.7778 ± 0.0000
    temp_matched: 7.0000 ± 0.0000
    temp_mismatched: 2.0000 ± 0.0000
    timing_match_rate: 80.0000 ± 0.0000
    timing_matched: 16.0000 ± 0.0000
    timing_mismatched: 4.0000 ± 0.0000
  simple:
    accuracy: 92.2414 ± 1.4931
    average_best_codebleu:
      average_CodeBLEU: 0.5975 ± 0.0015
      comb_7_CodeBLEU: 0.8189 ± 0.0011
      dataflow_match_score: 0.2385 ± 0.0050
      ngram_match_score: 0.2606 ± 0.0035
      syntax_match_score: 0.9329 ± 0.0035
      weighted_ngram_match_score: 0.1967 ± 0.0038
    average_best_levenshtein: 3.2701 ± 0.0389
    average_best_normalized_levenshtein: 0.0695 ± 0.0007
    average_full_score: 0.9820 ± 0.0038
    average_inference_time: 5.0707 ± 0.0245
    average_pv_match_rate: 98.4195 ± 0.4479
    average_pv_mismatch_rate: 1.5805 ± 0.4479
    average_temp_score: 1.0000 ± 0.0000
    average_timing_score: 0.9734 ± 0.0014
    count: 116.0000 ± 0.0000
    exact_code_match_rate: 71.8391 ± 0.4977
    exact_code_matches: 83.3333 ± 0.5774
    full_match_rate: 92.2414 ± 1.4931
    full_matched: 107.0000 ± 1.7321
    full_matches: 107.0000 ± 1.7321
    full_mismatched: 9.0000 ± 1.7321
    pv_exact_match_rate: 95.4023 ± 1.9909
    pv_exact_matched: 110.6667 ± 2.3094
    pv_exact_mismatched: 5.3333 ± 2.3094
    temp_match_rate: 0.0000 ± 0.0000
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    timing_match_rate: 94.2529 ± 0.9954
    timing_matched: 109.3333 ± 1.1547
    timing_mismatched: 6.6667 ± 1.1547
total_entries: 136.0000 ± 0.0000
==================================================
================================================================================

o3-high
-------
Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 88.4804 ± 1.1232
total_entries: 136.0000 ± 0.0000
correct_matches: 120.3333 ± 1.5275
average_execution_time: 6.4398 ± 0.3531
metrics_by_complexity:
  simple:
    count: 116.0000 ± 0.0000
    full_matches: 111.3333 ± 0.5774
    exact_code_matches: 104.0000 ± 2.0000
    exact_code_match_rate: 89.6552 ± 1.7241
    pv_exact_matched: 114.6667 ± 0.5774
    pv_exact_mismatched: 1.3333 ± 0.5774
    pv_exact_match_rate: 98.8506 ± 0.4977
    average_pv_match_rate: 99.4253 ± 0.4977
    average_pv_mismatch_rate: 0.5747 ± 0.4977
    timing_matched: 111.3333 ± 0.5774
    timing_mismatched: 4.6667 ± 0.5774
    timing_match_rate: 95.9770 ± 0.4977
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 111.3333 ± 0.5774
    full_mismatched: 4.6667 ± 0.5774
    full_match_rate: 95.9770 ± 0.4977
    accuracy: 95.9770 ± 0.4977
    average_timing_score: 0.9802 ± 0.0044
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.9914 ± 0.0048
    average_best_codebleu:
      average_CodeBLEU: 0.6203 ± 0.0049
      comb_7_CodeBLEU: 0.8467 ± 0.0021
      dataflow_match_score: 0.2730 ± 0.0050
      ngram_match_score: 0.2847 ± 0.0118
      syntax_match_score: 0.9952 ± 0.0005
      weighted_ngram_match_score: 0.2015 ± 0.0074
    average_best_levenshtein: 2.0833 ± 0.8539
    average_best_normalized_levenshtein: 0.0251 ± 0.0050
    average_inference_time: 4.8513 ± 0.3657
  complex:
    count: 20.0000 ± 0.0000
    full_matches: 9.0000 ± 1.7321
    exact_code_matches: 0.0000 ± 0.0000
    exact_code_match_rate: 0.0000 ± 0.0000
    pv_exact_matched: 9.6667 ± 1.5275
    pv_exact_mismatched: 10.3333 ± 1.5275
    pv_exact_match_rate: 48.3333 ± 7.6376
    average_pv_match_rate: 84.4687 ± 2.7633
    average_pv_mismatch_rate: 15.5313 ± 2.7633
    timing_matched: 15.6667 ± 1.1547
    timing_mismatched: 4.3333 ± 1.1547
    timing_match_rate: 78.3333 ± 5.7735
    temp_matched: 8.3333 ± 1.1547
    temp_mismatched: 0.6667 ± 1.1547
    temp_match_rate: 92.5926 ± 12.8300
    full_matched: 9.0000 ± 1.7321
    full_mismatched: 11.0000 ± 1.7321
    full_match_rate: 45.0000 ± 8.6603
    accuracy: 45.0000 ± 8.6603
    average_timing_score: 0.8061 ± 0.0266
    average_temp_score: 0.9874 ± 0.0150
    average_full_score: 0.8471 ± 0.0160
    average_best_codebleu:
      average_CodeBLEU: 0.4334 ± 0.0182
      comb_7_CodeBLEU: 0.5672 ± 0.0106
      dataflow_match_score: 0.5379 ± 0.0115
      ngram_match_score: 0.1970 ± 0.0318
      syntax_match_score: 0.6582 ± 0.0060
      weighted_ngram_match_score: 0.2239 ± 0.0304
    average_best_levenshtein: 149.1333 ± 6.4781
    average_best_normalized_levenshtein: 0.4291 ± 0.0153
    average_inference_time: 15.6529 ± 0.3791
==================================================
================================================================================

llama3.3
-------
Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 35.2941 ± 0.0000
total_entries: 136.0000 ± 0.0000
correct_matches: 48.0000 ± 0.0000
average_execution_time: 5.6858 ± 0.1171
metrics_by_complexity:
  simple:
    count: 116.0000 ± 0.0000
    full_matches: 46.0000 ± 0.0000
    exact_code_matches: 41.0000 ± 0.0000
    exact_code_match_rate: 35.3448 ± 0.0000
    pv_exact_matched: 46.0000 ± 0.0000
    pv_exact_mismatched: 70.0000 ± 0.0000
    pv_exact_match_rate: 39.6552 ± 0.0000
    average_pv_match_rate: 51.7241 ± 0.0000
    average_pv_mismatch_rate: 48.2759 ± 0.0000
    timing_matched: 57.0000 ± 0.0000
    timing_mismatched: 59.0000 ± 0.0000
    timing_match_rate: 49.1379 ± 0.0000
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 46.0000 ± 0.0000
    full_mismatched: 70.0000 ± 0.0000
    full_match_rate: 39.6552 ± 0.0000
    accuracy: 39.6552 ± 0.0000
    average_timing_score: 0.5519 ± 0.0000
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.5242 ± 0.0000
    average_best_codebleu:
      average_CodeBLEU: 0.4970 ± 0.0000
      comb_7_CodeBLEU: 0.7630 ± 0.0000
      dataflow_match_score: 0.1293 ± 0.0000
      ngram_match_score: 0.0788 ± 0.0000
      syntax_match_score: 0.8805 ± 0.0000
      weighted_ngram_match_score: 0.0289 ± 0.0000
    average_best_levenshtein: 7.0862 ± 0.0000
    average_best_normalized_levenshtein: 0.2556 ± 0.0000
    average_inference_time: 5.0643 ± 0.0157
  complex:
    count: 20.0000 ± 0.0000
    full_matches: 2.0000 ± 0.0000
    exact_code_matches: 0.0000 ± 0.0000
    exact_code_match_rate: 0.0000 ± 0.0000
    pv_exact_matched: 2.0000 ± 0.0000
    pv_exact_mismatched: 18.0000 ± 0.0000
    pv_exact_match_rate: 10.0000 ± 0.0000
    average_pv_match_rate: 46.6534 ± 0.9618
    average_pv_mismatch_rate: 53.3466 ± 0.9618
    timing_matched: 11.3333 ± 0.5774
    timing_mismatched: 8.6667 ± 0.5774
    timing_match_rate: 56.6667 ± 2.8868
    temp_matched: 3.3333 ± 0.5774
    temp_mismatched: 5.6667 ± 0.5774
    temp_match_rate: 37.0370 ± 6.4150
    full_matched: 2.0000 ± 0.0000
    full_mismatched: 18.0000 ± 0.0000
    full_match_rate: 10.0000 ± 0.0000
    accuracy: 10.0000 ± 0.0000
    average_timing_score: 0.6523 ± 0.0005
    average_temp_score: 0.7371 ± 0.0273
    average_full_score: 0.4950 ± 0.0109
    average_best_codebleu:
      average_CodeBLEU: 0.3829 ± 0.0034
      comb_7_CodeBLEU: 0.4990 ± 0.0039
      dataflow_match_score: 0.4796 ± 0.0012
      ngram_match_score: 0.1773 ± 0.0027
      syntax_match_score: 0.6233 ± 0.0093
      weighted_ngram_match_score: 0.2014 ± 0.0029
    average_best_levenshtein: 170.3833 ± 1.6540
    average_best_normalized_levenshtein: 0.4935 ± 0.0023
    average_inference_time: 9.2909 ± 0.7105
==================================================
================================================================================

athene-v2
-------
Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 46.3235 ± 0.0000
total_entries: 136.0000 ± 0.0000
correct_matches: 63.0000 ± 0.0000
average_execution_time: 5.7335 ± 0.1158
metrics_by_complexity:
  simple:
    count: 116.0000 ± 0.0000
    full_matches: 62.0000 ± 0.0000
    exact_code_matches: 57.0000 ± 0.0000
    exact_code_match_rate: 49.1379 ± 0.0000
    pv_exact_matched: 62.0000 ± 0.0000
    pv_exact_mismatched: 54.0000 ± 0.0000
    pv_exact_match_rate: 53.4483 ± 0.0000
    average_pv_match_rate: 59.9569 ± 0.0000
    average_pv_mismatch_rate: 40.0431 ± 0.0000
    timing_matched: 66.0000 ± 0.0000
    timing_mismatched: 50.0000 ± 0.0000
    timing_match_rate: 56.8966 ± 0.0000
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 62.0000 ± 0.0000
    full_mismatched: 54.0000 ± 0.0000
    full_match_rate: 53.4483 ± 0.0000
    accuracy: 53.4483 ± 0.0000
    average_timing_score: 0.6074 ± 0.0000
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.6011 ± 0.0000
    average_best_codebleu:
      average_CodeBLEU: 0.5149 ± 0.0000
      comb_7_CodeBLEU: 0.7630 ± 0.0000
      dataflow_match_score: 0.2586 ± 0.0000
      ngram_match_score: 0.1263 ± 0.0000
      syntax_match_score: 0.8566 ± 0.0000
      weighted_ngram_match_score: 0.0769 ± 0.0000
    average_best_levenshtein: 19.0431 ± 0.0000
    average_best_normalized_levenshtein: 0.2097 ± 0.0000
    average_inference_time: 5.1991 ± 0.0196
  complex:
    count: 20.0000 ± 0.0000
    full_matches: 1.0000 ± 0.0000
    exact_code_matches: 0.0000 ± 0.0000
    exact_code_match_rate: 0.0000 ± 0.0000
    pv_exact_matched: 1.0000 ± 0.0000
    pv_exact_mismatched: 19.0000 ± 0.0000
    pv_exact_match_rate: 5.0000 ± 0.0000
    average_pv_match_rate: 55.6272 ± 0.0000
    average_pv_mismatch_rate: 44.3728 ± 0.0000
    timing_matched: 12.0000 ± 0.0000
    timing_mismatched: 8.0000 ± 0.0000
    timing_match_rate: 60.0000 ± 0.0000
    temp_matched: 4.0000 ± 0.0000
    temp_mismatched: 5.0000 ± 0.0000
    temp_match_rate: 44.4444 ± 0.0000
    full_matched: 1.0000 ± 0.0000
    full_mismatched: 19.0000 ± 0.0000
    full_match_rate: 5.0000 ± 0.0000
    accuracy: 5.0000 ± 0.0000
    average_timing_score: 0.7056 ± 0.0008
    average_temp_score: 0.7604 ± 0.0000
    average_full_score: 0.5807 ± 0.0002
    average_best_codebleu:
      average_CodeBLEU: 0.4466 ± 0.0000
      comb_7_CodeBLEU: 0.5716 ± 0.0000
      dataflow_match_score: 0.5979 ± 0.0000
      ngram_match_score: 0.2284 ± 0.0000
      syntax_match_score: 0.7121 ± 0.0000
      weighted_ngram_match_score: 0.2481 ± 0.0000
    average_best_levenshtein: 154.9333 ± 0.1443
    average_best_normalized_levenshtein: 0.4450 ± 0.0001
    average_inference_time: 8.8329 ± 0.6769
==================================================
================================================================================

qwen3-coder
-------
Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 41.6667 ± 0.4245
total_entries: 136.0000 ± 0.0000
correct_matches: 56.6667 ± 0.5774
average_execution_time: 2.1045 ± 0.0645
metrics_by_complexity:
  simple:
    count: 116.0000 ± 0.0000
    full_matches: 55.6667 ± 0.5774
    exact_code_matches: 46.0000 ± 0.0000
    exact_code_match_rate: 39.6552 ± 0.0000
    pv_exact_matched: 55.6667 ± 0.5774
    pv_exact_mismatched: 60.3333 ± 0.5774
    pv_exact_match_rate: 47.9885 ± 0.4977
    average_pv_match_rate: 54.7701 ± 0.4977
    average_pv_mismatch_rate: 45.2299 ± 0.4977
    timing_matched: 62.6667 ± 0.5774
    timing_mismatched: 53.3333 ± 0.5774
    timing_match_rate: 54.0230 ± 0.4977
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 6.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 55.6667 ± 0.5774
    full_mismatched: 60.3333 ± 0.5774
    full_match_rate: 47.9885 ± 0.4977
    accuracy: 47.9885 ± 0.4977
    average_timing_score: 0.5753 ± 0.0050
    average_temp_score: 0.9483 ± 0.0000
    average_full_score: 0.5470 ± 0.0050
    average_best_codebleu:
      average_CodeBLEU: 0.5305 ± 0.0000
      comb_7_CodeBLEU: 0.7696 ± 0.0000
      dataflow_match_score: 0.2845 ± 0.0000
      ngram_match_score: 0.1537 ± 0.0000
      syntax_match_score: 0.8580 ± 0.0000
      weighted_ngram_match_score: 0.1102 ± 0.0000
    average_best_levenshtein: 8.3966 ± 0.0000
    average_best_normalized_levenshtein: 0.2366 ± 0.0000
    average_inference_time: 1.9494 ± 0.0429
  complex:
    count: 20.0000 ± 0.0000
    full_matches: 1.0000 ± 0.0000
    exact_code_matches: 0.0000 ± 0.0000
    exact_code_match_rate: 0.0000 ± 0.0000
    pv_exact_matched: 1.0000 ± 0.0000
    pv_exact_mismatched: 19.0000 ± 0.0000
    pv_exact_match_rate: 5.0000 ± 0.0000
    average_pv_match_rate: 51.9401 ± 1.5221
    average_pv_mismatch_rate: 48.0599 ± 1.5221
    timing_matched: 8.0000 ± 0.0000
    timing_mismatched: 12.0000 ± 0.0000
    timing_match_rate: 40.0000 ± 0.0000
    temp_matched: 7.0000 ± 0.0000
    temp_mismatched: 2.0000 ± 0.0000
    temp_match_rate: 77.7778 ± 0.0000
    full_matched: 1.0000 ± 0.0000
    full_mismatched: 19.0000 ± 0.0000
    full_match_rate: 5.0000 ± 0.0000
    accuracy: 5.0000 ± 0.0000
    average_timing_score: 0.6881 ± 0.0110
    average_temp_score: 0.8992 ± 0.0000
    average_full_score: 0.5739 ± 0.0113
    average_best_codebleu:
      average_CodeBLEU: 0.3301 ± 0.0006
      comb_7_CodeBLEU: 0.4654 ± 0.0007
      dataflow_match_score: 0.4240 ± 0.0008
      ngram_match_score: 0.0945 ± 0.0010
      syntax_match_score: 0.5874 ± 0.0006
      weighted_ngram_match_score: 0.1145 ± 0.0003
    average_best_levenshtein: 163.2667 ± 1.7863
    average_best_normalized_levenshtein: 0.5112 ± 0.0044
    average_inference_time: 3.0043 ± 0.2045
==================================================
================================================================================

phi3.5
-------
Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 0.2451 ± 0.4245
total_entries: 136.0000 ± 0.0000
correct_matches: 0.3333 ± 0.5774
average_execution_time: 0.9749 ± 0.5373
metrics_by_complexity:
  simple:
    count: 116.0000 ± 0.0000
    full_matches: 0.3333 ± 0.5774
    exact_code_matches: 0.3333 ± 0.5774
    exact_code_match_rate: 0.2874 ± 0.4977
    pv_exact_matched: 0.3333 ± 0.5774
    pv_exact_mismatched: 115.6667 ± 0.5774
    pv_exact_match_rate: 0.2874 ± 0.4977
    average_pv_match_rate: 0.4598 ± 0.7963
    average_pv_mismatch_rate: 99.5402 ± 0.7963
    timing_matched: 0.3333 ± 0.5774
    timing_mismatched: 115.6667 ± 0.5774
    timing_match_rate: 0.2874 ± 0.4977
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 0.3333 ± 0.5774
    full_mismatched: 115.6667 ± 0.5774
    full_match_rate: 0.2874 ± 0.4977
    accuracy: 0.2874 ± 0.4977
    average_timing_score: 0.0039 ± 0.0068
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.0045 ± 0.0077
    average_best_codebleu:
      average_CodeBLEU: 0.2561 ± 0.0106
      comb_7_CodeBLEU: 0.4092 ± 0.0160
      dataflow_match_score: 0.0287 ± 0.0498
      ngram_match_score: 0.0010 ± 0.0016
      syntax_match_score: 0.0226 ± 0.0392
      weighted_ngram_match_score: 0.0008 ± 0.0014
    average_best_levenshtein: 250.9655 ± 398.9839
    average_best_normalized_levenshtein: 0.9896 ± 0.0180
    average_inference_time: 1.0283 ± 0.6298
  complex:
    count: 20.0000 ± 0.0000
    full_matches: 0.0000 ± 0.0000
    exact_code_matches: 0.0000 ± 0.0000
    exact_code_match_rate: 0.0000 ± 0.0000
    pv_exact_matched: 0.0000 ± 0.0000
    pv_exact_mismatched: 20.0000 ± 0.0000
    pv_exact_match_rate: 0.0000 ± 0.0000
    average_pv_match_rate: 0.0000 ± 0.0000
    average_pv_mismatch_rate: 100.0000 ± 0.0000
    timing_matched: 0.0000 ± 0.0000
    timing_mismatched: 20.0000 ± 0.0000
    timing_match_rate: 0.0000 ± 0.0000
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 9.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 0.0000 ± 0.0000
    full_mismatched: 20.0000 ± 0.0000
    full_match_rate: 0.0000 ± 0.0000
    accuracy: 0.0000 ± 0.0000
    average_timing_score: 0.0000 ± 0.0000
    average_temp_score: 0.5500 ± 0.0000
    average_full_score: 0.0000 ± 0.0000
    average_best_codebleu:
      average_CodeBLEU: 0.2500 ± 0.0000
      comb_7_CodeBLEU: 0.4000 ± 0.0000
      dataflow_match_score: 0.0000 ± 0.0000
      ngram_match_score: 0.0000 ± 0.0000
      syntax_match_score: 0.0000 ± 0.0000
      weighted_ngram_match_score: 0.0000 ± 0.0000
    average_best_levenshtein: 256.0000 ± 0.0000
    average_best_normalized_levenshtein: 1.0000 ± 0.0000
    average_inference_time: 0.6649 ± 0.0005
==================================================
================================================================================

qwen2
-------
Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 32.3529 ± 0.0000
total_entries: 136.0000 ± 0.0000
correct_matches: 44.0000 ± 0.0000
average_execution_time: 0.9502 ± 0.0409
metrics_by_complexity:
  simple:
    count: 116.0000 ± 0.0000
    full_matches: 43.0000 ± 0.0000
    exact_code_matches: 37.0000 ± 0.0000
    exact_code_match_rate: 31.8966 ± 0.0000
    pv_exact_matched: 43.0000 ± 0.0000
    pv_exact_mismatched: 73.0000 ± 0.0000
    pv_exact_match_rate: 37.0690 ± 0.0000
    average_pv_match_rate: 46.3362 ± 0.0000
    average_pv_mismatch_rate: 53.6638 ± 0.0000
    timing_matched: 51.0000 ± 0.0000
    timing_mismatched: 65.0000 ± 0.0000
    timing_match_rate: 43.9655 ± 0.0000
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 43.0000 ± 0.0000
    full_mismatched: 73.0000 ± 0.0000
    full_match_rate: 37.0690 ± 0.0000
    accuracy: 37.0690 ± 0.0000
    average_timing_score: 0.4853 ± 0.0001
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.4678 ± 0.0000
    average_best_codebleu:
      average_CodeBLEU: 0.5011 ± 0.0000
      comb_7_CodeBLEU: 0.7601 ± 0.0000
      dataflow_match_score: 0.2069 ± 0.0000
      ngram_match_score: 0.0898 ± 0.0000
      syntax_match_score: 0.8656 ± 0.0000
      weighted_ngram_match_score: 0.0490 ± 0.0000
    average_best_levenshtein: 43.4828 ± 0.0000
    average_best_normalized_levenshtein: 0.2818 ± 0.0000
    average_inference_time: 0.8311 ± 0.0163
  complex:
    count: 20.0000 ± 0.0000
    full_matches: 1.0000 ± 0.0000
    exact_code_matches: 1.0000 ± 0.0000
    exact_code_match_rate: 5.0000 ± 0.0000
    pv_exact_matched: 1.0000 ± 0.0000
    pv_exact_mismatched: 19.0000 ± 0.0000
    pv_exact_match_rate: 5.0000 ± 0.0000
    average_pv_match_rate: 24.2812 ± 0.7485
    average_pv_mismatch_rate: 75.7188 ± 0.7485
    timing_matched: 4.0000 ± 0.0000
    timing_mismatched: 16.0000 ± 0.0000
    timing_match_rate: 20.0000 ± 0.0000
    temp_matched: 1.0000 ± 0.0000
    temp_mismatched: 8.0000 ± 0.0000
    temp_match_rate: 11.1111 ± 0.0000
    full_matched: 1.0000 ± 0.0000
    full_mismatched: 19.0000 ± 0.0000
    full_match_rate: 5.0000 ± 0.0000
    accuracy: 5.0000 ± 0.0000
    average_timing_score: 0.3989 ± 0.0050
    average_temp_score: 0.6170 ± 0.0000
    average_full_score: 0.2585 ± 0.0047
    average_best_codebleu:
      average_CodeBLEU: 0.3395 ± 0.0024
      comb_7_CodeBLEU: 0.4576 ± 0.0033
      dataflow_match_score: 0.4840 ± 0.0060
      ngram_match_score: 0.1179 ± 0.0020
      syntax_match_score: 0.5385 ± 0.0018
      weighted_ngram_match_score: 0.1676 ± 0.0009
    average_best_levenshtein: 194.6333 ± 3.4269
    average_best_normalized_levenshtein: 0.5990 ± 0.0051
    average_inference_time: 1.6413 ± 0.1841
==================================================
================================================================================

grok-code-fast-1-or
-------
Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 84.8039 ± 2.2464
average_execution_time: 3.5659 ± 0.1636
correct_matches: 115.3333 ± 3.0551
metrics_by_complexity:
  complex:
    accuracy: 46.6667 ± 2.8868
    average_best_codebleu:
      average_CodeBLEU: 0.4586 ± 0.0196
      comb_7_CodeBLEU: 0.5839 ± 0.0238
      dataflow_match_score: 0.5320 ± 0.0591
      ngram_match_score: 0.2275 ± 0.0104
      syntax_match_score: 0.6861 ± 0.0231
      weighted_ngram_match_score: 0.2722 ± 0.0152
    average_best_levenshtein: 140.8500 ± 7.4960
    average_best_normalized_levenshtein: 0.4447 ± 0.0127
    average_full_score: 0.8517 ± 0.0228
    average_inference_time: 7.3849 ± 1.4641
    average_pv_match_rate: 85.7219 ± 2.1669
    average_pv_mismatch_rate: 14.2781 ± 2.1669
    average_temp_score: 0.9385 ± 0.0309
    average_timing_score: 0.8303 ± 0.0085
    count: 20.0000 ± 0.0000
    exact_code_match_rate: 0.0000 ± 0.0000
    exact_code_matches: 0.0000 ± 0.0000
    full_match_rate: 46.6667 ± 2.8868
    full_matched: 9.3333 ± 0.5774
    full_matches: 9.3333 ± 0.5774
    full_mismatched: 10.6667 ± 0.5774
    pv_exact_match_rate: 55.0000 ± 8.6603
    pv_exact_matched: 11.0000 ± 1.7321
    pv_exact_mismatched: 9.0000 ± 1.7321
    temp_match_rate: 74.0741 ± 6.4150
    temp_matched: 6.6667 ± 0.5774
    temp_mismatched: 2.3333 ± 0.5774
    timing_match_rate: 76.6667 ± 2.8868
    timing_matched: 15.3333 ± 0.5774
    timing_mismatched: 4.6667 ± 0.5774
  simple:
    accuracy: 91.3793 ± 2.5862
    average_best_codebleu:
      average_CodeBLEU: 0.6051 ± 0.0098
      comb_7_CodeBLEU: 0.8155 ± 0.0086
      dataflow_match_score: 0.3333 ± 0.0100
      ngram_match_score: 0.2964 ± 0.0136
      syntax_match_score: 0.9113 ± 0.0156
      weighted_ngram_match_score: 0.2129 ± 0.0104
    average_best_levenshtein: 2.9224 ± 0.6300
    average_best_normalized_levenshtein: 0.0538 ± 0.0109
    average_full_score: 0.9748 ± 0.0086
    average_inference_time: 2.9074 ± 0.0662
    average_pv_match_rate: 97.9215 ± 0.7924
    average_pv_mismatch_rate: 2.0785 ± 0.7924
    average_temp_score: 0.9971 ± 0.0050
    average_timing_score: 0.9590 ± 0.0088
    count: 116.0000 ± 0.0000
    exact_code_match_rate: 81.6092 ± 3.4840
    exact_code_matches: 94.6667 ± 4.0415
    full_match_rate: 91.3793 ± 2.5862
    full_matched: 106.0000 ± 3.0000
    full_matches: 106.0000 ± 3.0000
    full_mismatched: 10.0000 ± 3.0000
    pv_exact_match_rate: 96.5517 ± 1.4931
    pv_exact_matched: 112.0000 ± 1.7321
    pv_exact_mismatched: 4.0000 ± 1.7321
    temp_match_rate: 0.0000 ± 0.0000
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.3333 ± 0.5774
    timing_match_rate: 92.5287 ± 1.9909
    timing_matched: 107.3333 ± 2.3094
    timing_mismatched: 8.6667 ± 2.3094
total_entries: 136.0000 ± 0.0000
==================================================
================================================================================

claude-3.5-sonnet
-------
Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 90.1961 ± 0.4245
total_entries: 136.0000 ± 0.0000
correct_matches: 122.6667 ± 0.5774
average_execution_time: 1.8485 ± 0.0511
metrics_by_complexity:
  simple:
    count: 116.0000 ± 0.0000
    full_matches: 112.0000 ± 0.0000
    exact_code_matches: 104.0000 ± 0.0000
    exact_code_match_rate: 89.6552 ± 0.0000
    pv_exact_matched: 115.0000 ± 0.0000
    pv_exact_mismatched: 1.0000 ± 0.0000
    pv_exact_match_rate: 99.1379 ± 0.0000
    average_pv_match_rate: 99.7126 ± 0.0000
    average_pv_mismatch_rate: 0.2874 ± 0.0000
    timing_matched: 112.0000 ± 0.0000
    timing_mismatched: 4.0000 ± 0.0000
    timing_match_rate: 96.5517 ± 0.0000
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 112.0000 ± 0.0000
    full_mismatched: 4.0000 ± 0.0000
    full_match_rate: 96.5517 ± 0.0000
    accuracy: 96.5517 ± 0.0000
    average_timing_score: 0.9854 ± 0.0000
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.9948 ± 0.0000
    average_best_codebleu:
      average_CodeBLEU: 0.6150 ± 0.0000
      comb_7_CodeBLEU: 0.8407 ± 0.0000
      dataflow_match_score: 0.2759 ± 0.0000
      ngram_match_score: 0.2837 ± 0.0000
      syntax_match_score: 0.9822 ± 0.0000
      weighted_ngram_match_score: 0.1942 ± 0.0000
    average_best_levenshtein: 1.3448 ± 0.0000
    average_best_normalized_levenshtein: 0.0235 ± 0.0000
    average_inference_time: 1.6254 ± 0.0409
  complex:
    count: 20.0000 ± 0.0000
    full_matches: 10.6667 ± 0.5774
    exact_code_matches: 1.0000 ± 0.0000
    exact_code_match_rate: 5.0000 ± 0.0000
    pv_exact_matched: 12.6667 ± 0.5774
    pv_exact_mismatched: 7.3333 ± 0.5774
    pv_exact_match_rate: 63.3333 ± 2.8868
    average_pv_match_rate: 88.5145 ± 0.5951
    average_pv_mismatch_rate: 11.4855 ± 0.5951
    timing_matched: 15.0000 ± 0.0000
    timing_mismatched: 5.0000 ± 0.0000
    timing_match_rate: 75.0000 ± 0.0000
    temp_matched: 8.0000 ± 0.0000
    temp_mismatched: 1.0000 ± 0.0000
    temp_match_rate: 88.8889 ± 0.0000
    full_matched: 10.6667 ± 0.5774
    full_mismatched: 9.3333 ± 0.5774
    full_match_rate: 53.3333 ± 2.8868
    accuracy: 53.3333 ± 2.8868
    average_timing_score: 0.8248 ± 0.0013
    average_temp_score: 0.9759 ± 0.0000
    average_full_score: 0.8812 ± 0.0050
    average_best_codebleu:
      average_CodeBLEU: 0.4629 ± 0.0081
      comb_7_CodeBLEU: 0.5724 ± 0.0079
      dataflow_match_score: 0.4978 ± 0.0116
      ngram_match_score: 0.2750 ± 0.0087
      syntax_match_score: 0.6929 ± 0.0044
      weighted_ngram_match_score: 0.2858 ± 0.0098
    average_best_levenshtein: 132.0500 ± 3.5118
    average_best_normalized_levenshtein: 0.3909 ± 0.0110
    average_inference_time: 3.1424 ± 0.1101
==================================================
================================================================================

claude-sonnet-4-bedrock
-------
Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 87.9902 ± 0.4245
total_entries: 136.0000 ± 0.0000
correct_matches: 119.6667 ± 0.5774
average_execution_time: 26.2296 ± 0.2915
metrics_by_complexity:
  simple:
    count: 116.0000 ± 0.0000
    full_matches: 113.0000 ± 0.0000
    exact_code_matches: 109.0000 ± 0.0000
    exact_code_match_rate: 93.9655 ± 0.0000
    pv_exact_matched: 114.0000 ± 0.0000
    pv_exact_mismatched: 2.0000 ± 0.0000
    pv_exact_match_rate: 98.2759 ± 0.0000
    average_pv_match_rate: 99.2816 ± 0.0000
    average_pv_mismatch_rate: 0.7184 ± 0.0000
    timing_matched: 114.0000 ± 0.0000
    timing_mismatched: 2.0000 ± 0.0000
    timing_match_rate: 98.2759 ± 0.0000
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 113.0000 ± 0.0000
    full_mismatched: 3.0000 ± 0.0000
    full_match_rate: 97.4138 ± 0.0000
    accuracy: 97.4138 ± 0.0000
    average_timing_score: 0.9900 ± 0.0000
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.9922 ± 0.0000
    average_best_codebleu:
      average_CodeBLEU: 0.6204 ± 0.0000
      comb_7_CodeBLEU: 0.8452 ± 0.0000
      dataflow_match_score: 0.2845 ± 0.0000
      ngram_match_score: 0.2935 ± 0.0000
      syntax_match_score: 0.9903 ± 0.0000
      weighted_ngram_match_score: 0.1976 ± 0.0000
    average_best_levenshtein: 0.7931 ± 0.0000
    average_best_normalized_levenshtein: 0.0158 ± 0.0000
    average_inference_time: 28.8275 ± 0.0292
  complex:
    count: 20.0000 ± 0.0000
    full_matches: 6.6667 ± 0.5774
    exact_code_matches: 0.0000 ± 0.0000
    exact_code_match_rate: 0.0000 ± 0.0000
    pv_exact_matched: 10.0000 ± 0.0000
    pv_exact_mismatched: 10.0000 ± 0.0000
    pv_exact_match_rate: 50.0000 ± 0.0000
    average_pv_match_rate: 81.4779 ± 0.1443
    average_pv_mismatch_rate: 18.5221 ± 0.1443
    timing_matched: 12.6667 ± 0.5774
    timing_mismatched: 7.3333 ± 0.5774
    timing_match_rate: 63.3333 ± 2.8868
    temp_matched: 8.0000 ± 0.0000
    temp_mismatched: 1.0000 ± 0.0000
    temp_match_rate: 88.8889 ± 0.0000
    full_matched: 6.6667 ± 0.5774
    full_mismatched: 13.3333 ± 0.5774
    full_match_rate: 33.3333 ± 2.8868
    accuracy: 33.3333 ± 2.8868
    average_timing_score: 0.7360 ± 0.0056
    average_temp_score: 0.9802 ± 0.0015
    average_full_score: 0.8004 ± 0.0018
    average_best_codebleu:
      average_CodeBLEU: 0.4587 ± 0.0040
      comb_7_CodeBLEU: 0.5770 ± 0.0024
      dataflow_match_score: 0.5536 ± 0.0036
      ngram_match_score: 0.2392 ± 0.0087
      syntax_match_score: 0.7082 ± 0.0005
      weighted_ngram_match_score: 0.2837 ± 0.0056
    average_best_levenshtein: 156.3167 ± 3.6250
    average_best_normalized_levenshtein: 0.4189 ± 0.0055
    average_inference_time: 11.1618 ± 2.1209
==================================================
================================================================================

claude-sonnet-4-5-20250929-thinking
-------
Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 88.7255 ± 1.5306
total_entries: 136.0000 ± 0.0000
correct_matches: 120.6667 ± 2.0817
average_execution_time: 8.3125 ± 0.7417
metrics_by_complexity:
  simple:
    count: 116.0000 ± 0.0000
    full_matches: 111.0000 ± 1.0000
    exact_code_matches: 109.0000 ± 2.6458
    exact_code_match_rate: 93.9655 ± 2.2808
    pv_exact_matched: 113.0000 ± 0.0000
    pv_exact_mismatched: 3.0000 ± 0.0000
    pv_exact_match_rate: 97.4138 ± 0.0000
    average_pv_match_rate: 99.1571 ± 0.0830
    average_pv_mismatch_rate: 0.8429 ± 0.0830
    timing_matched: 112.6667 ± 1.5275
    timing_mismatched: 3.3333 ± 1.5275
    timing_match_rate: 97.1264 ± 1.3168
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 111.0000 ± 1.0000
    full_mismatched: 5.0000 ± 1.0000
    full_match_rate: 95.6897 ± 0.8621
    accuracy: 95.6897 ± 0.8621
    average_timing_score: 0.9874 ± 0.0051
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.9907 ± 0.0004
    average_best_codebleu:
      average_CodeBLEU: 0.6041 ± 0.0023
      comb_7_CodeBLEU: 0.8375 ± 0.0040
      dataflow_match_score: 0.2529 ± 0.0050
      ngram_match_score: 0.2641 ± 0.0039
      syntax_match_score: 0.9864 ± 0.0110
      weighted_ngram_match_score: 0.1659 ± 0.0049
    average_best_levenshtein: 1.3534 ± 0.5762
    average_best_normalized_levenshtein: 0.0223 ± 0.0113
    average_inference_time: 7.0893 ± 0.7660
  complex:
    count: 20.0000 ± 0.0000
    full_matches: 9.6667 ± 1.1547
    exact_code_matches: 0.0000 ± 0.0000
    exact_code_match_rate: 0.0000 ± 0.0000
    pv_exact_matched: 12.6667 ± 2.0817
    pv_exact_mismatched: 7.3333 ± 2.0817
    pv_exact_match_rate: 63.3333 ± 10.4083
    average_pv_match_rate: 90.0253 ± 5.2513
    average_pv_mismatch_rate: 9.9747 ± 5.2513
    timing_matched: 14.6667 ± 1.5275
    timing_mismatched: 5.3333 ± 1.5275
    timing_match_rate: 73.3333 ± 7.6376
    temp_matched: 7.3333 ± 0.5774
    temp_mismatched: 1.6667 ± 0.5774
    temp_match_rate: 81.4815 ± 6.4150
    full_matched: 9.6667 ± 1.1547
    full_mismatched: 10.3333 ± 1.1547
    full_match_rate: 48.3333 ± 5.7735
    accuracy: 48.3333 ± 5.7735
    average_timing_score: 0.8123 ± 0.0333
    average_temp_score: 0.9719 ± 0.0091
    average_full_score: 0.8840 ± 0.0467
    average_best_codebleu:
      average_CodeBLEU: 0.4880 ± 0.0289
      comb_7_CodeBLEU: 0.6016 ± 0.0205
      dataflow_match_score: 0.5294 ± 0.0291
      ngram_match_score: 0.2778 ± 0.0377
      syntax_match_score: 0.7085 ± 0.0319
      weighted_ngram_match_score: 0.3197 ± 0.0524
    average_best_levenshtein: 129.8833 ± 3.6350
    average_best_normalized_levenshtein: 0.4017 ± 0.0172
    average_inference_time: 15.4073 ± 0.6734
==================================================
================================================================================

qwen3-coder-480b-or
-------
Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 86.7647 ± 0.7353
total_entries: 136.0000 ± 0.0000
correct_matches: 118.0000 ± 1.0000
average_execution_time: 2.4135 ± 0.5093
metrics_by_complexity:
  simple:
    count: 116.0000 ± 0.0000
    full_matches: 111.6667 ± 1.1547
    exact_code_matches: 105.0000 ± 1.0000
    exact_code_match_rate: 90.5172 ± 0.8621
    pv_exact_matched: 111.6667 ± 1.1547
    pv_exact_mismatched: 4.3333 ± 1.1547
    pv_exact_match_rate: 96.2644 ± 0.9954
    average_pv_match_rate: 97.7682 ± 0.6636
    average_pv_mismatch_rate: 2.2318 ± 0.6636
    timing_matched: 113.6667 ± 1.1547
    timing_mismatched: 2.3333 ± 1.1547
    timing_match_rate: 97.9885 ± 0.9954
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 111.6667 ± 1.1547
    full_mismatched: 4.3333 ± 1.1547
    full_match_rate: 96.2644 ± 0.9954
    accuracy: 96.2644 ± 0.9954
    average_timing_score: 0.9812 ± 0.0078
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.9784 ± 0.0069
    average_best_codebleu:
      average_CodeBLEU: 0.5987 ± 0.0023
      comb_7_CodeBLEU: 0.8344 ± 0.0022
      dataflow_match_score: 0.2471 ± 0.0100
      ngram_match_score: 0.2533 ± 0.0037
      syntax_match_score: 0.9829 ± 0.0058
      weighted_ngram_match_score: 0.1587 ± 0.0059
    average_best_levenshtein: 1.4511 ± 0.3964
    average_best_normalized_levenshtein: 0.0324 ± 0.0065
    average_inference_time: 1.9962 ± 0.5459
  complex:
    count: 20.0000 ± 0.0000
    full_matches: 6.3333 ± 0.5774
    exact_code_matches: 0.0000 ± 0.0000
    exact_code_match_rate: 0.0000 ± 0.0000
    pv_exact_matched: 6.6667 ± 0.5774
    pv_exact_mismatched: 13.3333 ± 0.5774
    pv_exact_match_rate: 33.3333 ± 2.8868
    average_pv_match_rate: 80.8513 ± 2.5694
    average_pv_mismatch_rate: 19.1487 ± 2.5694
    timing_matched: 14.0000 ± 1.0000
    timing_mismatched: 6.0000 ± 1.0000
    timing_match_rate: 70.0000 ± 5.0000
    temp_matched: 7.3333 ± 0.5774
    temp_mismatched: 1.6667 ± 0.5774
    temp_match_rate: 81.4815 ± 6.4150
    full_matched: 6.3333 ± 0.5774
    full_mismatched: 13.6667 ± 0.5774
    full_match_rate: 31.6667 ± 2.8868
    accuracy: 31.6667 ± 2.8868
    average_timing_score: 0.8083 ± 0.0375
    average_temp_score: 0.9623 ± 0.0269
    average_full_score: 0.8156 ± 0.0252
    average_best_codebleu:
      average_CodeBLEU: 0.3738 ± 0.0024
      comb_7_CodeBLEU: 0.4990 ± 0.0081
      dataflow_match_score: 0.4563 ± 0.0298
      ngram_match_score: 0.1589 ± 0.0135
      syntax_match_score: 0.5919 ± 0.0047
      weighted_ngram_match_score: 0.1714 ± 0.0148
    average_best_levenshtein: 163.5833 ± 15.7611
    average_best_normalized_levenshtein: 0.4631 ± 0.0135
    average_inference_time: 4.8335 ± 0.3109
==================================================
================================================================================

qwen3
-------
Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 0.0000 ± 0.0000
total_entries: 136.0000 ± 0.0000
correct_matches: 0.0000 ± 0.0000
average_execution_time: 7.6466 ± 0.0119
metrics_by_complexity:
  simple:
    count: 116.0000 ± 0.0000
    full_matches: 0.0000 ± 0.0000
    exact_code_matches: 0.0000 ± 0.0000
    exact_code_match_rate: 0.0000 ± 0.0000
    pv_exact_matched: 0.0000 ± 0.0000
    pv_exact_mismatched: 116.0000 ± 0.0000
    pv_exact_match_rate: 0.0000 ± 0.0000
    average_pv_match_rate: 0.0000 ± 0.0000
    average_pv_mismatch_rate: 100.0000 ± 0.0000
    timing_matched: 0.0000 ± 0.0000
    timing_mismatched: 116.0000 ± 0.0000
    timing_match_rate: 0.0000 ± 0.0000
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 0.0000 ± 0.0000
    full_mismatched: 116.0000 ± 0.0000
    full_match_rate: 0.0000 ± 0.0000
    accuracy: 0.0000 ± 0.0000
    average_timing_score: 0.0000 ± 0.0000
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.0000 ± 0.0000
    average_best_codebleu:
      average_CodeBLEU: 0.3059 ± 0.0000
      comb_7_CodeBLEU: 0.4894 ± 0.0000
      dataflow_match_score: 0.2759 ± 0.0000
      ngram_match_score: 0.0000 ± 0.0000
      syntax_match_score: 0.2235 ± 0.0000
      weighted_ngram_match_score: 0.0000 ± 0.0000
    average_best_levenshtein: 1092.9368 ± 8.7250
    average_best_normalized_levenshtein: 0.9644 ± 0.0002
    average_inference_time: 6.5531 ± 0.0101
  complex:
    count: 20.0000 ± 0.0000
    full_matches: 0.0000 ± 0.0000
    exact_code_matches: 0.0000 ± 0.0000
    exact_code_match_rate: 0.0000 ± 0.0000
    pv_exact_matched: 0.0000 ± 0.0000
    pv_exact_mismatched: 20.0000 ± 0.0000
    pv_exact_match_rate: 0.0000 ± 0.0000
    average_pv_match_rate: 0.0000 ± 0.0000
    average_pv_mismatch_rate: 100.0000 ± 0.0000
    timing_matched: 0.0000 ± 0.0000
    timing_mismatched: 20.0000 ± 0.0000
    timing_match_rate: 0.0000 ± 0.0000
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 9.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 0.0000 ± 0.0000
    full_mismatched: 20.0000 ± 0.0000
    full_match_rate: 0.0000 ± 0.0000
    accuracy: 0.0000 ± 0.0000
    average_timing_score: 0.0000 ± 0.0000
    average_temp_score: 0.5500 ± 0.0000
    average_full_score: 0.0000 ± 0.0000
    average_best_codebleu:
      average_CodeBLEU: 0.2513 ± 0.0000
      comb_7_CodeBLEU: 0.3836 ± 0.0000
      dataflow_match_score: 0.6310 ± 0.0000
      ngram_match_score: 0.0052 ± 0.0000
      syntax_match_score: 0.2125 ± 0.0000
      weighted_ngram_match_score: 0.0565 ± 0.0000
    average_best_levenshtein: 1777.6000 ± 0.0000
    average_best_normalized_levenshtein: 0.8919 ± 0.0000
    average_inference_time: 13.9890 ± 0.1178
==================================================
================================================================================

claude-v4-sonnet-abacus
-------
Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 88.2353 ± 0.0000
total_entries: 136.0000 ± 0.0000
correct_matches: 120.0000 ± 0.0000
average_execution_time: 2.8403 ± 0.0809
metrics_by_complexity:
  simple:
    count: 116.0000 ± 0.0000
    full_matches: 113.0000 ± 0.0000
    exact_code_matches: 109.0000 ± 0.0000
    exact_code_match_rate: 93.9655 ± 0.0000
    pv_exact_matched: 114.0000 ± 0.0000
    pv_exact_mismatched: 2.0000 ± 0.0000
    pv_exact_match_rate: 98.2759 ± 0.0000
    average_pv_match_rate: 99.2816 ± 0.0000
    average_pv_mismatch_rate: 0.7184 ± 0.0000
    timing_matched: 114.0000 ± 0.0000
    timing_mismatched: 2.0000 ± 0.0000
    timing_match_rate: 98.2759 ± 0.0000
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 113.0000 ± 0.0000
    full_mismatched: 3.0000 ± 0.0000
    full_match_rate: 97.4138 ± 0.0000
    accuracy: 97.4138 ± 0.0000
    average_timing_score: 0.9900 ± 0.0000
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.9922 ± 0.0000
    average_best_codebleu:
      average_CodeBLEU: 0.6235 ± 0.0000
      comb_7_CodeBLEU: 0.8465 ± 0.0000
      dataflow_match_score: 0.2845 ± 0.0000
      ngram_match_score: 0.2994 ± 0.0000
      syntax_match_score: 0.9903 ± 0.0000
      weighted_ngram_match_score: 0.2041 ± 0.0000
    average_best_levenshtein: 0.7931 ± 0.0000
    average_best_normalized_levenshtein: 0.0158 ± 0.0000
    average_inference_time: 2.5981 ± 0.0556
  complex:
    count: 20.0000 ± 0.0000
    full_matches: 7.0000 ± 0.0000
    exact_code_matches: 0.0000 ± 0.0000
    exact_code_match_rate: 0.0000 ± 0.0000
    pv_exact_matched: 10.0000 ± 0.0000
    pv_exact_mismatched: 10.0000 ± 0.0000
    pv_exact_match_rate: 50.0000 ± 0.0000
    average_pv_match_rate: 81.5612 ± 0.0000
    average_pv_mismatch_rate: 18.4388 ± 0.0000
    timing_matched: 13.0000 ± 0.0000
    timing_mismatched: 7.0000 ± 0.0000
    timing_match_rate: 65.0000 ± 0.0000
    temp_matched: 8.0000 ± 0.0000
    temp_mismatched: 1.0000 ± 0.0000
    temp_match_rate: 88.8889 ± 0.0000
    full_matched: 7.0000 ± 0.0000
    full_mismatched: 13.0000 ± 0.0000
    full_match_rate: 35.0000 ± 0.0000
    accuracy: 35.0000 ± 0.0000
    average_timing_score: 0.7479 ± 0.0006
    average_temp_score: 0.9807 ± 0.0000
    average_full_score: 0.8036 ± 0.0001
    average_best_codebleu:
      average_CodeBLEU: 0.4508 ± 0.0000
      comb_7_CodeBLEU: 0.5699 ± 0.0000
      dataflow_match_score: 0.5466 ± 0.0000
      ngram_match_score: 0.2289 ± 0.0000
      syntax_match_score: 0.7019 ± 0.0000
      weighted_ngram_match_score: 0.2760 ± 0.0000
    average_best_levenshtein: 156.5500 ± 0.0000
    average_best_normalized_levenshtein: 0.4198 ± 0.0000
    average_inference_time: 4.2450 ± 0.3676
==================================================
================================================================================

grok-4-or
-------
Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 88.2353 ± 0.0000
average_execution_time: 24.6341 ± 1.7236
correct_matches: 120.0000 ± 0.0000
metrics_by_complexity:
  complex:
    accuracy: 45.0000 ± 0.0000
    average_best_codebleu:
      average_CodeBLEU: 0.4413 ± 0.0272
      comb_7_CodeBLEU: 0.5594 ± 0.0208
      dataflow_match_score: 0.4745 ± 0.0194
      ngram_match_score: 0.2212 ± 0.0368
      syntax_match_score: 0.6517 ± 0.0138
      weighted_ngram_match_score: 0.2680 ± 0.0387
    average_best_levenshtein: 144.2167 ± 13.2791
    average_best_normalized_levenshtein: 0.4433 ± 0.0378
    average_full_score: 0.8713 ± 0.0112
    average_inference_time: 74.8301 ± 7.1712
    average_pv_match_rate: 88.1306 ± 1.5174
    average_pv_mismatch_rate: 11.8694 ± 1.5174
    average_temp_score: 0.9748 ± 0.0053
    average_timing_score: 0.8044 ± 0.0183
    count: 20.0000 ± 0.0000
    exact_code_match_rate: 0.0000 ± 0.0000
    exact_code_matches: 0.0000 ± 0.0000
    full_match_rate: 45.0000 ± 0.0000
    full_matched: 9.0000 ± 0.0000
    full_matches: 9.0000 ± 0.0000
    full_mismatched: 11.0000 ± 0.0000
    pv_exact_match_rate: 65.0000 ± 0.0000
    pv_exact_matched: 13.0000 ± 0.0000
    pv_exact_mismatched: 7.0000 ± 0.0000
    temp_match_rate: 81.4815 ± 6.4150
    temp_matched: 7.3333 ± 0.5774
    temp_mismatched: 1.6667 ± 0.5774
    timing_match_rate: 70.0000 ± 8.6603
    timing_matched: 14.0000 ± 1.7321
    timing_mismatched: 6.0000 ± 1.7321
  simple:
    accuracy: 95.6897 ± 0.0000
    average_best_codebleu:
      average_CodeBLEU: 0.6603 ± 0.0036
      comb_7_CodeBLEU: 0.8613 ± 0.0007
      dataflow_match_score: 0.3736 ± 0.0050
      ngram_match_score: 0.3658 ± 0.0085
      syntax_match_score: 0.9906 ± 0.0023
      weighted_ngram_match_score: 0.2850 ± 0.0080
    average_best_levenshtein: 1.8736 ± 0.1593
    average_best_normalized_levenshtein: 0.0215 ± 0.0012
    average_full_score: 0.9932 ± 0.0002
    average_inference_time: 15.9796 ± 0.7844
    average_pv_match_rate: 99.5402 ± 0.0000
    average_pv_mismatch_rate: 0.4598 ± 0.0000
    average_temp_score: 1.0000 ± 0.0000
    average_timing_score: 0.9846 ± 0.0009
    count: 116.0000 ± 0.0000
    exact_code_match_rate: 91.3793 ± 1.4931
    exact_code_matches: 106.0000 ± 1.7321
    full_match_rate: 95.6897 ± 0.0000
    full_matched: 111.0000 ± 0.0000
    full_matches: 111.0000 ± 0.0000
    full_mismatched: 5.0000 ± 0.0000
    pv_exact_match_rate: 98.2759 ± 0.0000
    pv_exact_matched: 114.0000 ± 0.0000
    pv_exact_mismatched: 2.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    timing_match_rate: 96.5517 ± 0.0000
    timing_matched: 112.0000 ± 0.0000
    timing_mismatched: 4.0000 ± 0.0000
total_entries: 136.0000 ± 0.0000
==================================================
================================================================================

gpt-5-high
-------
Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 89.4608 ± 1.5306
total_entries: 136.0000 ± 0.0000
correct_matches: 121.6667 ± 2.0817
average_execution_time: 43.0139 ± 3.6541
metrics_by_complexity:
  simple:
    count: 116.0000 ± 0.0000
    full_matches: 111.3333 ± 0.5774
    exact_code_matches: 97.6667 ± 2.5166
    exact_code_match_rate: 84.1954 ± 2.1695
    pv_exact_matched: 114.0000 ± 1.0000
    pv_exact_mismatched: 2.0000 ± 1.0000
    pv_exact_match_rate: 98.2759 ± 0.8621
    average_pv_match_rate: 99.0805 ± 0.5542
    average_pv_mismatch_rate: 0.9195 ± 0.5542
    timing_matched: 111.3333 ± 0.5774
    timing_mismatched: 4.6667 ± 0.5774
    timing_match_rate: 95.9770 ± 0.4977
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 111.3333 ± 0.5774
    full_mismatched: 4.6667 ± 0.5774
    full_match_rate: 95.9770 ± 0.4977
    accuracy: 95.9770 ± 0.4977
    average_timing_score: 0.9781 ± 0.0060
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.9883 ± 0.0056
    average_best_codebleu:
      average_CodeBLEU: 0.6170 ± 0.0091
      comb_7_CodeBLEU: 0.8396 ± 0.0058
      dataflow_match_score: 0.2471 ± 0.0249
      ngram_match_score: 0.2724 ± 0.0146
      syntax_match_score: 0.9760 ± 0.0072
      weighted_ngram_match_score: 0.2198 ± 0.0166
    average_best_levenshtein: 6.5977 ± 2.0817
    average_best_normalized_levenshtein: 0.0626 ± 0.0143
    average_inference_time: 31.7732 ± 2.1071
  complex:
    count: 20.0000 ± 0.0000
    full_matches: 10.3333 ± 1.5275
    exact_code_matches: 0.3333 ± 0.5774
    exact_code_match_rate: 1.6667 ± 2.8868
    pv_exact_matched: 11.6667 ± 2.0817
    pv_exact_mismatched: 8.3333 ± 2.0817
    pv_exact_match_rate: 58.3333 ± 10.4083
    average_pv_match_rate: 89.0946 ± 2.6720
    average_pv_mismatch_rate: 10.9054 ± 2.6720
    timing_matched: 15.3333 ± 1.1547
    timing_mismatched: 4.6667 ± 1.1547
    timing_match_rate: 76.6667 ± 5.7735
    temp_matched: 7.3333 ± 0.5774
    temp_mismatched: 1.6667 ± 0.5774
    temp_match_rate: 81.4815 ± 6.4150
    full_matched: 10.3333 ± 1.5275
    full_mismatched: 9.6667 ± 1.5275
    full_match_rate: 51.6667 ± 7.6376
    accuracy: 51.6667 ± 7.6376
    average_timing_score: 0.8355 ± 0.0294
    average_temp_score: 0.9752 ± 0.0097
    average_full_score: 0.8856 ± 0.0215
    average_best_codebleu:
      average_CodeBLEU: 0.3868 ± 0.0283
      comb_7_CodeBLEU: 0.5304 ± 0.0279
      dataflow_match_score: 0.4665 ± 0.0353
      ngram_match_score: 0.1314 ± 0.0312
      syntax_match_score: 0.6190 ± 0.0596
      weighted_ngram_match_score: 0.1635 ± 0.0316
    average_best_levenshtein: 196.5833 ± 3.6116
    average_best_normalized_levenshtein: 0.5045 ± 0.0270
    average_inference_time: 108.2100 ± 12.9103
==================================================
================================================================================

qwen2.5
-------
Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 33.0882 ± 0.0000
total_entries: 136.0000 ± 0.0000
correct_matches: 45.0000 ± 0.0000
average_execution_time: 0.9632 ± 0.0751
metrics_by_complexity:
  simple:
    count: 116.0000 ± 0.0000
    full_matches: 45.0000 ± 0.0000
    exact_code_matches: 39.0000 ± 0.0000
    exact_code_match_rate: 33.6207 ± 0.0000
    pv_exact_matched: 45.0000 ± 0.0000
    pv_exact_mismatched: 71.0000 ± 0.0000
    pv_exact_match_rate: 38.7931 ± 0.0000
    average_pv_match_rate: 50.0000 ± 0.0000
    average_pv_mismatch_rate: 50.0000 ± 0.0000
    timing_matched: 56.0000 ± 0.0000
    timing_mismatched: 60.0000 ± 0.0000
    timing_match_rate: 48.2759 ± 0.0000
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 45.0000 ± 0.0000
    full_mismatched: 71.0000 ± 0.0000
    full_match_rate: 38.7931 ± 0.0000
    accuracy: 38.7931 ± 0.0000
    average_timing_score: 0.5344 ± 0.0000
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.5069 ± 0.0000
    average_best_codebleu:
      average_CodeBLEU: 0.4917 ± 0.0000
      comb_7_CodeBLEU: 0.7442 ± 0.0000
      dataflow_match_score: 0.2328 ± 0.0000
      ngram_match_score: 0.0918 ± 0.0000
      syntax_match_score: 0.8250 ± 0.0000
      weighted_ngram_match_score: 0.0498 ± 0.0000
    average_best_levenshtein: 7.1552 ± 0.0000
    average_best_normalized_levenshtein: 0.2337 ± 0.0000
    average_inference_time: 0.8253 ± 0.0101
  complex:
    count: 20.0000 ± 0.0000
    full_matches: 0.0000 ± 0.0000
    exact_code_matches: 0.0000 ± 0.0000
    exact_code_match_rate: 0.0000 ± 0.0000
    pv_exact_matched: 0.0000 ± 0.0000
    pv_exact_mismatched: 20.0000 ± 0.0000
    pv_exact_match_rate: 0.0000 ± 0.0000
    average_pv_match_rate: 41.3497 ± 0.3045
    average_pv_mismatch_rate: 58.6503 ± 0.3045
    timing_matched: 11.3333 ± 0.5774
    timing_mismatched: 8.6667 ± 0.5774
    timing_match_rate: 56.6667 ± 2.8868
    temp_matched: 2.6667 ± 0.5774
    temp_mismatched: 6.3333 ± 0.5774
    temp_match_rate: 29.6296 ± 6.4150
    full_matched: 0.0000 ± 0.0000
    full_mismatched: 20.0000 ± 0.0000
    full_match_rate: 0.0000 ± 0.0000
    accuracy: 0.0000 ± 0.0000
    average_timing_score: 0.7292 ± 0.0107
    average_temp_score: 0.7279 ± 0.0156
    average_full_score: 0.4748 ± 0.0008
    average_best_codebleu:
      average_CodeBLEU: 0.3652 ± 0.0208
      comb_7_CodeBLEU: 0.4720 ± 0.0201
      dataflow_match_score: 0.5025 ± 0.0369
      ngram_match_score: 0.1799 ± 0.0223
      syntax_match_score: 0.5671 ± 0.0314
      weighted_ngram_match_score: 0.1948 ± 0.0214
    average_best_levenshtein: 144.2500 ± 1.6454
    average_best_normalized_levenshtein: 0.4849 ± 0.0098
    average_inference_time: 1.7636 ± 0.4577
==================================================
================================================================================

gpt-oss:120b
-------
Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 24.2647 ± 0.0000
total_entries: 136.0000 ± 0.0000
correct_matches: 33.0000 ± 0.0000
average_execution_time: 236.6805 ± 0.3500
metrics_by_complexity:
  simple:
    count: 116.0000 ± 0.0000
    full_matches: 32.0000 ± 0.0000
    exact_code_matches: 30.0000 ± 0.0000
    exact_code_match_rate: 25.8621 ± 0.0000
    pv_exact_matched: 35.0000 ± 0.0000
    pv_exact_mismatched: 81.0000 ± 0.0000
    pv_exact_match_rate: 30.1724 ± 0.0000
    average_pv_match_rate: 33.1466 ± 0.0000
    average_pv_mismatch_rate: 66.8534 ± 0.0000
    timing_matched: 35.0000 ± 0.0000
    timing_mismatched: 81.0000 ± 0.0000
    timing_match_rate: 30.1724 ± 0.0000
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 32.0000 ± 0.0000
    full_mismatched: 84.0000 ± 0.0000
    full_match_rate: 27.5862 ± 0.0000
    accuracy: 27.5862 ± 0.0000
    average_timing_score: 0.3326 ± 0.0000
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.3317 ± 0.0000
    average_best_codebleu:
      average_CodeBLEU: 0.4030 ± 0.0000
      comb_7_CodeBLEU: 0.5881 ± 0.0000
      dataflow_match_score: 0.1638 ± 0.0000
      ngram_match_score: 0.1048 ± 0.0000
      syntax_match_score: 0.4231 ± 0.0000
      weighted_ngram_match_score: 0.0841 ± 0.0000
    average_best_levenshtein: 81.2759 ± 0.0000
    average_best_normalized_levenshtein: 0.5981 ± 0.0000
    average_inference_time: 240.7601 ± 0.3138
  complex:
    count: 20.0000 ± 0.0000
    full_matches: 1.0000 ± 0.0000
    exact_code_matches: 0.0000 ± 0.0000
    exact_code_match_rate: 0.0000 ± 0.0000
    pv_exact_matched: 2.0000 ± 0.0000
    pv_exact_mismatched: 18.0000 ± 0.0000
    pv_exact_match_rate: 10.0000 ± 0.0000
    average_pv_match_rate: 30.9853 ± 0.0000
    average_pv_mismatch_rate: 69.0147 ± 0.0000
    timing_matched: 4.0000 ± 0.0000
    timing_mismatched: 16.0000 ± 0.0000
    timing_match_rate: 20.0000 ± 0.0000
    temp_matched: 5.0000 ± 0.0000
    temp_mismatched: 4.0000 ± 0.0000
    temp_match_rate: 55.5556 ± 0.0000
    full_matched: 1.0000 ± 0.0000
    full_mismatched: 19.0000 ± 0.0000
    full_match_rate: 5.0000 ± 0.0000
    accuracy: 5.0000 ± 0.0000
    average_timing_score: 0.3082 ± 0.0000
    average_temp_score: 0.8320 ± 0.0000
    average_full_score: 0.3239 ± 0.0000
    average_best_codebleu:
      average_CodeBLEU: 0.3466 ± 0.0000
      comb_7_CodeBLEU: 0.4848 ± 0.0000
      dataflow_match_score: 0.2506 ± 0.0000
      ngram_match_score: 0.1151 ± 0.0000
      syntax_match_score: 0.3033 ± 0.0000
      weighted_ngram_match_score: 0.1174 ± 0.0000
    average_best_levenshtein: 191.5500 ± 0.0000
    average_best_normalized_levenshtein: 0.7216 ± 0.0000
    average_inference_time: 213.0194 ± 0.5734
==================================================
================================================================================

athene-v2-agent
-------
Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 44.8529 ± 0.0000
total_entries: 136.0000 ± 0.0000
correct_matches: 61.0000 ± 0.0000
average_execution_time: 5.6723 ± 0.0986
metrics_by_complexity:
  simple:
    count: 116.0000 ± 0.0000
    full_matches: 61.0000 ± 0.0000
    exact_code_matches: 54.0000 ± 0.0000
    exact_code_match_rate: 46.5517 ± 0.0000
    pv_exact_matched: 62.0000 ± 0.0000
    pv_exact_mismatched: 54.0000 ± 0.0000
    pv_exact_match_rate: 53.4483 ± 0.0000
    average_pv_match_rate: 62.8448 ± 0.0000
    average_pv_mismatch_rate: 37.1552 ± 0.0000
    timing_matched: 65.0000 ± 0.0000
    timing_mismatched: 51.0000 ± 0.0000
    timing_match_rate: 56.0345 ± 0.0000
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 61.0000 ± 0.0000
    full_mismatched: 55.0000 ± 0.0000
    full_match_rate: 52.5862 ± 0.0000
    accuracy: 52.5862 ± 0.0000
    average_timing_score: 0.6299 ± 0.0000
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.6287 ± 0.0000
    average_best_codebleu:
      average_CodeBLEU: 0.5057 ± 0.0000
      comb_7_CodeBLEU: 0.7510 ± 0.0000
      dataflow_match_score: 0.2586 ± 0.0000
      ngram_match_score: 0.1213 ± 0.0000
      syntax_match_score: 0.8291 ± 0.0000
      weighted_ngram_match_score: 0.0722 ± 0.0000
    average_best_levenshtein: 31.8793 ± 0.0000
    average_best_normalized_levenshtein: 0.2301 ± 0.0000
    average_inference_time: 5.2643 ± 0.0179
  complex:
    count: 20.0000 ± 0.0000
    full_matches: 0.0000 ± 0.0000
    exact_code_matches: 0.0000 ± 0.0000
    exact_code_match_rate: 0.0000 ± 0.0000
    pv_exact_matched: 1.0000 ± 0.0000
    pv_exact_mismatched: 19.0000 ± 0.0000
    pv_exact_match_rate: 5.0000 ± 0.0000
    average_pv_match_rate: 52.4140 ± 0.0000
    average_pv_mismatch_rate: 47.5860 ± 0.0000
    timing_matched: 9.3333 ± 0.5774
    timing_mismatched: 10.6667 ± 0.5774
    timing_match_rate: 46.6667 ± 2.8868
    temp_matched: 1.0000 ± 0.0000
    temp_mismatched: 8.0000 ± 0.0000
    temp_match_rate: 11.1111 ± 0.0000
    full_matched: 0.0000 ± 0.0000
    full_mismatched: 20.0000 ± 0.0000
    full_match_rate: 0.0000 ± 0.0000
    accuracy: 0.0000 ± 0.0000
    average_timing_score: 0.6370 ± 0.0094
    average_temp_score: 0.7018 ± 0.0000
    average_full_score: 0.5326 ± 0.0019
    average_best_codebleu:
      average_CodeBLEU: 0.4343 ± 0.0021
      comb_7_CodeBLEU: 0.5580 ± 0.0016
      dataflow_match_score: 0.5719 ± 0.0000
      ngram_match_score: 0.2206 ± 0.0028
      syntax_match_score: 0.7091 ± 0.0026
      weighted_ngram_match_score: 0.2354 ± 0.0030
    average_best_levenshtein: 137.0333 ± 2.8001
    average_best_normalized_levenshtein: 0.4388 ± 0.0067
    average_inference_time: 8.0385 ± 0.5670
==================================================
================================================================================

claude-v4-opus-abacus
-------
Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 86.7647 ± 0.0000
total_entries: 136.0000 ± 0.0000
correct_matches: 118.0000 ± 0.0000
average_execution_time: 3.5212 ± 0.1931
metrics_by_complexity:
  simple:
    count: 116.0000 ± 0.0000
    full_matches: 111.0000 ± 0.0000
    exact_code_matches: 104.0000 ± 0.0000
    exact_code_match_rate: 89.6552 ± 0.0000
    pv_exact_matched: 114.0000 ± 0.0000
    pv_exact_mismatched: 2.0000 ± 0.0000
    pv_exact_match_rate: 98.2759 ± 0.0000
    average_pv_match_rate: 99.2816 ± 0.0000
    average_pv_mismatch_rate: 0.7184 ± 0.0000
    timing_matched: 112.0000 ± 0.0000
    timing_mismatched: 4.0000 ± 0.0000
    timing_match_rate: 96.5517 ± 0.0000
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 111.0000 ± 0.0000
    full_mismatched: 5.0000 ± 0.0000
    full_match_rate: 95.6897 ± 0.0000
    accuracy: 95.6897 ± 0.0000
    average_timing_score: 0.9833 ± 0.0004
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.9909 ± 0.0001
    average_best_codebleu:
      average_CodeBLEU: 0.6287 ± 0.0000
      comb_7_CodeBLEU: 0.8495 ± 0.0000
      dataflow_match_score: 0.3103 ± 0.0000
      ngram_match_score: 0.3040 ± 0.0000
      syntax_match_score: 0.9933 ± 0.0000
      weighted_ngram_match_score: 0.2176 ± 0.0000
    average_best_levenshtein: 1.5862 ± 0.0000
    average_best_normalized_levenshtein: 0.0230 ± 0.0000
    average_inference_time: 2.8836 ± 0.1458
  complex:
    count: 20.0000 ± 0.0000
    full_matches: 7.0000 ± 0.0000
    exact_code_matches: 0.0000 ± 0.0000
    exact_code_match_rate: 0.0000 ± 0.0000
    pv_exact_matched: 9.0000 ± 0.0000
    pv_exact_mismatched: 11.0000 ± 0.0000
    pv_exact_match_rate: 45.0000 ± 0.0000
    average_pv_match_rate: 83.3532 ± 0.4535
    average_pv_mismatch_rate: 16.6468 ± 0.4535
    timing_matched: 14.0000 ± 0.0000
    timing_mismatched: 6.0000 ± 0.0000
    timing_match_rate: 70.0000 ± 0.0000
    temp_matched: 8.0000 ± 0.0000
    temp_mismatched: 1.0000 ± 0.0000
    temp_match_rate: 88.8889 ± 0.0000
    full_matched: 7.0000 ± 0.0000
    full_mismatched: 13.0000 ± 0.0000
    full_match_rate: 35.0000 ± 0.0000
    accuracy: 35.0000 ± 0.0000
    average_timing_score: 0.7838 ± 0.0048
    average_temp_score: 0.9859 ± 0.0000
    average_full_score: 0.8276 ± 0.0042
    average_best_codebleu:
      average_CodeBLEU: 0.4201 ± 0.0022
      comb_7_CodeBLEU: 0.5447 ± 0.0023
      dataflow_match_score: 0.4951 ± 0.0028
      ngram_match_score: 0.1864 ± 0.0020
      syntax_match_score: 0.6606 ± 0.0024
      weighted_ngram_match_score: 0.2383 ± 0.0022
    average_best_levenshtein: 166.7667 ± 0.4368
    average_best_normalized_levenshtein: 0.4684 ± 0.0016
    average_inference_time: 7.2197 ± 0.5859
==================================================
================================================================================

mistral-nemo
-------
Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 15.4412 ± 0.0000
total_entries: 136.0000 ± 0.0000
correct_matches: 21.0000 ± 0.0000
average_execution_time: 1.2914 ± 0.0836
metrics_by_complexity:
  simple:
    count: 116.0000 ± 0.0000
    full_matches: 20.0000 ± 0.0000
    exact_code_matches: 13.0000 ± 0.0000
    exact_code_match_rate: 11.2069 ± 0.0000
    pv_exact_matched: 26.0000 ± 0.0000
    pv_exact_mismatched: 90.0000 ± 0.0000
    pv_exact_match_rate: 22.4138 ± 0.0000
    average_pv_match_rate: 26.9397 ± 0.0000
    average_pv_mismatch_rate: 73.0603 ± 0.0000
    timing_matched: 25.0000 ± 0.0000
    timing_mismatched: 91.0000 ± 0.0000
    timing_match_rate: 21.5517 ± 0.0000
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 6.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 20.0000 ± 0.0000
    full_mismatched: 96.0000 ± 0.0000
    full_match_rate: 17.2414 ± 0.0000
    accuracy: 17.2414 ± 0.0000
    average_timing_score: 0.2641 ± 0.0000
    average_temp_score: 0.9483 ± 0.0000
    average_full_score: 0.2586 ± 0.0000
    average_best_codebleu:
      average_CodeBLEU: 0.4503 ± 0.0000
      comb_7_CodeBLEU: 0.6980 ± 0.0000
      dataflow_match_score: 0.2241 ± 0.0000
      ngram_match_score: 0.0379 ± 0.0000
      syntax_match_score: 0.7261 ± 0.0000
      weighted_ngram_match_score: 0.0373 ± 0.0000
    average_best_levenshtein: 15.0172 ± 0.0000
    average_best_normalized_levenshtein: 0.3860 ± 0.0000
    average_inference_time: 1.1393 ± 0.0274
  complex:
    count: 20.0000 ± 0.0000
    full_matches: 1.0000 ± 0.0000
    exact_code_matches: 0.0000 ± 0.0000
    exact_code_match_rate: 0.0000 ± 0.0000
    pv_exact_matched: 1.0000 ± 0.0000
    pv_exact_mismatched: 19.0000 ± 0.0000
    pv_exact_match_rate: 5.0000 ± 0.0000
    average_pv_match_rate: 28.7667 ± 2.2453
    average_pv_mismatch_rate: 71.2333 ± 2.2453
    timing_matched: 10.0000 ± 0.0000
    timing_mismatched: 10.0000 ± 0.0000
    timing_match_rate: 50.0000 ± 0.0000
    temp_matched: 1.0000 ± 0.0000
    temp_mismatched: 8.0000 ± 0.0000
    temp_match_rate: 11.1111 ± 0.0000
    full_matched: 1.0000 ± 0.0000
    full_mismatched: 19.0000 ± 0.0000
    full_match_rate: 5.0000 ± 0.0000
    accuracy: 5.0000 ± 0.0000
    average_timing_score: 0.5738 ± 0.0126
    average_temp_score: 0.6110 ± 0.0000
    average_full_score: 0.3361 ± 0.0160
    average_best_codebleu:
      average_CodeBLEU: 0.3949 ± 0.0140
      comb_7_CodeBLEU: 0.5224 ± 0.0125
      dataflow_match_score: 0.5796 ± 0.0304
      ngram_match_score: 0.1726 ± 0.0162
      syntax_match_score: 0.6019 ± 0.0216
      weighted_ngram_match_score: 0.1921 ± 0.0170
    average_best_levenshtein: 152.3667 ± 0.9018
    average_best_normalized_levenshtein: 0.4631 ± 0.0050
    average_inference_time: 2.1737 ± 0.4104
==================================================
================================================================================

grok-4-fast-or
-------
Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 90.4412 ± 2.2059
total_entries: 136.0000 ± 0.0000
correct_matches: 123.0000 ± 3.0000
average_execution_time: 4.4297 ± 1.0961
metrics_by_complexity:
  simple:
    count: 116.0000 ± 0.0000
    full_matches: 111.0000 ± 3.0000
    exact_code_matches: 104.0000 ± 1.0000
    exact_code_match_rate: 89.6552 ± 0.8621
    pv_exact_matched: 112.6667 ± 1.1547
    pv_exact_mismatched: 3.3333 ± 1.1547
    pv_exact_match_rate: 97.1264 ± 0.9954
    average_pv_match_rate: 98.8506 ± 0.5973
    average_pv_mismatch_rate: 1.1494 ± 0.5973
    timing_matched: 113.3333 ± 2.5166
    timing_mismatched: 2.6667 ± 2.5166
    timing_match_rate: 97.7011 ± 2.1695
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 111.0000 ± 3.0000
    full_mismatched: 5.0000 ± 3.0000
    full_match_rate: 95.6897 ± 2.5862
    accuracy: 95.6897 ± 2.5862
    average_timing_score: 0.9860 ± 0.0083
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.9880 ± 0.0063
    average_best_codebleu:
      average_CodeBLEU: 0.6255 ± 0.0022
      comb_7_CodeBLEU: 0.8421 ± 0.0014
      dataflow_match_score: 0.3010 ± 0.0087
      ngram_match_score: 0.3074 ± 0.0032
      syntax_match_score: 0.9735 ± 0.0027
      weighted_ngram_match_score: 0.2218 ± 0.0038
    average_best_levenshtein: 1.6695 ± 0.0199
    average_best_normalized_levenshtein: 0.0250 ± 0.0021
    average_inference_time: 3.4277 ± 1.1654
  complex:
    count: 20.0000 ± 0.0000
    full_matches: 12.0000 ± 0.0000
    exact_code_matches: 0.0000 ± 0.0000
    exact_code_match_rate: 0.0000 ± 0.0000
    pv_exact_matched: 13.6667 ± 0.5774
    pv_exact_mismatched: 6.3333 ± 0.5774
    pv_exact_match_rate: 68.3333 ± 2.8868
    average_pv_match_rate: 90.0467 ± 0.7945
    average_pv_mismatch_rate: 9.9533 ± 0.7945
    timing_matched: 16.0000 ± 0.0000
    timing_mismatched: 4.0000 ± 0.0000
    timing_match_rate: 80.0000 ± 0.0000
    temp_matched: 8.0000 ± 0.0000
    temp_mismatched: 1.0000 ± 0.0000
    temp_match_rate: 88.8889 ± 0.0000
    full_matched: 12.0000 ± 0.0000
    full_mismatched: 8.0000 ± 0.0000
    full_match_rate: 60.0000 ± 0.0000
    accuracy: 60.0000 ± 0.0000
    average_timing_score: 0.8869 ± 0.0126
    average_temp_score: 0.9826 ± 0.0018
    average_full_score: 0.9026 ± 0.0070
    average_best_codebleu:
      average_CodeBLEU: 0.4392 ± 0.0197
      comb_7_CodeBLEU: 0.5739 ± 0.0178
      dataflow_match_score: 0.5403 ± 0.0441
      ngram_match_score: 0.1906 ± 0.0142
      syntax_match_score: 0.6706 ± 0.0248
      weighted_ngram_match_score: 0.2384 ± 0.0324
    average_best_levenshtein: 148.9833 ± 1.9788
    average_best_normalized_levenshtein: 0.4742 ± 0.0097
    average_inference_time: 10.2410 ± 0.7069
==================================================
================================================================================

bs_2
-------
Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 5.8824 ± 0.0000
total_entries: 136.0000 ± 0.0000
correct_matches: 8.0000 ± 0.0000
average_execution_time: 0.0000 ± 0.0000
metrics_by_complexity:
  complex:
    count: 20.0000 ± 0.0000
    full_matches: 8.0000 ± 0.0000
    exact_code_matches: 0.0000 ± 0.0000
    exact_code_match_rate: 0.0000 ± 0.0000
    pv_exact_matched: 13.0000 ± 0.0000
    pv_exact_mismatched: 7.0000 ± 0.0000
    pv_exact_match_rate: 65.0000 ± 0.0000
    average_pv_match_rate: 85.2366 ± 0.0000
    average_pv_mismatch_rate: 14.7634 ± 0.0000
    timing_matched: 12.0000 ± 0.0000
    timing_mismatched: 8.0000 ± 0.0000
    timing_match_rate: 60.0000 ± 0.0000
    temp_matched: 7.0000 ± 0.0000
    temp_mismatched: 2.0000 ± 0.0000
    temp_match_rate: 77.7778 ± 0.0000
    full_matched: 8.0000 ± 0.0000
    full_mismatched: 12.0000 ± 0.0000
    full_match_rate: 40.0000 ± 0.0000
    accuracy: 40.0000 ± 0.0000
    average_timing_score: 0.7954 ± 0.0000
    average_temp_score: 0.9659 ± 0.0000
    average_full_score: 0.8458 ± 0.0000
    average_best_codebleu:
      average_CodeBLEU: 0.3423 ± 0.0000
      comb_7_CodeBLEU: 0.4863 ± 0.0000
      dataflow_match_score: 0.5264 ± 0.0000
      ngram_match_score: 0.0805 ± 0.0000
      syntax_match_score: 0.5882 ± 0.0000
      weighted_ngram_match_score: 0.1240 ± 0.0000
    average_best_levenshtein: 138.3500 ± 0.0000
    average_best_normalized_levenshtein: 0.4283 ± 0.0000
    average_inference_time: 0.0000 ± 0.0000
==================================================
================================================================================

claude-opus-4-20250514-thinking
-------
Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 89.7059 ± 1.9454
total_entries: 136.0000 ± 0.0000
correct_matches: 122.0000 ± 2.6458
average_execution_time: 6.6888 ± 0.2028
metrics_by_complexity:
  simple:
    count: 116.0000 ± 0.0000
    full_matches: 110.6667 ± 1.1547
    exact_code_matches: 102.3333 ± 1.5275
    exact_code_match_rate: 88.2184 ± 1.3168
    pv_exact_matched: 113.6667 ± 1.1547
    pv_exact_mismatched: 2.3333 ± 1.1547
    pv_exact_match_rate: 97.9885 ± 0.9954
    average_pv_match_rate: 98.5722 ± 0.9877
    average_pv_mismatch_rate: 1.4278 ± 0.9877
    timing_matched: 111.0000 ± 1.0000
    timing_mismatched: 5.0000 ± 1.0000
    timing_match_rate: 95.6897 ± 0.8621
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 110.6667 ± 1.1547
    full_mismatched: 5.3333 ± 1.1547
    full_match_rate: 95.4023 ± 0.9954
    accuracy: 95.4023 ± 0.9954
    average_timing_score: 0.9744 ± 0.0093
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.9835 ± 0.0096
    average_best_codebleu:
      average_CodeBLEU: 0.6326 ± 0.0055
      comb_7_CodeBLEU: 0.8499 ± 0.0036
      dataflow_match_score: 0.2960 ± 0.0100
      ngram_match_score: 0.3108 ± 0.0092
      syntax_match_score: 0.9894 ± 0.0046
      weighted_ngram_match_score: 0.2302 ± 0.0085
    average_best_levenshtein: 1.7500 ± 0.3511
    average_best_normalized_levenshtein: 0.0259 ± 0.0046
    average_inference_time: 5.7371 ± 0.0645
  complex:
    count: 20.0000 ± 0.0000
    full_matches: 11.3333 ± 1.5275
    exact_code_matches: 0.0000 ± 0.0000
    exact_code_match_rate: 0.0000 ± 0.0000
    pv_exact_matched: 12.3333 ± 1.5275
    pv_exact_mismatched: 7.6667 ± 1.5275
    pv_exact_match_rate: 61.6667 ± 7.6376
    average_pv_match_rate: 90.0652 ± 2.7424
    average_pv_mismatch_rate: 9.9348 ± 2.7424
    timing_matched: 17.6667 ± 0.5774
    timing_mismatched: 2.3333 ± 0.5774
    timing_match_rate: 88.3333 ± 2.8868
    temp_matched: 7.3333 ± 0.5774
    temp_mismatched: 1.6667 ± 0.5774
    temp_match_rate: 81.4815 ± 6.4150
    full_matched: 11.3333 ± 1.5275
    full_mismatched: 8.6667 ± 1.5275
    full_match_rate: 56.6667 ± 7.6376
    accuracy: 56.6667 ± 7.6376
    average_timing_score: 0.8331 ± 0.0124
    average_temp_score: 0.9737 ± 0.0071
    average_full_score: 0.8866 ± 0.0238
    average_best_codebleu:
      average_CodeBLEU: 0.4537 ± 0.0059
      comb_7_CodeBLEU: 0.5776 ± 0.0104
      dataflow_match_score: 0.5077 ± 0.0188
      ngram_match_score: 0.2326 ± 0.0045
      syntax_match_score: 0.7125 ± 0.0153
      weighted_ngram_match_score: 0.2622 ± 0.0042
    average_best_levenshtein: 142.8167 ± 4.6047
    average_best_normalized_levenshtein: 0.4334 ± 0.0056
    average_inference_time: 12.2086 ± 1.1828
==================================================
================================================================================

debug
-------
Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 90.9314 ± 1.5306
total_entries: 136.0000 ± 0.0000
correct_matches: 123.6667 ± 2.0817
average_execution_time: 0.0000 ± 0.0000
metrics_by_complexity:
  simple:
    count: 116.0000 ± 0.0000
    full_matches: 103.6667 ± 2.0817
    exact_code_matches: 0.0000 ± 0.0000
    exact_code_match_rate: 0.0000 ± 0.0000
    pv_exact_matched: 113.0000 ± 1.7321
    pv_exact_mismatched: 3.0000 ± 1.7321
    pv_exact_match_rate: 97.4138 ± 1.4931
    average_pv_match_rate: 98.7069 ± 0.7466
    average_pv_mismatch_rate: 1.2931 ± 0.7466
    timing_matched: 107.0000 ± 2.0000
    timing_mismatched: 9.0000 ± 2.0000
    timing_match_rate: 92.2414 ± 1.7241
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.3333 ± 0.5774
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 103.6667 ± 2.0817
    full_mismatched: 12.3333 ± 2.0817
    full_match_rate: 89.3678 ± 1.7945
    accuracy: 89.3678 ± 1.7945
    average_timing_score: 0.9595 ± 0.0059
    average_temp_score: 0.9971 ± 0.0050
    average_full_score: 0.9810 ± 0.0064
    average_best_codebleu:
      average_CodeBLEU: 0.5952 ± 0.0000
      comb_7_CodeBLEU: 0.8381 ± 0.0000
      dataflow_match_score: 0.2241 ± 0.0000
      ngram_match_score: 0.2438 ± 0.0000
      syntax_match_score: 1.0000 ± 0.0000
      weighted_ngram_match_score: 0.1371 ± 0.0000
    average_best_levenshtein: 0.0000 ± 0.0000
    average_best_normalized_levenshtein: 0.0000 ± 0.0000
    average_inference_time: 0.0000 ± 0.0000
  complex:
    count: 20.0000 ± 0.0000
    full_matches: 20.0000 ± 0.0000
    exact_code_matches: 0.0000 ± 0.0000
    exact_code_match_rate: 0.0000 ± 0.0000
    pv_exact_matched: 20.0000 ± 0.0000
    pv_exact_mismatched: 0.0000 ± 0.0000
    pv_exact_match_rate: 100.0000 ± 0.0000
    average_pv_match_rate: 100.0000 ± 0.0000
    average_pv_mismatch_rate: 0.0000 ± 0.0000
    timing_matched: 20.0000 ± 0.0000
    timing_mismatched: 0.0000 ± 0.0000
    timing_match_rate: 100.0000 ± 0.0000
    temp_matched: 9.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 100.0000 ± 0.0000
    full_matched: 20.0000 ± 0.0000
    full_mismatched: 0.0000 ± 0.0000
    full_match_rate: 100.0000 ± 0.0000
    accuracy: 100.0000 ± 0.0000
    average_timing_score: 0.9902 ± 0.0043
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.9980 ± 0.0009
    average_best_codebleu:
      average_CodeBLEU: 1.0000 ± 0.0000
      comb_7_CodeBLEU: 1.0000 ± 0.0000
      dataflow_match_score: 1.0000 ± 0.0000
      ngram_match_score: 1.0000 ± 0.0000
      syntax_match_score: 1.0000 ± 0.0000
      weighted_ngram_match_score: 1.0000 ± 0.0000
    average_best_levenshtein: 0.0000 ± 0.0000
    average_best_normalized_levenshtein: 0.0000 ± 0.0000
    average_inference_time: 0.0000 ± 0.0000
==================================================
================================================================================

claude-sonnet-4-20250514-thinking
-------
Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 88.7255 ± 1.5306
total_entries: 136.0000 ± 0.0000
correct_matches: 120.6667 ± 2.0817
average_execution_time: 5.6023 ± 0.2024
metrics_by_complexity:
  simple:
    count: 116.0000 ± 0.0000
    full_matches: 111.0000 ± 2.0000
    exact_code_matches: 106.3333 ± 2.0817
    exact_code_match_rate: 91.6667 ± 1.7945
    pv_exact_matched: 113.6667 ± 1.5275
    pv_exact_mismatched: 2.3333 ± 1.5275
    pv_exact_match_rate: 97.9885 ± 1.3168
    average_pv_match_rate: 99.0900 ± 0.8419
    average_pv_mismatch_rate: 0.9100 ± 0.8419
    timing_matched: 111.6667 ± 1.1547
    timing_mismatched: 4.3333 ± 1.1547
    timing_match_rate: 96.2644 ± 0.9954
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 111.0000 ± 2.0000
    full_mismatched: 5.0000 ± 2.0000
    full_match_rate: 95.6897 ± 1.7241
    accuracy: 95.6897 ± 1.7241
    average_timing_score: 0.9806 ± 0.0053
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.9888 ± 0.0078
    average_best_codebleu:
      average_CodeBLEU: 0.6320 ± 0.0064
      comb_7_CodeBLEU: 0.8507 ± 0.0042
      dataflow_match_score: 0.3046 ± 0.0100
      ngram_match_score: 0.3120 ± 0.0110
      syntax_match_score: 0.9930 ± 0.0053
      weighted_ngram_match_score: 0.2231 ± 0.0097
    average_best_levenshtein: 1.2328 ± 0.1781
    average_best_normalized_levenshtein: 0.0187 ± 0.0069
    average_inference_time: 4.9973 ± 0.2391
  complex:
    count: 20.0000 ± 0.0000
    full_matches: 9.6667 ± 0.5774
    exact_code_matches: 0.0000 ± 0.0000
    exact_code_match_rate: 0.0000 ± 0.0000
    pv_exact_matched: 11.6667 ± 0.5774
    pv_exact_mismatched: 8.3333 ± 0.5774
    pv_exact_match_rate: 58.3333 ± 2.8868
    average_pv_match_rate: 91.8265 ± 1.3752
    average_pv_mismatch_rate: 8.1735 ± 1.3752
    timing_matched: 16.3333 ± 0.5774
    timing_mismatched: 3.6667 ± 0.5774
    timing_match_rate: 81.6667 ± 2.8868
    temp_matched: 8.3333 ± 0.5774
    temp_mismatched: 0.6667 ± 0.5774
    temp_match_rate: 92.5926 ± 6.4150
    full_matched: 9.6667 ± 0.5774
    full_mismatched: 10.3333 ± 0.5774
    full_match_rate: 48.3333 ± 2.8868
    accuracy: 48.3333 ± 2.8868
    average_timing_score: 0.8555 ± 0.0125
    average_temp_score: 0.9860 ± 0.0115
    average_full_score: 0.9073 ± 0.0131
    average_best_codebleu:
      average_CodeBLEU: 0.4875 ± 0.0116
      comb_7_CodeBLEU: 0.6144 ± 0.0108
      dataflow_match_score: 0.5828 ± 0.0140
      ngram_match_score: 0.2496 ± 0.0160
      syntax_match_score: 0.7150 ± 0.0068
      weighted_ngram_match_score: 0.3028 ± 0.0101
    average_best_levenshtein: 135.7000 ± 14.4074
    average_best_normalized_levenshtein: 0.4077 ± 0.0214
    average_inference_time: 9.1111 ± 0.4007
==================================================
================================================================================

bs_1
-------
Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 8.0882 ± 0.0000
total_entries: 136.0000 ± 0.0000
correct_matches: 11.0000 ± 0.0000
average_execution_time: 0.0000 ± 0.0000
metrics_by_complexity:
  complex:
    count: 20.0000 ± 0.0000
    full_matches: 11.0000 ± 0.0000
    exact_code_matches: 0.0000 ± 0.0000
    exact_code_match_rate: 0.0000 ± 0.0000
    pv_exact_matched: 14.0000 ± 0.0000
    pv_exact_mismatched: 6.0000 ± 0.0000
    pv_exact_match_rate: 70.0000 ± 0.0000
    average_pv_match_rate: 94.5400 ± 0.0000
    average_pv_mismatch_rate: 5.4600 ± 0.0000
    timing_matched: 15.0000 ± 0.0000
    timing_mismatched: 5.0000 ± 0.0000
    timing_match_rate: 75.0000 ± 0.0000
    temp_matched: 9.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 100.0000 ± 0.0000
    full_matched: 11.0000 ± 0.0000
    full_mismatched: 9.0000 ± 0.0000
    full_match_rate: 55.0000 ± 0.0000
    accuracy: 55.0000 ± 0.0000
    average_timing_score: 0.8683 ± 0.0000
    average_temp_score: 0.9951 ± 0.0000
    average_full_score: 0.9349 ± 0.0000
    average_best_codebleu:
      average_CodeBLEU: 0.4092 ± 0.0000
      comb_7_CodeBLEU: 0.5415 ± 0.0000
      dataflow_match_score: 0.5175 ± 0.0000
      ngram_match_score: 0.1757 ± 0.0000
      syntax_match_score: 0.6418 ± 0.0000
      weighted_ngram_match_score: 0.2019 ± 0.0000
    average_best_levenshtein: 170.5000 ± 0.0000
    average_best_normalized_levenshtein: 0.4484 ± 0.0000
    average_inference_time: 0.0000 ± 0.0000
==================================================
================================================================================

gpt-4o-abacus
-------
Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 86.2745 ± 0.4245
total_entries: 136.0000 ± 0.0000
correct_matches: 117.3333 ± 0.5774
average_execution_time: 2.5186 ± 0.4962
metrics_by_complexity:
  simple:
    count: 116.0000 ± 0.0000
    full_matches: 107.6667 ± 0.5774
    exact_code_matches: 93.6667 ± 0.5774
    exact_code_match_rate: 80.7471 ± 0.4977
    pv_exact_matched: 113.0000 ± 0.0000
    pv_exact_mismatched: 3.0000 ± 0.0000
    pv_exact_match_rate: 97.4138 ± 0.0000
    average_pv_match_rate: 97.9885 ± 0.0000
    average_pv_mismatch_rate: 2.0115 ± 0.0000
    timing_matched: 107.6667 ± 0.5774
    timing_mismatched: 8.3333 ± 0.5774
    timing_match_rate: 92.8161 ± 0.4977
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 107.6667 ± 0.5774
    full_mismatched: 8.3333 ± 0.5774
    full_match_rate: 92.8161 ± 0.4977
    accuracy: 92.8161 ± 0.4977
    average_timing_score: 0.9590 ± 0.0021
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.9757 ± 0.0004
    average_best_codebleu:
      average_CodeBLEU: 0.5897 ± 0.0019
      comb_7_CodeBLEU: 0.8132 ± 0.0010
      dataflow_match_score: 0.2730 ± 0.0050
      ngram_match_score: 0.2587 ± 0.0037
      syntax_match_score: 0.9242 ± 0.0023
      weighted_ngram_match_score: 0.1760 ± 0.0042
    average_best_levenshtein: 3.6753 ± 0.2451
    average_best_normalized_levenshtein: 0.0467 ± 0.0006
    average_inference_time: 2.3411 ± 0.6115
  complex:
    count: 20.0000 ± 0.0000
    full_matches: 9.6667 ± 1.1547
    exact_code_matches: 0.0000 ± 0.0000
    exact_code_match_rate: 0.0000 ± 0.0000
    pv_exact_matched: 10.6667 ± 0.5774
    pv_exact_mismatched: 9.3333 ± 0.5774
    pv_exact_match_rate: 53.3333 ± 2.8868
    average_pv_match_rate: 82.9435 ± 1.5548
    average_pv_mismatch_rate: 17.0565 ± 1.5548
    timing_matched: 13.6667 ± 0.5774
    timing_mismatched: 6.3333 ± 0.5774
    timing_match_rate: 68.3333 ± 2.8868
    temp_matched: 8.0000 ± 0.0000
    temp_mismatched: 1.0000 ± 0.0000
    temp_match_rate: 88.8889 ± 0.0000
    full_matched: 9.6667 ± 1.1547
    full_mismatched: 10.3333 ± 1.1547
    full_match_rate: 48.3333 ± 5.7735
    accuracy: 48.3333 ± 5.7735
    average_timing_score: 0.8246 ± 0.0058
    average_temp_score: 0.9816 ± 0.0000
    average_full_score: 0.8367 ± 0.0108
    average_best_codebleu:
      average_CodeBLEU: 0.4251 ± 0.0073
      comb_7_CodeBLEU: 0.5440 ± 0.0063
      dataflow_match_score: 0.4850 ± 0.0085
      ngram_match_score: 0.2116 ± 0.0065
      syntax_match_score: 0.6616 ± 0.0081
      weighted_ngram_match_score: 0.2423 ± 0.0155
    average_best_levenshtein: 143.1833 ± 3.5080
    average_best_normalized_levenshtein: 0.4421 ± 0.0092
    average_inference_time: 3.5484 ± 0.2934
==================================================
================================================================================

gpt-4o
-------
Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 87.5000 ± 1.2736
total_entries: 136.0000 ± 0.0000
correct_matches: 119.0000 ± 1.7321
average_execution_time: 0.9371 ± 0.0193
metrics_by_complexity:
  simple:
    count: 116.0000 ± 0.0000
    full_matches: 109.6667 ± 0.5774
    exact_code_matches: 102.6667 ± 0.5774
    exact_code_match_rate: 88.5057 ± 0.4977
    pv_exact_matched: 113.0000 ± 0.0000
    pv_exact_mismatched: 3.0000 ± 0.0000
    pv_exact_match_rate: 97.4138 ± 0.0000
    average_pv_match_rate: 98.4195 ± 0.0000
    average_pv_mismatch_rate: 1.5805 ± 0.0000
    timing_matched: 110.6667 ± 0.5774
    timing_mismatched: 5.3333 ± 0.5774
    timing_match_rate: 95.4023 ± 0.4977
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 109.6667 ± 0.5774
    full_mismatched: 6.3333 ± 0.5774
    full_match_rate: 94.5402 ± 0.4977
    accuracy: 94.5402 ± 0.4977
    average_timing_score: 0.9746 ± 0.0016
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.9823 ± 0.0003
    average_best_codebleu:
      average_CodeBLEU: 0.6112 ± 0.0014
      comb_7_CodeBLEU: 0.8396 ± 0.0018
      dataflow_match_score: 0.2845 ± 0.0000
      ngram_match_score: 0.2736 ± 0.0008
      syntax_match_score: 0.9837 ± 0.0041
      weighted_ngram_match_score: 0.1873 ± 0.0007
    average_best_levenshtein: 1.9023 ± 0.0697
    average_best_normalized_levenshtein: 0.0352 ± 0.0014
    average_inference_time: 0.7913 ± 0.0252
  complex:
    count: 20.0000 ± 0.0000
    full_matches: 9.3333 ± 1.1547
    exact_code_matches: 0.0000 ± 0.0000
    exact_code_match_rate: 0.0000 ± 0.0000
    pv_exact_matched: 11.6667 ± 0.5774
    pv_exact_mismatched: 8.3333 ± 0.5774
    pv_exact_match_rate: 58.3333 ± 2.8868
    average_pv_match_rate: 86.0879 ± 1.2333
    average_pv_mismatch_rate: 13.9121 ± 1.2333
    timing_matched: 14.3333 ± 1.1547
    timing_mismatched: 5.6667 ± 1.1547
    timing_match_rate: 71.6667 ± 5.7735
    temp_matched: 7.6667 ± 0.5774
    temp_mismatched: 1.3333 ± 0.5774
    temp_match_rate: 85.1852 ± 6.4150
    full_matched: 9.3333 ± 1.1547
    full_mismatched: 10.6667 ± 1.1547
    full_match_rate: 46.6667 ± 5.7735
    accuracy: 46.6667 ± 5.7735
    average_timing_score: 0.8063 ± 0.0103
    average_temp_score: 0.9761 ± 0.0061
    average_full_score: 0.8504 ± 0.0125
    average_best_codebleu:
      average_CodeBLEU: 0.4257 ± 0.0044
      comb_7_CodeBLEU: 0.5426 ± 0.0044
      dataflow_match_score: 0.4848 ± 0.0079
      ngram_match_score: 0.2221 ± 0.0057
      syntax_match_score: 0.6563 ± 0.0032
      weighted_ngram_match_score: 0.2397 ± 0.0048
    average_best_levenshtein: 156.0500 ± 11.6487
    average_best_normalized_levenshtein: 0.4607 ± 0.0119
    average_inference_time: 1.7829 ± 0.0166
==================================================
================================================================================

qwen2.5-coder
-------
Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 27.9412 ± 0.0000
total_entries: 136.0000 ± 0.0000
correct_matches: 38.0000 ± 0.0000
average_execution_time: 3.0102 ± 0.0398
metrics_by_complexity:
  simple:
    count: 116.0000 ± 0.0000
    full_matches: 37.0000 ± 0.0000
    exact_code_matches: 33.0000 ± 0.0000
    exact_code_match_rate: 28.4483 ± 0.0000
    pv_exact_matched: 37.0000 ± 0.0000
    pv_exact_mismatched: 79.0000 ± 0.0000
    pv_exact_match_rate: 31.8966 ± 0.0000
    average_pv_match_rate: 36.6379 ± 0.0000
    average_pv_mismatch_rate: 63.3621 ± 0.0000
    timing_matched: 48.0000 ± 0.0000
    timing_mismatched: 68.0000 ± 0.0000
    timing_match_rate: 41.3793 ± 0.0000
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 37.0000 ± 0.0000
    full_mismatched: 79.0000 ± 0.0000
    full_match_rate: 31.8966 ± 0.0000
    accuracy: 31.8966 ± 0.0000
    average_timing_score: 0.4138 ± 0.0000
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.3759 ± 0.0000
    average_best_codebleu:
      average_CodeBLEU: 0.4586 ± 0.0000
      comb_7_CodeBLEU: 0.6903 ± 0.0000
      dataflow_match_score: 0.2414 ± 0.0000
      ngram_match_score: 0.0885 ± 0.0000
      syntax_match_score: 0.6894 ± 0.0000
      weighted_ngram_match_score: 0.0566 ± 0.0000
    average_best_levenshtein: 11.3879 ± 0.0000
    average_best_normalized_levenshtein: 0.3198 ± 0.0000
    average_inference_time: 2.7161 ± 0.0138
  complex:
    count: 20.0000 ± 0.0000
    full_matches: 1.0000 ± 0.0000
    exact_code_matches: 1.0000 ± 0.0000
    exact_code_match_rate: 5.0000 ± 0.0000
    pv_exact_matched: 1.0000 ± 0.0000
    pv_exact_mismatched: 19.0000 ± 0.0000
    pv_exact_match_rate: 5.0000 ± 0.0000
    average_pv_match_rate: 37.9355 ± 0.9073
    average_pv_mismatch_rate: 62.0645 ± 0.9073
    timing_matched: 11.0000 ± 0.0000
    timing_mismatched: 9.0000 ± 0.0000
    timing_match_rate: 55.0000 ± 0.0000
    temp_matched: 3.0000 ± 0.0000
    temp_mismatched: 6.0000 ± 0.0000
    temp_match_rate: 33.3333 ± 0.0000
    full_matched: 1.0000 ± 0.0000
    full_mismatched: 19.0000 ± 0.0000
    full_match_rate: 5.0000 ± 0.0000
    accuracy: 5.0000 ± 0.0000
    average_timing_score: 0.5857 ± 0.0020
    average_temp_score: 0.7878 ± 0.0000
    average_full_score: 0.4274 ± 0.0058
    average_best_codebleu:
      average_CodeBLEU: 0.4447 ± 0.0003
      comb_7_CodeBLEU: 0.5702 ± 0.0003
      dataflow_match_score: 0.5641 ± 0.0000
      ngram_match_score: 0.2281 ± 0.0002
      syntax_match_score: 0.6435 ± 0.0005
      weighted_ngram_match_score: 0.2432 ± 0.0010
    average_best_levenshtein: 126.4667 ± 0.1443
    average_best_normalized_levenshtein: 0.4084 ± 0.0011
    average_inference_time: 4.7160 ± 0.2034
==================================================
================================================================================

gpt-5-minimal
-------
Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 83.5784 ± 2.9717
total_entries: 136.0000 ± 0.0000
correct_matches: 113.6667 ± 4.0415
average_execution_time: 1.6818 ± 0.0416
metrics_by_complexity:
  simple:
    count: 116.0000 ± 0.0000
    full_matches: 107.0000 ± 2.6458
    exact_code_matches: 97.6667 ± 2.5166
    exact_code_match_rate: 84.1954 ± 2.1695
    pv_exact_matched: 109.0000 ± 2.6458
    pv_exact_mismatched: 7.0000 ± 2.6458
    pv_exact_match_rate: 93.9655 ± 2.2808
    average_pv_match_rate: 95.7328 ± 2.2432
    average_pv_mismatch_rate: 4.2672 ± 2.2432
    timing_matched: 108.3333 ± 2.5166
    timing_mismatched: 7.6667 ± 2.5166
    timing_match_rate: 93.3908 ± 2.1695
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 107.0000 ± 2.6458
    full_mismatched: 9.0000 ± 2.6458
    full_match_rate: 92.2414 ± 2.2808
    accuracy: 92.2414 ± 2.2808
    average_timing_score: 0.9534 ± 0.0217
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.9565 ± 0.0222
    average_best_codebleu:
      average_CodeBLEU: 0.5950 ± 0.0094
      comb_7_CodeBLEU: 0.8216 ± 0.0103
      dataflow_match_score: 0.2759 ± 0.0000
      ngram_match_score: 0.2564 ± 0.0095
      syntax_match_score: 0.9454 ± 0.0219
      weighted_ngram_match_score: 0.1782 ± 0.0071
    average_best_levenshtein: 5.9080 ± 1.3090
    average_best_normalized_levenshtein: 0.0653 ± 0.0175
    average_inference_time: 1.2721 ± 0.0817
  complex:
    count: 20.0000 ± 0.0000
    full_matches: 6.6667 ± 1.5275
    exact_code_matches: 0.0000 ± 0.0000
    exact_code_match_rate: 0.0000 ± 0.0000
    pv_exact_matched: 7.6667 ± 2.3094
    pv_exact_mismatched: 12.3333 ± 2.3094
    pv_exact_match_rate: 38.3333 ± 11.5470
    average_pv_match_rate: 83.6603 ± 4.4532
    average_pv_mismatch_rate: 16.3397 ± 4.4532
    timing_matched: 17.0000 ± 1.0000
    timing_mismatched: 3.0000 ± 1.0000
    timing_match_rate: 85.0000 ± 5.0000
    temp_matched: 8.0000 ± 1.0000
    temp_mismatched: 1.0000 ± 1.0000
    temp_match_rate: 88.8889 ± 11.1111
    full_matched: 6.6667 ± 1.5275
    full_mismatched: 13.3333 ± 1.5275
    full_match_rate: 33.3333 ± 7.6376
    accuracy: 33.3333 ± 7.6376
    average_timing_score: 0.8155 ± 0.0232
    average_temp_score: 0.9831 ± 0.0134
    average_full_score: 0.8386 ± 0.0376
    average_best_codebleu:
      average_CodeBLEU: 0.3742 ± 0.0141
      comb_7_CodeBLEU: 0.4977 ± 0.0126
      dataflow_match_score: 0.4867 ± 0.0317
      ngram_match_score: 0.1460 ± 0.0182
      syntax_match_score: 0.5902 ± 0.0237
      weighted_ngram_match_score: 0.1907 ± 0.0164
    average_best_levenshtein: 220.0333 ± 20.7356
    average_best_normalized_levenshtein: 0.5170 ± 0.0118
    average_inference_time: 4.0582 ± 0.3519
==================================================
================================================================================

mistral
-------
Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 13.7255 ± 0.4245
total_entries: 136.0000 ± 0.0000
correct_matches: 18.6667 ± 0.5774
average_execution_time: 1.0097 ± 0.0581
metrics_by_complexity:
  simple:
    count: 116.0000 ± 0.0000
    full_matches: 18.6667 ± 0.5774
    exact_code_matches: 16.6667 ± 0.5774
    exact_code_match_rate: 14.3678 ± 0.4977
    pv_exact_matched: 18.6667 ± 0.5774
    pv_exact_mismatched: 97.3333 ± 0.5774
    pv_exact_match_rate: 16.0920 ± 0.4977
    average_pv_match_rate: 19.9713 ± 0.4977
    average_pv_mismatch_rate: 80.0287 ± 0.4977
    timing_matched: 27.6667 ± 0.5774
    timing_mismatched: 88.3333 ± 0.5774
    timing_match_rate: 23.8506 ± 0.4977
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 18.6667 ± 0.5774
    full_mismatched: 97.3333 ± 0.5774
    full_match_rate: 16.0920 ± 0.4977
    accuracy: 16.0920 ± 0.4977
    average_timing_score: 0.2375 ± 0.0050
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.2073 ± 0.0050
    average_best_codebleu:
      average_CodeBLEU: 0.4586 ± 0.0002
      comb_7_CodeBLEU: 0.7035 ± 0.0004
      dataflow_match_score: 0.2443 ± 0.0050
      ngram_match_score: 0.0578 ± 0.0006
      syntax_match_score: 0.7337 ± 0.0011
      weighted_ngram_match_score: 0.0429 ± 0.0002
    average_best_levenshtein: 15.3046 ± 0.1095
    average_best_normalized_levenshtein: 0.3854 ± 0.0022
    average_inference_time: 0.8824 ± 0.0702
  complex:
    count: 20.0000 ± 0.0000
    full_matches: 0.0000 ± 0.0000
    exact_code_matches: 0.0000 ± 0.0000
    exact_code_match_rate: 0.0000 ± 0.0000
    pv_exact_matched: 0.0000 ± 0.0000
    pv_exact_mismatched: 20.0000 ± 0.0000
    pv_exact_match_rate: 0.0000 ± 0.0000
    average_pv_match_rate: 5.7462 ± 0.0000
    average_pv_mismatch_rate: 94.2538 ± 0.0000
    timing_matched: 2.0000 ± 0.0000
    timing_mismatched: 18.0000 ± 0.0000
    timing_match_rate: 10.0000 ± 0.0000
    temp_matched: 1.0000 ± 0.0000
    temp_mismatched: 8.0000 ± 0.0000
    temp_match_rate: 11.1111 ± 0.0000
    full_matched: 0.0000 ± 0.0000
    full_mismatched: 20.0000 ± 0.0000
    full_match_rate: 0.0000 ± 0.0000
    accuracy: 0.0000 ± 0.0000
    average_timing_score: 0.1090 ± 0.0000
    average_temp_score: 0.6000 ± 0.0000
    average_full_score: 0.0686 ± 0.0000
    average_best_codebleu:
      average_CodeBLEU: 0.2836 ± 0.0000
      comb_7_CodeBLEU: 0.3964 ± 0.0000
      dataflow_match_score: 0.4052 ± 0.0000
      ngram_match_score: 0.0849 ± 0.0000
      syntax_match_score: 0.4881 ± 0.0000
      weighted_ngram_match_score: 0.1060 ± 0.0000
    average_best_levenshtein: 191.6000 ± 0.0000
    average_best_normalized_levenshtein: 0.6300 ± 0.0000
    average_inference_time: 1.7481 ± 0.0125
==================================================
================================================================================

phi3.5-fp16
-------
Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 0.2451 ± 0.4245
total_entries: 136.0000 ± 0.0000
correct_matches: 0.3333 ± 0.5774
average_execution_time: 1.3105 ± 1.2108
metrics_by_complexity:
  simple:
    count: 116.0000 ± 0.0000
    full_matches: 0.3333 ± 0.5774
    exact_code_matches: 0.3333 ± 0.5774
    exact_code_match_rate: 0.2874 ± 0.4977
    pv_exact_matched: 0.3333 ± 0.5774
    pv_exact_mismatched: 115.6667 ± 0.5774
    pv_exact_match_rate: 0.2874 ± 0.4977
    average_pv_match_rate: 0.6322 ± 1.0950
    average_pv_mismatch_rate: 99.3678 ± 1.0950
    timing_matched: 0.3333 ± 0.5774
    timing_mismatched: 115.6667 ± 0.5774
    timing_match_rate: 0.2874 ± 0.4977
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 0.3333 ± 0.5774
    full_mismatched: 115.6667 ± 0.5774
    full_match_rate: 0.2874 ± 0.4977
    accuracy: 0.2874 ± 0.4977
    average_timing_score: 0.0048 ± 0.0082
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.0060 ± 0.0104
    average_best_codebleu:
      average_CodeBLEU: 0.2579 ± 0.0137
      comb_7_CodeBLEU: 0.4121 ± 0.0210
      dataflow_match_score: 0.0402 ± 0.0697
      ngram_match_score: 0.0009 ± 0.0016
      syntax_match_score: 0.0299 ± 0.0518
      weighted_ngram_match_score: 0.0008 ± 0.0014
    average_best_levenshtein: 358.7845 ± 585.7318
    average_best_normalized_levenshtein: 0.9912 ± 0.0152
    average_inference_time: 1.4315 ± 1.4194
  complex:
    count: 20.0000 ± 0.0000
    full_matches: 0.0000 ± 0.0000
    exact_code_matches: 0.0000 ± 0.0000
    exact_code_match_rate: 0.0000 ± 0.0000
    pv_exact_matched: 0.0000 ± 0.0000
    pv_exact_mismatched: 20.0000 ± 0.0000
    pv_exact_match_rate: 0.0000 ± 0.0000
    average_pv_match_rate: 0.0000 ± 0.0000
    average_pv_mismatch_rate: 100.0000 ± 0.0000
    timing_matched: 0.0000 ± 0.0000
    timing_mismatched: 20.0000 ± 0.0000
    timing_match_rate: 0.0000 ± 0.0000
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 9.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 0.0000 ± 0.0000
    full_mismatched: 20.0000 ± 0.0000
    full_match_rate: 0.0000 ± 0.0000
    accuracy: 0.0000 ± 0.0000
    average_timing_score: 0.0000 ± 0.0000
    average_temp_score: 0.5500 ± 0.0000
    average_full_score: 0.0000 ± 0.0000
    average_best_codebleu:
      average_CodeBLEU: 0.2500 ± 0.0000
      comb_7_CodeBLEU: 0.4000 ± 0.0000
      dataflow_match_score: 0.0000 ± 0.0000
      ngram_match_score: 0.0000 ± 0.0000
      syntax_match_score: 0.0000 ± 0.0000
      weighted_ngram_match_score: 0.0000 ± 0.0000
    average_best_levenshtein: 256.0000 ± 0.0000
    average_best_normalized_levenshtein: 1.0000 ± 0.0000
    average_inference_time: 0.6089 ± 0.0012
==================================================
================================================================================

devstral-med-or
-------
Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 89.2157 ± 1.1232
average_execution_time: 2.4140 ± 1.2784
correct_matches: 121.3333 ± 1.5275
metrics_by_complexity:
  complex:
    accuracy: 43.3333 ± 5.7735
    average_best_codebleu:
      average_CodeBLEU: 0.4422 ± 0.0039
      comb_7_CodeBLEU: 0.5686 ± 0.0037
      dataflow_match_score: 0.5430 ± 0.0143
      ngram_match_score: 0.2227 ± 0.0118
      syntax_match_score: 0.6627 ± 0.0069
      weighted_ngram_match_score: 0.2405 ± 0.0122
    average_best_levenshtein: 127.7333 ± 2.7750
    average_best_normalized_levenshtein: 0.4095 ± 0.0139
    average_full_score: 0.8065 ± 0.0064
    average_inference_time: 5.0377 ± 4.2963
    average_pv_match_rate: 79.5434 ± 0.9234
    average_pv_mismatch_rate: 20.4566 ± 0.9234
    average_temp_score: 0.9834 ± 0.0156
    average_timing_score: 0.8067 ± 0.0151
    count: 20.0000 ± 0.0000
    exact_code_match_rate: 5.0000 ± 0.0000
    exact_code_matches: 1.0000 ± 0.0000
    full_match_rate: 43.3333 ± 5.7735
    full_matched: 8.6667 ± 1.1547
    full_matches: 8.6667 ± 1.1547
    full_mismatched: 11.3333 ± 1.1547
    pv_exact_match_rate: 55.0000 ± 5.0000
    pv_exact_matched: 11.0000 ± 1.0000
    pv_exact_mismatched: 9.0000 ± 1.0000
    temp_match_rate: 92.5926 ± 6.4150
    temp_matched: 8.3333 ± 0.5774
    temp_mismatched: 0.6667 ± 0.5774
    timing_match_rate: 70.0000 ± 5.0000
    timing_matched: 14.0000 ± 1.0000
    timing_mismatched: 6.0000 ± 1.0000
  simple:
    accuracy: 97.1264 ± 0.4977
    average_best_codebleu:
      average_CodeBLEU: 0.5991 ± 0.0020
      comb_7_CodeBLEU: 0.8349 ± 0.0023
      dataflow_match_score: 0.2443 ± 0.0050
      ngram_match_score: 0.2552 ± 0.0016
      syntax_match_score: 0.9843 ± 0.0051
      weighted_ngram_match_score: 0.1568 ± 0.0013
    average_best_levenshtein: 1.1695 ± 0.1650
    average_best_normalized_levenshtein: 0.0286 ± 0.0047
    average_full_score: 0.9889 ± 0.0040
    average_inference_time: 1.9617 ± 1.3865
    average_pv_match_rate: 98.8506 ± 0.4310
    average_pv_mismatch_rate: 1.1494 ± 0.4310
    average_temp_score: 1.0000 ± 0.0000
    average_timing_score: 0.9905 ± 0.0034
    count: 116.0000 ± 0.0000
    exact_code_match_rate: 91.9540 ± 0.4977
    exact_code_matches: 106.6667 ± 0.5774
    full_match_rate: 97.1264 ± 0.4977
    full_matched: 112.6667 ± 0.5774
    full_matches: 112.6667 ± 0.5774
    full_mismatched: 3.3333 ± 0.5774
    pv_exact_match_rate: 97.7011 ± 0.4977
    pv_exact_matched: 113.3333 ± 0.5774
    pv_exact_mismatched: 2.6667 ± 0.5774
    temp_match_rate: 0.0000 ± 0.0000
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    timing_match_rate: 98.2759 ± 0.0000
    timing_matched: 114.0000 ± 0.0000
    timing_mismatched: 2.0000 ± 0.0000
total_entries: 136.0000 ± 0.0000
==================================================
================================================================================

