llama3.3
-------
Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 84.9206 ± 0.0000
total_entries: 126.0000 ± 0.0000
correct_matches: 107.0000 ± 0.0000
average_execution_time: 0.8937 ± 0.1976
metrics_by_complexity:
  simple:
    count: 120.0000 ± 0.0000
    full_matches: 104.0000 ± 0.0000
    exact_code_matches: 96.0000 ± 0.0000
    exact_code_match_rate: 80.0000 ± 0.0000
    pv_exact_matched: 104.0000 ± 0.0000
    pv_exact_mismatched: 16.0000 ± 0.0000
    pv_exact_match_rate: 86.6667 ± 0.0000
    average_pv_match_rate: 93.3333 ± 0.0000
    average_pv_mismatch_rate: 6.6667 ± 0.0000
    timing_matched: 120.0000 ± 0.0000
    timing_mismatched: 0.0000 ± 0.0000
    timing_match_rate: 100.0000 ± 0.0000
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 104.0000 ± 0.0000
    full_mismatched: 16.0000 ± 0.0000
    full_match_rate: 86.6667 ± 0.0000
    accuracy: 86.6667 ± 0.0000
    average_timing_score: 0.9996 ± 0.0004
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.9466 ± 0.0001
    average_best_codebleu:
      average_CodeBLEU: 0.5896 ± 0.0000
      comb_7_CodeBLEU: 0.8276 ± 0.0000
      dataflow_match_score: 0.1667 ± 0.0000
      ngram_match_score: 0.2352 ± 0.0000
      syntax_match_score: 0.9725 ± 0.0000
      weighted_ngram_match_score: 0.1509 ± 0.0000
    average_best_levenshtein: 4.7250 ± 0.0000
    average_best_normalized_levenshtein: 0.0698 ± 0.0000
    average_inference_time: 0.7724 ± 0.1129
  complex:
    count: 6.0000 ± 0.0000
    full_matches: 3.0000 ± 0.0000
    exact_code_matches: 0.0000 ± 0.0000
    exact_code_match_rate: 0.0000 ± 0.0000
    pv_exact_matched: 3.0000 ± 0.0000
    pv_exact_mismatched: 3.0000 ± 0.0000
    pv_exact_match_rate: 50.0000 ± 0.0000
    average_pv_match_rate: 70.6782 ± 0.0000
    average_pv_mismatch_rate: 29.3218 ± 0.0000
    timing_matched: 5.0000 ± 0.0000
    timing_mismatched: 1.0000 ± 0.0000
    timing_match_rate: 83.3333 ± 0.0000
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 3.0000 ± 0.0000
    full_mismatched: 3.0000 ± 0.0000
    full_match_rate: 50.0000 ± 0.0000
    accuracy: 50.0000 ± 0.0000
    average_timing_score: 0.8457 ± 0.0000
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.7346 ± 0.0000
    average_best_codebleu:
      average_CodeBLEU: 0.5849 ± 0.0000
      comb_7_CodeBLEU: 0.6973 ± 0.0000
      dataflow_match_score: 0.7655 ± 0.0000
      ngram_match_score: 0.3798 ± 0.0000
      syntax_match_score: 0.7790 ± 0.0000
      weighted_ngram_match_score: 0.4154 ± 0.0000
    average_best_levenshtein: 44.1667 ± 0.0000
    average_best_normalized_levenshtein: 0.2430 ± 0.0000
    average_inference_time: 3.3205 ± 1.9168
==================================================
================================================================================

athene-v2
-------
Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 84.1270 ± 0.0000
total_entries: 126.0000 ± 0.0000
correct_matches: 106.0000 ± 0.0000
average_execution_time: 0.7504 ± 0.1202
metrics_by_complexity:
  simple:
    count: 120.0000 ± 0.0000
    full_matches: 105.0000 ± 0.0000
    exact_code_matches: 100.0000 ± 0.0000
    exact_code_match_rate: 83.3333 ± 0.0000
    pv_exact_matched: 105.0000 ± 0.0000
    pv_exact_mismatched: 15.0000 ± 0.0000
    pv_exact_match_rate: 87.5000 ± 0.0000
    average_pv_match_rate: 93.7500 ± 0.0000
    average_pv_mismatch_rate: 6.2500 ± 0.0000
    timing_matched: 120.0000 ± 0.0000
    timing_mismatched: 0.0000 ± 0.0000
    timing_match_rate: 100.0000 ± 0.0000
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 105.0000 ± 0.0000
    full_mismatched: 15.0000 ± 0.0000
    full_match_rate: 87.5000 ± 0.0000
    accuracy: 87.5000 ± 0.0000
    average_timing_score: 1.0000 ± 0.0000
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.9500 ± 0.0000
    average_best_codebleu:
      average_CodeBLEU: 0.5753 ± 0.0000
      comb_7_CodeBLEU: 0.8228 ± 0.0000
      dataflow_match_score: 0.1000 ± 0.0000
      ngram_match_score: 0.2111 ± 0.0000
      syntax_match_score: 0.9756 ± 0.0000
      weighted_ngram_match_score: 0.1145 ± 0.0000
    average_best_levenshtein: 2.8083 ± 0.0000
    average_best_normalized_levenshtein: 0.0562 ± 0.0000
    average_inference_time: 0.6438 ± 0.0801
  complex:
    count: 6.0000 ± 0.0000
    full_matches: 1.0000 ± 0.0000
    exact_code_matches: 0.0000 ± 0.0000
    exact_code_match_rate: 0.0000 ± 0.0000
    pv_exact_matched: 2.0000 ± 0.0000
    pv_exact_mismatched: 4.0000 ± 0.0000
    pv_exact_match_rate: 33.3333 ± 0.0000
    average_pv_match_rate: 83.8359 ± 0.0000
    average_pv_mismatch_rate: 16.1641 ± 0.0000
    timing_matched: 4.0000 ± 0.0000
    timing_mismatched: 2.0000 ± 0.0000
    timing_match_rate: 66.6667 ± 0.0000
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 1.0000 ± 0.0000
    full_mismatched: 5.0000 ± 0.0000
    full_match_rate: 16.6667 ± 0.0000
    accuracy: 16.6667 ± 0.0000
    average_timing_score: 0.8024 ± 0.0000
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.8312 ± 0.0000
    average_best_codebleu:
      average_CodeBLEU: 0.6373 ± 0.0000
      comb_7_CodeBLEU: 0.7091 ± 0.0000
      dataflow_match_score: 0.7391 ± 0.0000
      ngram_match_score: 0.4904 ± 0.0000
      syntax_match_score: 0.7749 ± 0.0000
      weighted_ngram_match_score: 0.5447 ± 0.0000
    average_best_levenshtein: 33.5000 ± 0.0000
    average_best_normalized_levenshtein: 0.1964 ± 0.0000
    average_inference_time: 2.8823 ± 0.9238
==================================================
================================================================================

phi3.5
-------
Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 0.7937 ± 1.3746
total_entries: 126.0000 ± 0.0000
correct_matches: 1.0000 ± 1.7321
average_execution_time: 0.7662 ± 0.3395
metrics_by_complexity:
  simple:
    count: 120.0000 ± 0.0000
    full_matches: 1.0000 ± 1.7321
    exact_code_matches: 1.0000 ± 1.7321
    exact_code_match_rate: 0.8333 ± 1.4434
    pv_exact_matched: 1.0000 ± 1.7321
    pv_exact_mismatched: 119.0000 ± 1.7321
    pv_exact_match_rate: 0.8333 ± 1.4434
    average_pv_match_rate: 12.3472 ± 2.2132
    average_pv_mismatch_rate: 87.6528 ± 2.2132
    timing_matched: 31.3333 ± 8.0829
    timing_mismatched: 88.6667 ± 8.0829
    timing_match_rate: 26.1111 ± 6.7358
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 1.0000 ± 1.7321
    full_mismatched: 119.0000 ± 1.7321
    full_match_rate: 0.8333 ± 1.4434
    accuracy: 0.8333 ± 1.4434
    average_timing_score: 0.2611 ± 0.0674
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.1510 ± 0.0312
    average_best_codebleu:
      average_CodeBLEU: 0.2570 ± 0.0121
      comb_7_CodeBLEU: 0.4100 ± 0.0173
      dataflow_match_score: 0.0250 ± 0.0433
      ngram_match_score: 0.0023 ± 0.0039
      syntax_match_score: 0.0239 ± 0.0414
      weighted_ngram_match_score: 0.0018 ± 0.0032
    average_best_levenshtein: 150.8861 ± 225.6910
    average_best_normalized_levenshtein: 0.9856 ± 0.0249
    average_inference_time: 0.7648 ± 0.3373
  complex:
    count: 6.0000 ± 0.0000
    full_matches: 0.0000 ± 0.0000
    exact_code_matches: 0.0000 ± 0.0000
    exact_code_match_rate: 0.0000 ± 0.0000
    pv_exact_matched: 0.0000 ± 0.0000
    pv_exact_mismatched: 6.0000 ± 0.0000
    pv_exact_match_rate: 0.0000 ± 0.0000
    average_pv_match_rate: 0.0000 ± 0.0000
    average_pv_mismatch_rate: 100.0000 ± 0.0000
    timing_matched: 0.0000 ± 0.0000
    timing_mismatched: 6.0000 ± 0.0000
    timing_match_rate: 0.0000 ± 0.0000
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 0.0000 ± 0.0000
    full_mismatched: 6.0000 ± 0.0000
    full_match_rate: 0.0000 ± 0.0000
    accuracy: 0.0000 ± 0.0000
    average_timing_score: 0.0000 ± 0.0000
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.0000 ± 0.0000
    average_best_codebleu:
      average_CodeBLEU: 0.2322 ± 0.0309
      comb_7_CodeBLEU: 0.3703 ± 0.0515
      dataflow_match_score: 0.0591 ± 0.1023
      ngram_match_score: 0.0006 ± 0.0010
      syntax_match_score: 0.0323 ± 0.0559
      weighted_ngram_match_score: 0.0033 ± 0.0058
    average_best_levenshtein: 181.5000 ± 107.0985
    average_best_normalized_levenshtein: 0.9550 ± 0.0780
    average_inference_time: 0.7928 ± 0.3837
==================================================
================================================================================

qwen2
-------
Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 81.7460 ± 0.0000
total_entries: 126.0000 ± 0.0000
correct_matches: 103.0000 ± 0.0000
average_execution_time: 1.0300 ± 0.0854
metrics_by_complexity:
  simple:
    count: 120.0000 ± 0.0000
    full_matches: 101.0000 ± 0.0000
    exact_code_matches: 90.0000 ± 0.0000
    exact_code_match_rate: 75.0000 ± 0.0000
    pv_exact_matched: 101.0000 ± 0.0000
    pv_exact_mismatched: 19.0000 ± 0.0000
    pv_exact_match_rate: 84.1667 ± 0.0000
    average_pv_match_rate: 90.9722 ± 0.0000
    average_pv_mismatch_rate: 9.0278 ± 0.0000
    timing_matched: 119.0000 ± 0.0000
    timing_mismatched: 1.0000 ± 0.0000
    timing_match_rate: 99.1667 ± 0.0000
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 101.0000 ± 0.0000
    full_mismatched: 19.0000 ± 0.0000
    full_match_rate: 84.1667 ± 0.0000
    accuracy: 84.1667 ± 0.0000
    average_timing_score: 0.9916 ± 0.0001
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.9261 ± 0.0000
    average_best_codebleu:
      average_CodeBLEU: 0.5671 ± 0.0000
      comb_7_CodeBLEU: 0.8172 ± 0.0000
      dataflow_match_score: 0.1000 ± 0.0000
      ngram_match_score: 0.1941 ± 0.0000
      syntax_match_score: 0.9678 ± 0.0000
      weighted_ngram_match_score: 0.1067 ± 0.0000
    average_best_levenshtein: 3.7583 ± 0.0000
    average_best_normalized_levenshtein: 0.0917 ± 0.0000
    average_inference_time: 1.0390 ± 0.0747
  complex:
    count: 6.0000 ± 0.0000
    full_matches: 2.0000 ± 0.0000
    exact_code_matches: 1.0000 ± 0.0000
    exact_code_match_rate: 16.6667 ± 0.0000
    pv_exact_matched: 2.0000 ± 0.0000
    pv_exact_mismatched: 4.0000 ± 0.0000
    pv_exact_match_rate: 33.3333 ± 0.0000
    average_pv_match_rate: 79.3349 ± 1.9245
    average_pv_mismatch_rate: 20.6651 ± 1.9245
    timing_matched: 6.0000 ± 0.0000
    timing_mismatched: 0.0000 ± 0.0000
    timing_match_rate: 100.0000 ± 0.0000
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 2.0000 ± 0.0000
    full_mismatched: 4.0000 ± 0.0000
    full_match_rate: 33.3333 ± 0.0000
    accuracy: 33.3333 ± 0.0000
    average_timing_score: 0.9100 ± 0.0017
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.8167 ± 0.0150
    average_best_codebleu:
      average_CodeBLEU: 0.5046 ± 0.0599
      comb_7_CodeBLEU: 0.6314 ± 0.0560
      dataflow_match_score: 0.5155 ± 0.0615
      ngram_match_score: 0.2860 ± 0.0673
      syntax_match_score: 0.7497 ± 0.0451
      weighted_ngram_match_score: 0.3007 ± 0.0658
    average_best_levenshtein: 58.3889 ± 12.6055
    average_best_normalized_levenshtein: 0.3982 ± 0.0521
    average_inference_time: 0.8491 ± 0.3308
==================================================
================================================================================

claude-3.5-sonnet
-------
Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 85.4497 ± 0.4582
total_entries: 126.0000 ± 0.0000
correct_matches: 107.6667 ± 0.5774
average_execution_time: 1.1828 ± 0.1194
metrics_by_complexity:
  simple:
    count: 120.0000 ± 0.0000
    full_matches: 105.0000 ± 0.0000
    exact_code_matches: 100.0000 ± 0.0000
    exact_code_match_rate: 83.3333 ± 0.0000
    pv_exact_matched: 105.0000 ± 0.0000
    pv_exact_mismatched: 15.0000 ± 0.0000
    pv_exact_match_rate: 87.5000 ± 0.0000
    average_pv_match_rate: 93.7500 ± 0.0000
    average_pv_mismatch_rate: 6.2500 ± 0.0000
    timing_matched: 120.0000 ± 0.0000
    timing_mismatched: 0.0000 ± 0.0000
    timing_match_rate: 100.0000 ± 0.0000
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 105.0000 ± 0.0000
    full_mismatched: 15.0000 ± 0.0000
    full_match_rate: 87.5000 ± 0.0000
    accuracy: 87.5000 ± 0.0000
    average_timing_score: 1.0000 ± 0.0000
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.9500 ± 0.0000
    average_best_codebleu:
      average_CodeBLEU: 0.6017 ± 0.0017
      comb_7_CodeBLEU: 0.8336 ± 0.0007
      dataflow_match_score: 0.1500 ± 0.0000
      ngram_match_score: 0.2605 ± 0.0033
      syntax_match_score: 0.9764 ± 0.0000
      weighted_ngram_match_score: 0.1699 ± 0.0036
    average_best_levenshtein: 2.8167 ± 0.0000
    average_best_normalized_levenshtein: 0.0555 ± 0.0000
    average_inference_time: 1.1548 ± 0.1239
  complex:
    count: 6.0000 ± 0.0000
    full_matches: 2.6667 ± 0.5774
    exact_code_matches: 1.0000 ± 0.0000
    exact_code_match_rate: 16.6667 ± 0.0000
    pv_exact_matched: 2.6667 ± 0.5774
    pv_exact_mismatched: 3.3333 ± 0.5774
    pv_exact_match_rate: 44.4444 ± 9.6225
    average_pv_match_rate: 85.8663 ± 1.4804
    average_pv_mismatch_rate: 14.1337 ± 1.4804
    timing_matched: 6.0000 ± 0.0000
    timing_mismatched: 0.0000 ± 0.0000
    timing_match_rate: 100.0000 ± 0.0000
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 2.6667 ± 0.5774
    full_mismatched: 3.3333 ± 0.5774
    full_match_rate: 44.4444 ± 9.6225
    accuracy: 44.4444 ± 9.6225
    average_timing_score: 0.9199 ± 0.0000
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.8709 ± 0.0119
    average_best_codebleu:
      average_CodeBLEU: 0.6847 ± 0.0057
      comb_7_CodeBLEU: 0.7560 ± 0.0032
      dataflow_match_score: 0.7655 ± 0.0000
      ngram_match_score: 0.5395 ± 0.0127
      syntax_match_score: 0.8415 ± 0.0030
      weighted_ngram_match_score: 0.5922 ± 0.0071
    average_best_levenshtein: 31.0556 ± 5.0037
    average_best_normalized_levenshtein: 0.1735 ± 0.0188
    average_inference_time: 1.7430 ± 0.0858
==================================================
================================================================================

qwen2.5
-------
Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 80.9524 ± 0.0000
total_entries: 126.0000 ± 0.0000
correct_matches: 102.0000 ± 0.0000
average_execution_time: 1.0526 ± 0.1249
metrics_by_complexity:
  simple:
    count: 120.0000 ± 0.0000
    full_matches: 101.0000 ± 0.0000
    exact_code_matches: 92.0000 ± 0.0000
    exact_code_match_rate: 76.6667 ± 0.0000
    pv_exact_matched: 101.0000 ± 0.0000
    pv_exact_mismatched: 19.0000 ± 0.0000
    pv_exact_match_rate: 84.1667 ± 0.0000
    average_pv_match_rate: 91.3889 ± 0.0000
    average_pv_mismatch_rate: 8.6111 ± 0.0000
    timing_matched: 120.0000 ± 0.0000
    timing_mismatched: 0.0000 ± 0.0000
    timing_match_rate: 100.0000 ± 0.0000
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 101.0000 ± 0.0000
    full_mismatched: 19.0000 ± 0.0000
    full_match_rate: 84.1667 ± 0.0000
    accuracy: 84.1667 ± 0.0000
    average_timing_score: 1.0000 ± 0.0000
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.9311 ± 0.0000
    average_best_codebleu:
      average_CodeBLEU: 0.5707 ± 0.0000
      comb_7_CodeBLEU: 0.8214 ± 0.0000
      dataflow_match_score: 0.1000 ± 0.0000
      ngram_match_score: 0.1984 ± 0.0000
      syntax_match_score: 0.9769 ± 0.0000
      weighted_ngram_match_score: 0.1077 ± 0.0000
    average_best_levenshtein: 3.5167 ± 0.0000
    average_best_normalized_levenshtein: 0.0832 ± 0.0000
    average_inference_time: 1.0357 ± 0.0973
  complex:
    count: 6.0000 ± 0.0000
    full_matches: 1.0000 ± 0.0000
    exact_code_matches: 0.0000 ± 0.0000
    exact_code_match_rate: 0.0000 ± 0.0000
    pv_exact_matched: 1.0000 ± 0.0000
    pv_exact_mismatched: 5.0000 ± 0.0000
    pv_exact_match_rate: 16.6667 ± 0.0000
    average_pv_match_rate: 72.8315 ± 0.0000
    average_pv_mismatch_rate: 27.1685 ± 0.0000
    timing_matched: 6.0000 ± 0.0000
    timing_mismatched: 0.0000 ± 0.0000
    timing_match_rate: 100.0000 ± 0.0000
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 1.0000 ± 0.0000
    full_mismatched: 5.0000 ± 0.0000
    full_match_rate: 16.6667 ± 0.0000
    accuracy: 16.6667 ± 0.0000
    average_timing_score: 0.8943 ± 0.0000
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.7615 ± 0.0000
    average_best_codebleu:
      average_CodeBLEU: 0.5393 ± 0.0001
      comb_7_CodeBLEU: 0.6823 ± 0.0000
      dataflow_match_score: 0.5737 ± 0.0000
      ngram_match_score: 0.2766 ± 0.0001
      syntax_match_score: 0.6482 ± 0.0000
      weighted_ngram_match_score: 0.3255 ± 0.0002
    average_best_levenshtein: 79.2222 ± 3.3679
    average_best_normalized_levenshtein: 0.4733 ± 0.0037
    average_inference_time: 1.3905 ± 0.8072
==================================================
================================================================================

athene-v2-agent
-------
Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 85.7143 ± 0.0000
total_entries: 126.0000 ± 0.0000
correct_matches: 108.0000 ± 0.0000
average_execution_time: 0.7870 ± 0.1727
metrics_by_complexity:
  simple:
    count: 120.0000 ± 0.0000
    full_matches: 105.0000 ± 0.0000
    exact_code_matches: 100.0000 ± 0.0000
    exact_code_match_rate: 83.3333 ± 0.0000
    pv_exact_matched: 105.0000 ± 0.0000
    pv_exact_mismatched: 15.0000 ± 0.0000
    pv_exact_match_rate: 87.5000 ± 0.0000
    average_pv_match_rate: 93.7500 ± 0.0000
    average_pv_mismatch_rate: 6.2500 ± 0.0000
    timing_matched: 120.0000 ± 0.0000
    timing_mismatched: 0.0000 ± 0.0000
    timing_match_rate: 100.0000 ± 0.0000
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 105.0000 ± 0.0000
    full_mismatched: 15.0000 ± 0.0000
    full_match_rate: 87.5000 ± 0.0000
    accuracy: 87.5000 ± 0.0000
    average_timing_score: 1.0000 ± 0.0000
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.9500 ± 0.0000
    average_best_codebleu:
      average_CodeBLEU: 0.5753 ± 0.0000
      comb_7_CodeBLEU: 0.8228 ± 0.0000
      dataflow_match_score: 0.1000 ± 0.0000
      ngram_match_score: 0.2111 ± 0.0000
      syntax_match_score: 0.9756 ± 0.0000
      weighted_ngram_match_score: 0.1145 ± 0.0000
    average_best_levenshtein: 2.8083 ± 0.0000
    average_best_normalized_levenshtein: 0.0562 ± 0.0000
    average_inference_time: 0.6871 ± 0.1328
  complex:
    count: 6.0000 ± 0.0000
    full_matches: 3.0000 ± 0.0000
    exact_code_matches: 0.0000 ± 0.0000
    exact_code_match_rate: 0.0000 ± 0.0000
    pv_exact_matched: 3.0000 ± 0.0000
    pv_exact_mismatched: 3.0000 ± 0.0000
    pv_exact_match_rate: 50.0000 ± 0.0000
    average_pv_match_rate: 93.0315 ± 0.0000
    average_pv_mismatch_rate: 6.9685 ± 0.0000
    timing_matched: 5.0000 ± 0.0000
    timing_mismatched: 1.0000 ± 0.0000
    timing_match_rate: 83.3333 ± 0.0000
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 3.0000 ± 0.0000
    full_mismatched: 3.0000 ± 0.0000
    full_match_rate: 50.0000 ± 0.0000
    accuracy: 50.0000 ± 0.0000
    average_timing_score: 0.8479 ± 0.0000
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.9138 ± 0.0000
    average_best_codebleu:
      average_CodeBLEU: 0.6196 ± 0.0000
      comb_7_CodeBLEU: 0.7337 ± 0.0000
      dataflow_match_score: 0.6750 ± 0.0000
      ngram_match_score: 0.4207 ± 0.0000
      syntax_match_score: 0.7777 ± 0.0000
      weighted_ngram_match_score: 0.4384 ± 0.0000
    average_best_levenshtein: 40.0000 ± 0.0000
    average_best_normalized_levenshtein: 0.2631 ± 0.0000
    average_inference_time: 2.7852 ± 0.9797
==================================================
================================================================================

mistral-nemo
-------
Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 80.1587 ± 0.0000
total_entries: 126.0000 ± 0.0000
correct_matches: 101.0000 ± 0.0000
average_execution_time: 1.1329 ± 0.0857
metrics_by_complexity:
  simple:
    count: 120.0000 ± 0.0000
    full_matches: 97.0000 ± 0.0000
    exact_code_matches: 78.0000 ± 0.0000
    exact_code_match_rate: 65.0000 ± 0.0000
    pv_exact_matched: 100.0000 ± 0.0000
    pv_exact_mismatched: 20.0000 ± 0.0000
    pv_exact_match_rate: 83.3333 ± 0.0000
    average_pv_match_rate: 90.9722 ± 0.0000
    average_pv_mismatch_rate: 9.0278 ± 0.0000
    timing_matched: 117.0000 ± 0.0000
    timing_mismatched: 3.0000 ± 0.0000
    timing_match_rate: 97.5000 ± 0.0000
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 97.0000 ± 0.0000
    full_mismatched: 23.0000 ± 0.0000
    full_match_rate: 80.8333 ± 0.0000
    accuracy: 80.8333 ± 0.0000
    average_timing_score: 0.9921 ± 0.0000
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.9262 ± 0.0000
    average_best_codebleu:
      average_CodeBLEU: 0.5444 ± 0.0000
      comb_7_CodeBLEU: 0.7873 ± 0.0000
      dataflow_match_score: 0.1000 ± 0.0000
      ngram_match_score: 0.1778 ± 0.0000
      syntax_match_score: 0.8985 ± 0.0000
      weighted_ngram_match_score: 0.1011 ± 0.0000
    average_best_levenshtein: 4.0417 ± 0.0000
    average_best_normalized_levenshtein: 0.1057 ± 0.0000
    average_inference_time: 1.1485 ± 0.0765
  complex:
    count: 6.0000 ± 0.0000
    full_matches: 4.0000 ± 0.0000
    exact_code_matches: 1.0000 ± 0.0000
    exact_code_match_rate: 16.6667 ± 0.0000
    pv_exact_matched: 4.0000 ± 0.0000
    pv_exact_mismatched: 2.0000 ± 0.0000
    pv_exact_match_rate: 66.6667 ± 0.0000
    average_pv_match_rate: 92.2547 ± 0.0000
    average_pv_mismatch_rate: 7.7453 ± 0.0000
    timing_matched: 6.0000 ± 0.0000
    timing_mismatched: 0.0000 ± 0.0000
    timing_match_rate: 100.0000 ± 0.0000
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 4.0000 ± 0.0000
    full_mismatched: 2.0000 ± 0.0000
    full_match_rate: 66.6667 ± 0.0000
    accuracy: 66.6667 ± 0.0000
    average_timing_score: 0.8983 ± 0.0006
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.9177 ± 0.0001
    average_best_codebleu:
      average_CodeBLEU: 0.7115 ± 0.0003
      comb_7_CodeBLEU: 0.8097 ± 0.0001
      dataflow_match_score: 0.8980 ± 0.0000
      ngram_match_score: 0.5024 ± 0.0006
      syntax_match_score: 0.8523 ± 0.0000
      weighted_ngram_match_score: 0.5933 ± 0.0005
    average_best_levenshtein: 22.5000 ± 0.2887
    average_best_normalized_levenshtein: 0.1482 ± 0.0014
    average_inference_time: 0.8216 ± 0.3645
==================================================
================================================================================

gpt-4o
-------
Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 84.9206 ± 0.0000
total_entries: 126.0000 ± 0.0000
correct_matches: 107.0000 ± 0.0000
average_execution_time: 0.7194 ± 0.0578
metrics_by_complexity:
  simple:
    count: 120.0000 ± 0.0000
    full_matches: 104.0000 ± 0.0000
    exact_code_matches: 99.0000 ± 0.0000
    exact_code_match_rate: 82.5000 ± 0.0000
    pv_exact_matched: 104.0000 ± 0.0000
    pv_exact_mismatched: 16.0000 ± 0.0000
    pv_exact_match_rate: 86.6667 ± 0.0000
    average_pv_match_rate: 93.3333 ± 0.0000
    average_pv_mismatch_rate: 6.6667 ± 0.0000
    timing_matched: 120.0000 ± 0.0000
    timing_mismatched: 0.0000 ± 0.0000
    timing_match_rate: 100.0000 ± 0.0000
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 104.0000 ± 0.0000
    full_mismatched: 16.0000 ± 0.0000
    full_match_rate: 86.6667 ± 0.0000
    accuracy: 86.6667 ± 0.0000
    average_timing_score: 1.0000 ± 0.0000
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.9467 ± 0.0000
    average_best_codebleu:
      average_CodeBLEU: 0.6132 ± 0.0044
      comb_7_CodeBLEU: 0.8375 ± 0.0024
      dataflow_match_score: 0.1972 ± 0.0096
      ngram_match_score: 0.2813 ± 0.0076
      syntax_match_score: 0.9739 ± 0.0029
      weighted_ngram_match_score: 0.1977 ± 0.0082
    average_best_levenshtein: 3.0722 ± 0.0096
    average_best_normalized_levenshtein: 0.0587 ± 0.0003
    average_inference_time: 0.6931 ± 0.0630
  complex:
    count: 6.0000 ± 0.0000
    full_matches: 3.0000 ± 0.0000
    exact_code_matches: 1.0000 ± 0.0000
    exact_code_match_rate: 16.6667 ± 0.0000
    pv_exact_matched: 3.0000 ± 0.0000
    pv_exact_mismatched: 3.0000 ± 0.0000
    pv_exact_match_rate: 50.0000 ± 0.0000
    average_pv_match_rate: 86.7210 ± 0.0000
    average_pv_mismatch_rate: 13.2790 ± 0.0000
    timing_matched: 6.0000 ± 0.0000
    timing_mismatched: 0.0000 ± 0.0000
    timing_match_rate: 100.0000 ± 0.0000
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 3.0000 ± 0.0000
    full_mismatched: 3.0000 ± 0.0000
    full_match_rate: 50.0000 ± 0.0000
    accuracy: 50.0000 ± 0.0000
    average_timing_score: 0.9195 ± 0.0005
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.8777 ± 0.0001
    average_best_codebleu:
      average_CodeBLEU: 0.6313 ± 0.0179
      comb_7_CodeBLEU: 0.7118 ± 0.0108
      dataflow_match_score: 0.7457 ± 0.0038
      ngram_match_score: 0.4857 ± 0.0259
      syntax_match_score: 0.7851 ± 0.0096
      weighted_ngram_match_score: 0.5088 ± 0.0353
    average_best_levenshtein: 37.7222 ± 2.1752
    average_best_normalized_levenshtein: 0.1887 ± 0.0125
    average_inference_time: 1.2446 ± 0.0466
==================================================
================================================================================

qwen2.5-coder
-------
Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 85.7143 ± 0.0000
total_entries: 126.0000 ± 0.0000
correct_matches: 108.0000 ± 0.0000
average_execution_time: 1.0492 ± 0.1467
metrics_by_complexity:
  simple:
    count: 120.0000 ± 0.0000
    full_matches: 105.0000 ± 0.0000
    exact_code_matches: 100.0000 ± 0.0000
    exact_code_match_rate: 83.3333 ± 0.0000
    pv_exact_matched: 105.0000 ± 0.0000
    pv_exact_mismatched: 15.0000 ± 0.0000
    pv_exact_match_rate: 87.5000 ± 0.0000
    average_pv_match_rate: 93.3333 ± 0.0000
    average_pv_mismatch_rate: 6.6667 ± 0.0000
    timing_matched: 119.0000 ± 0.0000
    timing_mismatched: 1.0000 ± 0.0000
    timing_match_rate: 99.1667 ± 0.0000
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 105.0000 ± 0.0000
    full_mismatched: 15.0000 ± 0.0000
    full_match_rate: 87.5000 ± 0.0000
    accuracy: 87.5000 ± 0.0000
    average_timing_score: 0.9917 ± 0.0000
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.9450 ± 0.0000
    average_best_codebleu:
      average_CodeBLEU: 0.5792 ± 0.0000
      comb_7_CodeBLEU: 0.8230 ± 0.0000
      dataflow_match_score: 0.1333 ± 0.0000
      ngram_match_score: 0.2190 ± 0.0000
      syntax_match_score: 0.9711 ± 0.0000
      weighted_ngram_match_score: 0.1269 ± 0.0000
    average_best_levenshtein: 2.9833 ± 0.0000
    average_best_normalized_levenshtein: 0.0582 ± 0.0000
    average_inference_time: 0.9748 ± 0.0911
  complex:
    count: 6.0000 ± 0.0000
    full_matches: 3.0000 ± 0.0000
    exact_code_matches: 1.0000 ± 0.0000
    exact_code_match_rate: 16.6667 ± 0.0000
    pv_exact_matched: 3.0000 ± 0.0000
    pv_exact_mismatched: 3.0000 ± 0.0000
    pv_exact_match_rate: 50.0000 ± 0.0000
    average_pv_match_rate: 86.7210 ± 0.0000
    average_pv_mismatch_rate: 13.2790 ± 0.0000
    timing_matched: 6.0000 ± 0.0000
    timing_mismatched: 0.0000 ± 0.0000
    timing_match_rate: 100.0000 ± 0.0000
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 3.0000 ± 0.0000
    full_mismatched: 3.0000 ± 0.0000
    full_match_rate: 50.0000 ± 0.0000
    accuracy: 50.0000 ± 0.0000
    average_timing_score: 0.9206 ± 0.0000
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.8779 ± 0.0000
    average_best_codebleu:
      average_CodeBLEU: 0.6642 ± 0.0000
      comb_7_CodeBLEU: 0.7187 ± 0.0000
      dataflow_match_score: 0.7220 ± 0.0000
      ngram_match_score: 0.5000 ± 0.0000
      syntax_match_score: 0.7879 ± 0.0000
      weighted_ngram_match_score: 0.6470 ± 0.0000
    average_best_levenshtein: 40.0000 ± 0.0000
    average_best_normalized_levenshtein: 0.2198 ± 0.0000
    average_inference_time: 2.5360 ± 1.4959
==================================================
================================================================================

mistral
-------
Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 51.5873 ± 0.7937
total_entries: 126.0000 ± 0.0000
correct_matches: 65.0000 ± 1.0000
average_execution_time: 0.8327 ± 0.0810
metrics_by_complexity:
  simple:
    count: 120.0000 ± 0.0000
    full_matches: 65.0000 ± 1.0000
    exact_code_matches: 48.0000 ± 0.0000
    exact_code_match_rate: 40.0000 ± 0.0000
    pv_exact_matched: 70.0000 ± 0.0000
    pv_exact_mismatched: 50.0000 ± 0.0000
    pv_exact_match_rate: 58.3333 ± 0.0000
    average_pv_match_rate: 65.4167 ± 0.0000
    average_pv_mismatch_rate: 34.5833 ± 0.0000
    timing_matched: 80.0000 ± 1.0000
    timing_mismatched: 40.0000 ± 1.0000
    timing_match_rate: 66.6667 ± 0.8333
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 65.0000 ± 1.0000
    full_mismatched: 55.0000 ± 1.0000
    full_match_rate: 54.1667 ± 0.8333
    accuracy: 54.1667 ± 0.8333
    average_timing_score: 0.7095 ± 0.0029
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.6652 ± 0.0006
    average_best_codebleu:
      average_CodeBLEU: 0.4867 ± 0.0000
      comb_7_CodeBLEU: 0.7235 ± 0.0000
      dataflow_match_score: 0.1396 ± 0.0000
      ngram_match_score: 0.1105 ± 0.0000
      syntax_match_score: 0.7646 ± 0.0000
      weighted_ngram_match_score: 0.0739 ± 0.0000
    average_best_levenshtein: 25.7556 ± 0.0096
    average_best_normalized_levenshtein: 0.2572 ± 0.0000
    average_inference_time: 0.7627 ± 0.0325
  complex:
    count: 6.0000 ± 0.0000
    full_matches: 0.0000 ± 0.0000
    exact_code_matches: 0.0000 ± 0.0000
    exact_code_match_rate: 0.0000 ± 0.0000
    pv_exact_matched: 0.0000 ± 0.0000
    pv_exact_mismatched: 6.0000 ± 0.0000
    pv_exact_match_rate: 0.0000 ± 0.0000
    average_pv_match_rate: 2.5253 ± 4.3739
    average_pv_mismatch_rate: 97.4747 ± 4.3739
    timing_matched: 0.0000 ± 0.0000
    timing_mismatched: 6.0000 ± 0.0000
    timing_match_rate: 0.0000 ± 0.0000
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 0.0000 ± 0.0000
    full_mismatched: 6.0000 ± 0.0000
    full_match_rate: 0.0000 ± 0.0000
    accuracy: 0.0000 ± 0.0000
    average_timing_score: 0.0316 ± 0.0548
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.0265 ± 0.0459
    average_best_codebleu:
      average_CodeBLEU: 0.4665 ± 0.0011
      comb_7_CodeBLEU: 0.5959 ± 0.0004
      dataflow_match_score: 0.6643 ± 0.0361
      ngram_match_score: 0.0894 ± 0.0093
      syntax_match_score: 0.7002 ± 0.0333
      weighted_ngram_match_score: 0.4121 ± 0.0164
    average_best_levenshtein: 301.0556 ± 32.5241
    average_best_normalized_levenshtein: 0.6934 ± 0.0100
    average_inference_time: 2.2322 ± 1.0517
==================================================
================================================================================

phi3.5-fp16
-------
Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 0.7937 ± 1.3746
total_entries: 126.0000 ± 0.0000
correct_matches: 1.0000 ± 1.7321
average_execution_time: 1.2060 ± 1.2138
metrics_by_complexity:
  simple:
    count: 120.0000 ± 0.0000
    full_matches: 1.0000 ± 1.7321
    exact_code_matches: 1.0000 ± 1.7321
    exact_code_match_rate: 0.8333 ± 1.4434
    pv_exact_matched: 1.0000 ± 1.7321
    pv_exact_mismatched: 119.0000 ± 1.7321
    pv_exact_match_rate: 0.8333 ± 1.4434
    average_pv_match_rate: 12.6528 ± 1.6839
    average_pv_mismatch_rate: 87.3472 ± 1.6839
    timing_matched: 32.6667 ± 5.7735
    timing_mismatched: 87.3333 ± 5.7735
    timing_match_rate: 27.2222 ± 4.8113
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 1.0000 ± 1.7321
    full_mismatched: 119.0000 ± 1.7321
    full_match_rate: 0.8333 ± 1.4434
    accuracy: 0.8333 ± 1.4434
    average_timing_score: 0.2722 ± 0.0481
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.1557 ± 0.0231
    average_best_codebleu:
      average_CodeBLEU: 0.2566 ± 0.0115
      comb_7_CodeBLEU: 0.4094 ± 0.0163
      dataflow_match_score: 0.0222 ± 0.0385
      ngram_match_score: 0.0023 ± 0.0039
      syntax_match_score: 0.0225 ± 0.0389
      weighted_ngram_match_score: 0.0018 ± 0.0032
    average_best_levenshtein: 366.2333 ± 598.6834
    average_best_normalized_levenshtein: 0.9900 ± 0.0174
    average_inference_time: 1.1988 ± 1.2015
  complex:
    count: 6.0000 ± 0.0000
    full_matches: 0.0000 ± 0.0000
    exact_code_matches: 0.0000 ± 0.0000
    exact_code_match_rate: 0.0000 ± 0.0000
    pv_exact_matched: 0.0000 ± 0.0000
    pv_exact_mismatched: 6.0000 ± 0.0000
    pv_exact_match_rate: 0.0000 ± 0.0000
    average_pv_match_rate: 0.0000 ± 0.0000
    average_pv_mismatch_rate: 100.0000 ± 0.0000
    timing_matched: 0.0000 ± 0.0000
    timing_mismatched: 6.0000 ± 0.0000
    timing_match_rate: 0.0000 ± 0.0000
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 0.0000 ± 0.0000
    full_mismatched: 6.0000 ± 0.0000
    full_match_rate: 0.0000 ± 0.0000
    accuracy: 0.0000 ± 0.0000
    average_timing_score: 0.0000 ± 0.0000
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.0000 ± 0.0000
    average_best_codebleu:
      average_CodeBLEU: 0.2456 ± 0.0076
      comb_7_CodeBLEU: 0.3913 ± 0.0150
      dataflow_match_score: 0.2436 ± 0.4219
      ngram_match_score: 0.0005 ± 0.0008
      syntax_match_score: 0.0666 ± 0.1154
      weighted_ngram_match_score: 0.0052 ± 0.0091
    average_best_levenshtein: 380.0556 ± 451.0068
    average_best_normalized_levenshtein: 0.9607 ± 0.0681
    average_inference_time: 1.3498 ± 1.4611
==================================================
================================================================================

