OP Cog Model Comparison Summary - 2025-09-21 07:34:48
================================================================================


mistral
-------
Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 32.3529 ± 0.0000
total_entries: 136.0000 ± 0.0000
correct_matches: 44.0000 ± 0.0000
average_execution_time: 1.3472 ± 0.0359
metrics_by_complexity:
  simple:
    count: 116.0000 ± 0.0000
    full_matches: 44.0000 ± 0.0000
    exact_code_matches: 22.0000 ± 0.0000
    exact_code_match_rate: 18.9655 ± 0.0000
    pv_exact_matched: 45.0000 ± 0.0000
    pv_exact_mismatched: 71.0000 ± 0.0000
    pv_exact_match_rate: 38.7931 ± 0.0000
    average_pv_match_rate: 41.6667 ± 0.0000
    average_pv_mismatch_rate: 58.3333 ± 0.0000
    timing_matched: 44.0000 ± 0.0000
    timing_mismatched: 72.0000 ± 0.0000
    timing_match_rate: 37.9310 ± 0.0000
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 1.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 44.0000 ± 0.0000
    full_mismatched: 72.0000 ± 0.0000
    full_match_rate: 37.9310 ± 0.0000
    accuracy: 37.9310 ± 0.0000
    average_timing_score: 0.4019 ± 0.0021
    average_temp_score: 0.9914 ± 0.0000
    average_full_score: 0.4126 ± 0.0004
    average_best_codebleu:
      average_CodeBLEU: 0.4354 ± 0.0000
      comb_7_CodeBLEU: 0.6607 ± 0.0000
      dataflow_match_score: 0.2134 ± 0.0000
      ngram_match_score: 0.0643 ± 0.0000
      syntax_match_score: 0.6238 ± 0.0000
      weighted_ngram_match_score: 0.0555 ± 0.0000
    average_best_levenshtein: 315.5259 ± 0.0000
    average_best_normalized_levenshtein: 0.4626 ± 0.0000
    average_inference_time: 1.2696 ± 0.0417
  complex:
    count: 20.0000 ± 0.0000
    full_matches: 0.0000 ± 0.0000
    exact_code_matches: 0.0000 ± 0.0000
    exact_code_match_rate: 0.0000 ± 0.0000
    pv_exact_matched: 0.0000 ± 0.0000
    pv_exact_mismatched: 20.0000 ± 0.0000
    pv_exact_match_rate: 0.0000 ± 0.0000
    average_pv_match_rate: 2.5000 ± 0.0000
    average_pv_mismatch_rate: 97.5000 ± 0.0000
    timing_matched: 0.6667 ± 0.5774
    timing_mismatched: 19.3333 ± 0.5774
    timing_match_rate: 3.3333 ± 2.8868
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 9.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 0.0000 ± 0.0000
    full_mismatched: 20.0000 ± 0.0000
    full_match_rate: 0.0000 ± 0.0000
    accuracy: 0.0000 ± 0.0000
    average_timing_score: 0.0414 ± 0.0080
    average_temp_score: 0.5500 ± 0.0000
    average_full_score: 0.0283 ± 0.0016
    average_best_codebleu:
      average_CodeBLEU: 0.3330 ± 0.0000
      comb_7_CodeBLEU: 0.4820 ± 0.0000
      dataflow_match_score: 0.4716 ± 0.0000
      ngram_match_score: 0.0465 ± 0.0000
      syntax_match_score: 0.4912 ± 0.0000
      weighted_ngram_match_score: 0.1228 ± 0.0000
    average_best_levenshtein: 415.2000 ± 0.0000
    average_best_normalized_levenshtein: 0.6682 ± 0.0000
    average_inference_time: 1.7974 ± 0.0025
==================================================


================================================================================

qwen2.5-coder
-------------
Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 88.2353 ± 0.0000
total_entries: 136.0000 ± 0.0000
correct_matches: 120.0000 ± 0.0000
average_execution_time: 1.4275 ± 0.0946
metrics_by_complexity:
  simple:
    count: 116.0000 ± 0.0000
    full_matches: 111.0000 ± 0.0000
    exact_code_matches: 104.0000 ± 0.0000
    exact_code_match_rate: 89.6552 ± 0.0000
    pv_exact_matched: 113.0000 ± 0.0000
    pv_exact_mismatched: 3.0000 ± 0.0000
    pv_exact_match_rate: 97.4138 ± 0.0000
    average_pv_match_rate: 98.6782 ± 0.0000
    average_pv_mismatch_rate: 1.3218 ± 0.0000
    timing_matched: 112.0000 ± 0.0000
    timing_mismatched: 4.0000 ± 0.0000
    timing_match_rate: 96.5517 ± 0.0000
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 111.0000 ± 0.0000
    full_mismatched: 5.0000 ± 0.0000
    full_match_rate: 95.6897 ± 0.0000
    accuracy: 95.6897 ± 0.0000
    average_timing_score: 0.9798 ± 0.0000
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.9854 ± 0.0000
    average_best_codebleu:
      average_CodeBLEU: 0.5963 ± 0.0000
      comb_7_CodeBLEU: 0.8347 ± 0.0000
      dataflow_match_score: 0.2328 ± 0.0000
      ngram_match_score: 0.2445 ± 0.0000
      syntax_match_score: 0.9871 ± 0.0000
      weighted_ngram_match_score: 0.1538 ± 0.0000
    average_best_levenshtein: 1.6810 ± 0.0000
    average_best_normalized_levenshtein: 0.0276 ± 0.0000
    average_inference_time: 1.0049 ± 0.0420
  complex:
    count: 20.0000 ± 0.0000
    full_matches: 9.0000 ± 0.0000
    exact_code_matches: 0.0000 ± 0.0000
    exact_code_match_rate: 0.0000 ± 0.0000
    pv_exact_matched: 10.0000 ± 0.0000
    pv_exact_mismatched: 10.0000 ± 0.0000
    pv_exact_match_rate: 50.0000 ± 0.0000
    average_pv_match_rate: 70.3804 ± 0.0000
    average_pv_mismatch_rate: 29.6196 ± 0.0000
    timing_matched: 12.0000 ± 0.0000
    timing_mismatched: 8.0000 ± 0.0000
    timing_match_rate: 60.0000 ± 0.0000
    temp_matched: 4.0000 ± 0.0000
    temp_mismatched: 5.0000 ± 0.0000
    temp_match_rate: 44.4444 ± 0.0000
    full_matched: 9.0000 ± 0.0000
    full_mismatched: 11.0000 ± 0.0000
    full_match_rate: 45.0000 ± 0.0000
    accuracy: 45.0000 ± 0.0000
    average_timing_score: 0.7868 ± 0.0007
    average_temp_score: 0.8063 ± 0.0000
    average_full_score: 0.7195 ± 0.0001
    average_best_codebleu:
      average_CodeBLEU: 0.3836 ± 0.0000
      comb_7_CodeBLEU: 0.5131 ± 0.0000
      dataflow_match_score: 0.4824 ± 0.0000
      ngram_match_score: 0.1483 ± 0.0000
      syntax_match_score: 0.6166 ± 0.0000
      weighted_ngram_match_score: 0.1871 ± 0.0000
    average_best_levenshtein: 148.7500 ± 0.0000
    average_best_normalized_levenshtein: 0.4467 ± 0.0000
    average_inference_time: 3.8784 ± 0.3997
==================================================


================================================================================

claude-3.5-sonnet
-----------------
Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 92.8922 ± 0.4245
total_entries: 136.0000 ± 0.0000
correct_matches: 126.3333 ± 0.5774
average_execution_time: 1.1067 ± 0.0271
metrics_by_complexity:
  simple:
    count: 116.0000 ± 0.0000
    full_matches: 114.0000 ± 0.0000
    exact_code_matches: 108.0000 ± 0.0000
    exact_code_match_rate: 93.1034 ± 0.0000
    pv_exact_matched: 114.0000 ± 0.0000
    pv_exact_mismatched: 2.0000 ± 0.0000
    pv_exact_match_rate: 98.2759 ± 0.0000
    average_pv_match_rate: 99.5402 ± 0.0000
    average_pv_mismatch_rate: 0.4598 ± 0.0000
    timing_matched: 115.0000 ± 0.0000
    timing_mismatched: 1.0000 ± 0.0000
    timing_match_rate: 99.1379 ± 0.0000
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 114.0000 ± 0.0000
    full_mismatched: 2.0000 ± 0.0000
    full_match_rate: 98.2759 ± 0.0000
    accuracy: 98.2759 ± 0.0000
    average_timing_score: 0.9942 ± 0.0000
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.9952 ± 0.0000
    average_best_codebleu:
      average_CodeBLEU: 0.6321 ± 0.0000
      comb_7_CodeBLEU: 0.8523 ± 0.0000
      dataflow_match_score: 0.2845 ± 0.0000
      ngram_match_score: 0.3114 ± 0.0000
      syntax_match_score: 0.9982 ± 0.0000
      weighted_ngram_match_score: 0.2187 ± 0.0000
    average_best_levenshtein: 0.6379 ± 0.0000
    average_best_normalized_levenshtein: 0.0139 ± 0.0000
    average_inference_time: 0.9404 ± 0.0448
  complex:
    count: 20.0000 ± 0.0000
    full_matches: 12.3333 ± 0.5774
    exact_code_matches: 1.0000 ± 0.0000
    exact_code_match_rate: 5.0000 ± 0.0000
    pv_exact_matched: 13.0000 ± 0.0000
    pv_exact_mismatched: 7.0000 ± 0.0000
    pv_exact_match_rate: 65.0000 ± 0.0000
    average_pv_match_rate: 90.4644 ± 0.5753
    average_pv_mismatch_rate: 9.5356 ± 0.5753
    timing_matched: 15.3333 ± 0.5774
    timing_mismatched: 4.6667 ± 0.5774
    timing_match_rate: 76.6667 ± 2.8868
    temp_matched: 7.0000 ± 0.0000
    temp_mismatched: 1.0000 ± 0.0000
    temp_match_rate: 87.5000 ± 0.0000
    full_matched: 12.3333 ± 0.5774
    full_mismatched: 7.6667 ± 0.5774
    full_match_rate: 61.6667 ± 2.8868
    accuracy: 61.6667 ± 2.8868
    average_timing_score: 0.8449 ± 0.0006
    average_temp_score: 0.9831 ± 0.0033
    average_full_score: 0.8977 ± 0.0047
    average_best_codebleu:
      average_CodeBLEU: 0.4308 ± 0.0075
      comb_7_CodeBLEU: 0.5241 ± 0.0014
      dataflow_match_score: 0.4395 ± 0.0095
      ngram_match_score: 0.2672 ± 0.0186
      syntax_match_score: 0.6331 ± 0.0021
      weighted_ngram_match_score: 0.2833 ± 0.0197
    average_best_levenshtein: 126.4333 ± 0.9359
    average_best_normalized_levenshtein: 0.3937 ± 0.0023
    average_inference_time: 2.0710 ± 0.1081
==================================================


================================================================================

qwen2
-----
Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 81.6176 ± 0.0000
total_entries: 136.0000 ± 0.0000
correct_matches: 111.0000 ± 0.0000
average_execution_time: 1.0859 ± 0.0787
metrics_by_complexity:
  simple:
    count: 116.0000 ± 0.0000
    full_matches: 106.0000 ± 0.0000
    exact_code_matches: 94.0000 ± 0.0000
    exact_code_match_rate: 81.0345 ± 0.0000
    pv_exact_matched: 106.0000 ± 0.0000
    pv_exact_mismatched: 10.0000 ± 0.0000
    pv_exact_match_rate: 91.3793 ± 0.0000
    average_pv_match_rate: 94.5115 ± 0.0000
    average_pv_mismatch_rate: 5.4885 ± 0.0000
    timing_matched: 113.0000 ± 0.0000
    timing_mismatched: 3.0000 ± 0.0000
    timing_match_rate: 97.4138 ± 0.0000
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 106.0000 ± 0.0000
    full_mismatched: 10.0000 ± 0.0000
    full_match_rate: 91.3793 ± 0.0000
    accuracy: 91.3793 ± 0.0000
    average_timing_score: 0.9797 ± 0.0000
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.9520 ± 0.0000
    average_best_codebleu:
      average_CodeBLEU: 0.5897 ± 0.0000
      comb_7_CodeBLEU: 0.8322 ± 0.0000
      dataflow_match_score: 0.2328 ± 0.0000
      ngram_match_score: 0.2289 ± 0.0000
      syntax_match_score: 0.9878 ± 0.0000
      weighted_ngram_match_score: 0.1420 ± 0.0000
    average_best_levenshtein: 1.8448 ± 0.0000
    average_best_normalized_levenshtein: 0.0621 ± 0.0000
    average_inference_time: 1.0622 ± 0.0412
  complex:
    count: 20.0000 ± 0.0000
    full_matches: 5.0000 ± 0.0000
    exact_code_matches: 0.6667 ± 0.5774
    exact_code_match_rate: 3.3333 ± 2.8868
    pv_exact_matched: 7.0000 ± 0.0000
    pv_exact_mismatched: 13.0000 ± 0.0000
    pv_exact_match_rate: 35.0000 ± 0.0000
    average_pv_match_rate: 61.4668 ± 1.2810
    average_pv_mismatch_rate: 38.5332 ± 1.2810
    timing_matched: 12.0000 ± 0.0000
    timing_mismatched: 8.0000 ± 0.0000
    timing_match_rate: 60.0000 ± 0.0000
    temp_matched: 2.0000 ± 0.0000
    temp_mismatched: 7.0000 ± 0.0000
    temp_match_rate: 22.2222 ± 0.0000
    full_matched: 5.0000 ± 0.0000
    full_mismatched: 15.0000 ± 0.0000
    full_match_rate: 25.0000 ± 0.0000
    accuracy: 25.0000 ± 0.0000
    average_timing_score: 0.7864 ± 0.0204
    average_temp_score: 0.7080 ± 0.0088
    average_full_score: 0.6238 ± 0.0089
    average_best_codebleu:
      average_CodeBLEU: 0.3382 ± 0.0074
      comb_7_CodeBLEU: 0.4714 ± 0.0107
      dataflow_match_score: 0.4240 ± 0.0116
      ngram_match_score: 0.1086 ± 0.0031
      syntax_match_score: 0.5632 ± 0.0075
      weighted_ngram_match_score: 0.1235 ± 0.0028
    average_best_levenshtein: 154.8167 ± 2.3240
    average_best_normalized_levenshtein: 0.5069 ± 0.0054
    average_inference_time: 1.2231 ± 0.3067
==================================================


================================================================================

qwen2.5
-------
Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 81.6176 ± 0.0000
total_entries: 136.0000 ± 0.0000
correct_matches: 111.0000 ± 0.0000
average_execution_time: 1.1725 ± 0.1256
metrics_by_complexity:
  simple:
    count: 116.0000 ± 0.0000
    full_matches: 109.0000 ± 0.0000
    exact_code_matches: 99.6667 ± 0.5774
    exact_code_match_rate: 85.9195 ± 0.4977
    pv_exact_matched: 109.0000 ± 0.0000
    pv_exact_mismatched: 7.0000 ± 0.0000
    pv_exact_match_rate: 93.9655 ± 0.0000
    average_pv_match_rate: 95.4023 ± 0.0000
    average_pv_mismatch_rate: 4.5977 ± 0.0000
    timing_matched: 114.0000 ± 0.0000
    timing_mismatched: 2.0000 ± 0.0000
    timing_match_rate: 98.2759 ± 0.0000
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 109.0000 ± 0.0000
    full_mismatched: 7.0000 ± 0.0000
    full_match_rate: 93.9655 ± 0.0000
    accuracy: 93.9655 ± 0.0000
    average_timing_score: 0.9827 ± 0.0000
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.9598 ± 0.0000
    average_best_codebleu:
      average_CodeBLEU: 0.5950 ± 0.0013
      comb_7_CodeBLEU: 0.8347 ± 0.0017
      dataflow_match_score: 0.2155 ± 0.0000
      ngram_match_score: 0.2417 ± 0.0009
      syntax_match_score: 0.9891 ± 0.0040
      weighted_ngram_match_score: 0.1490 ± 0.0002
    average_best_levenshtein: 1.4023 ± 0.0100
    average_best_normalized_levenshtein: 0.0515 ± 0.0009
    average_inference_time: 1.0856 ± 0.0718
  complex:
    count: 20.0000 ± 0.0000
    full_matches: 2.0000 ± 0.0000
    exact_code_matches: 1.0000 ± 0.0000
    exact_code_match_rate: 5.0000 ± 0.0000
    pv_exact_matched: 2.0000 ± 0.0000
    pv_exact_mismatched: 18.0000 ± 0.0000
    pv_exact_match_rate: 10.0000 ± 0.0000
    average_pv_match_rate: 44.9161 ± 0.6900
    average_pv_mismatch_rate: 55.0839 ± 0.6900
    timing_matched: 11.6667 ± 0.5774
    timing_mismatched: 8.3333 ± 0.5774
    timing_match_rate: 58.3333 ± 2.8868
    temp_matched: 4.3333 ± 0.5774
    temp_mismatched: 4.6667 ± 0.5774
    temp_match_rate: 48.1481 ± 6.4150
    full_matched: 2.0000 ± 0.0000
    full_mismatched: 18.0000 ± 0.0000
    full_match_rate: 10.0000 ± 0.0000
    accuracy: 10.0000 ± 0.0000
    average_timing_score: 0.7595 ± 0.0122
    average_temp_score: 0.7737 ± 0.0286
    average_full_score: 0.5211 ± 0.0097
    average_best_codebleu:
      average_CodeBLEU: 0.3631 ± 0.0016
      comb_7_CodeBLEU: 0.4822 ± 0.0027
      dataflow_match_score: 0.4110 ± 0.0054
      ngram_match_score: 0.1603 ± 0.0026
      syntax_match_score: 0.5623 ± 0.0022
      weighted_ngram_match_score: 0.1688 ± 0.0034
    average_best_levenshtein: 158.7500 ± 6.5962
    average_best_normalized_levenshtein: 0.5061 ± 0.0043
    average_inference_time: 1.6764 ± 0.4609
==================================================


================================================================================

mistral-nemo
------------
Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 82.5980 ± 0.8490
total_entries: 136.0000 ± 0.0000
correct_matches: 112.3333 ± 1.1547
average_execution_time: 1.2697 ± 0.0883
metrics_by_complexity:
  simple:
    count: 116.0000 ± 0.0000
    full_matches: 107.0000 ± 0.0000
    exact_code_matches: 89.6667 ± 1.1547
    exact_code_match_rate: 77.2989 ± 0.9954
    pv_exact_matched: 111.0000 ± 0.0000
    pv_exact_mismatched: 5.0000 ± 0.0000
    pv_exact_match_rate: 95.6897 ± 0.0000
    average_pv_match_rate: 97.1264 ± 0.0000
    average_pv_mismatch_rate: 2.8736 ± 0.0000
    timing_matched: 112.0000 ± 0.0000
    timing_mismatched: 4.0000 ± 0.0000
    timing_match_rate: 96.5517 ± 0.0000
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 107.0000 ± 0.0000
    full_mismatched: 9.0000 ± 0.0000
    full_match_rate: 92.2414 ± 0.0000
    accuracy: 92.2414 ± 0.0000
    average_timing_score: 0.9883 ± 0.0000
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.9747 ± 0.0000
    average_best_codebleu:
      average_CodeBLEU: 0.5738 ± 0.0006
      comb_7_CodeBLEU: 0.8064 ± 0.0002
      dataflow_match_score: 0.2241 ± 0.0000
      ngram_match_score: 0.2278 ± 0.0018
      syntax_match_score: 0.9228 ± 0.0000
      weighted_ngram_match_score: 0.1447 ± 0.0004
    average_best_levenshtein: 1.3621 ± 0.0299
    average_best_normalized_levenshtein: 0.0557 ± 0.0023
    average_inference_time: 1.1755 ± 0.0518
  complex:
    count: 20.0000 ± 0.0000
    full_matches: 5.3333 ± 1.1547
    exact_code_matches: 0.0000 ± 0.0000
    exact_code_match_rate: 0.0000 ± 0.0000
    pv_exact_matched: 5.3333 ± 1.1547
    pv_exact_mismatched: 14.6667 ± 1.1547
    pv_exact_match_rate: 26.6667 ± 5.7735
    average_pv_match_rate: 50.2958 ± 2.0805
    average_pv_mismatch_rate: 49.7042 ± 2.0805
    timing_matched: 15.0000 ± 1.0000
    timing_mismatched: 5.0000 ± 1.0000
    timing_match_rate: 75.0000 ± 5.0000
    temp_matched: 2.0000 ± 0.0000
    temp_mismatched: 7.0000 ± 0.0000
    temp_match_rate: 22.2222 ± 0.0000
    full_matched: 5.3333 ± 1.1547
    full_mismatched: 14.6667 ± 1.1547
    full_match_rate: 26.6667 ± 5.7735
    accuracy: 26.6667 ± 5.7735
    average_timing_score: 0.8126 ± 0.0193
    average_temp_score: 0.7455 ± 0.0234
    average_full_score: 0.5797 ± 0.0239
    average_best_codebleu:
      average_CodeBLEU: 0.3215 ± 0.0189
      comb_7_CodeBLEU: 0.4386 ± 0.0144
      dataflow_match_score: 0.4491 ± 0.0136
      ngram_match_score: 0.1136 ± 0.0204
      syntax_match_score: 0.5341 ± 0.0093
      weighted_ngram_match_score: 0.1392 ± 0.0322
    average_best_levenshtein: 151.7000 ± 14.8599
    average_best_normalized_levenshtein: 0.4831 ± 0.0250
    average_inference_time: 1.8164 ± 0.3132
==================================================


================================================================================

gpt-4o
------
Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 88.2353 ± 0.7353
total_entries: 136.0000 ± 0.0000
correct_matches: 120.0000 ± 1.0000
average_execution_time: 0.8244 ± 0.0864
metrics_by_complexity:
  simple:
    count: 116.0000 ± 0.0000
    full_matches: 111.3333 ± 0.5774
    exact_code_matches: 104.3333 ± 0.5774
    exact_code_match_rate: 89.9425 ± 0.4977
    pv_exact_matched: 114.0000 ± 0.0000
    pv_exact_mismatched: 2.0000 ± 0.0000
    pv_exact_match_rate: 98.2759 ± 0.0000
    average_pv_match_rate: 99.4923 ± 0.0830
    average_pv_mismatch_rate: 0.5077 ± 0.0830
    timing_matched: 112.6667 ± 0.5774
    timing_mismatched: 3.3333 ± 0.5774
    timing_match_rate: 97.1264 ± 0.4977
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 111.3333 ± 0.5774
    full_mismatched: 4.6667 ± 0.5774
    full_match_rate: 95.9770 ± 0.4977
    accuracy: 95.9770 ± 0.4977
    average_timing_score: 0.9877 ± 0.0024
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.9935 ± 0.0005
    average_best_codebleu:
      average_CodeBLEU: 0.6267 ± 0.0019
      comb_7_CodeBLEU: 0.8488 ± 0.0012
      dataflow_match_score: 0.2845 ± 0.0000
      ngram_match_score: 0.2991 ± 0.0031
      syntax_match_score: 0.9939 ± 0.0015
      weighted_ngram_match_score: 0.2137 ± 0.0031
    average_best_levenshtein: 1.3678 ± 0.0407
    average_best_normalized_levenshtein: 0.0221 ± 0.0017
    average_inference_time: 0.7072 ± 0.1046
  complex:
    count: 20.0000 ± 0.0000
    full_matches: 8.6667 ± 0.5774
    exact_code_matches: 0.0000 ± 0.0000
    exact_code_match_rate: 0.0000 ± 0.0000
    pv_exact_matched: 10.6667 ± 0.5774
    pv_exact_mismatched: 9.3333 ± 0.5774
    pv_exact_match_rate: 53.3333 ± 2.8868
    average_pv_match_rate: 84.8739 ± 1.3395
    average_pv_mismatch_rate: 15.1261 ± 1.3395
    timing_matched: 13.3333 ± 0.5774
    timing_mismatched: 6.6667 ± 0.5774
    timing_match_rate: 66.6667 ± 2.8868
    temp_matched: 6.0000 ± 0.0000
    temp_mismatched: 3.0000 ± 0.0000
    temp_match_rate: 66.6667 ± 0.0000
    full_matched: 8.6667 ± 0.5774
    full_mismatched: 11.3333 ± 0.5774
    full_match_rate: 43.3333 ± 2.8868
    accuracy: 43.3333 ± 2.8868
    average_timing_score: 0.7775 ± 0.0125
    average_temp_score: 0.9372 ± 0.0001
    average_full_score: 0.8303 ± 0.0091
    average_best_codebleu:
      average_CodeBLEU: 0.4120 ± 0.0045
      comb_7_CodeBLEU: 0.5217 ± 0.0021
      dataflow_match_score: 0.4343 ± 0.0054
      ngram_match_score: 0.2189 ± 0.0110
      syntax_match_score: 0.6554 ± 0.0005
      weighted_ngram_match_score: 0.2395 ± 0.0094
    average_best_levenshtein: 140.3833 ± 2.6383
    average_best_normalized_levenshtein: 0.4486 ± 0.0093
    average_inference_time: 1.5045 ± 0.0388
==================================================


================================================================================

phi3.5
------
Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 0.7353 ± 1.2736
total_entries: 136.0000 ± 0.0000
correct_matches: 1.0000 ± 1.7321
average_execution_time: 0.7805 ± 0.3720
metrics_by_complexity:
  simple:
    count: 116.0000 ± 0.0000
    full_matches: 1.0000 ± 1.7321
    exact_code_matches: 1.0000 ± 1.7321
    exact_code_match_rate: 0.8621 ± 1.4931
    pv_exact_matched: 1.0000 ± 1.7321
    pv_exact_mismatched: 115.0000 ± 1.7321
    pv_exact_match_rate: 0.8621 ± 1.4931
    average_pv_match_rate: 11.2069 ± 2.4637
    average_pv_mismatch_rate: 88.7931 ± 2.4637
    timing_matched: 27.0000 ± 8.6603
    timing_mismatched: 89.0000 ± 8.6603
    timing_match_rate: 23.2759 ± 7.4657
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 1.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 1.0000 ± 1.7321
    full_mismatched: 115.0000 ± 1.7321
    full_match_rate: 0.8621 ± 1.4931
    accuracy: 0.8621 ± 1.4931
    average_timing_score: 0.2328 ± 0.0747
    average_temp_score: 0.9914 ± 0.0000
    average_full_score: 0.1362 ± 0.0346
    average_best_codebleu:
      average_CodeBLEU: 0.2589 ± 0.0154
      comb_7_CodeBLEU: 0.4129 ± 0.0224
      dataflow_match_score: 0.0316 ± 0.0547
      ngram_match_score: 0.0023 ± 0.0041
      syntax_match_score: 0.0313 ± 0.0542
      weighted_ngram_match_score: 0.0019 ± 0.0033
    average_best_levenshtein: 187.4138 ± 288.9091
    average_best_normalized_levenshtein: 0.9842 ± 0.0274
    average_inference_time: 0.8176 ± 0.4361
  complex:
    count: 20.0000 ± 0.0000
    full_matches: 0.0000 ± 0.0000
    exact_code_matches: 0.0000 ± 0.0000
    exact_code_match_rate: 0.0000 ± 0.0000
    pv_exact_matched: 0.0000 ± 0.0000
    pv_exact_mismatched: 20.0000 ± 0.0000
    pv_exact_match_rate: 0.0000 ± 0.0000
    average_pv_match_rate: 1.2500 ± 0.0000
    average_pv_mismatch_rate: 98.7500 ± 0.0000
    timing_matched: 1.0000 ± 0.0000
    timing_mismatched: 19.0000 ± 0.0000
    timing_match_rate: 5.0000 ± 0.0000
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 9.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 0.0000 ± 0.0000
    full_mismatched: 20.0000 ± 0.0000
    full_match_rate: 0.0000 ± 0.0000
    accuracy: 0.0000 ± 0.0000
    average_timing_score: 0.0500 ± 0.0000
    average_temp_score: 0.5500 ± 0.0000
    average_full_score: 0.0200 ± 0.0000
    average_best_codebleu:
      average_CodeBLEU: 0.2500 ± 0.0000
      comb_7_CodeBLEU: 0.4000 ± 0.0000
      dataflow_match_score: 0.0000 ± 0.0000
      ngram_match_score: 0.0000 ± 0.0000
      syntax_match_score: 0.0000 ± 0.0000
      weighted_ngram_match_score: 0.0000 ± 0.0000
    average_best_levenshtein: 256.0000 ± 0.0000
    average_best_normalized_levenshtein: 1.0000 ± 0.0000
    average_inference_time: 0.5647 ± 0.0006
==================================================


================================================================================

phi3.5-fp16
-----------
Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 0.7353 ± 1.2736
total_entries: 136.0000 ± 0.0000
correct_matches: 1.0000 ± 1.7321
average_execution_time: 1.0608 ± 0.9540
metrics_by_complexity:
  simple:
    count: 116.0000 ± 0.0000
    full_matches: 1.0000 ± 1.7321
    exact_code_matches: 1.0000 ± 1.7321
    exact_code_match_rate: 0.8621 ± 1.4931
    pv_exact_matched: 1.0000 ± 1.7321
    pv_exact_mismatched: 115.0000 ± 1.7321
    pv_exact_match_rate: 0.8621 ± 1.4931
    average_pv_match_rate: 11.4224 ± 2.0904
    average_pv_mismatch_rate: 88.5776 ± 2.0904
    timing_matched: 28.0000 ± 6.9282
    timing_mismatched: 88.0000 ± 6.9282
    timing_match_rate: 24.1379 ± 5.9726
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 1.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 1.0000 ± 1.7321
    full_mismatched: 115.0000 ± 1.7321
    full_match_rate: 0.8621 ± 1.4931
    accuracy: 0.8621 ± 1.4931
    average_timing_score: 0.2414 ± 0.0597
    average_temp_score: 0.9914 ± 0.0000
    average_full_score: 0.1397 ± 0.0287
    average_best_codebleu:
      average_CodeBLEU: 0.2586 ± 0.0149
      comb_7_CodeBLEU: 0.4125 ± 0.0216
      dataflow_match_score: 0.0287 ± 0.0498
      ngram_match_score: 0.0023 ± 0.0040
      syntax_match_score: 0.0301 ± 0.0522
      weighted_ngram_match_score: 0.0020 ± 0.0035
    average_best_levenshtein: 331.2730 ± 538.0805
    average_best_normalized_levenshtein: 0.9884 ± 0.0200
    average_inference_time: 1.1558 ± 1.1186
  complex:
    count: 20.0000 ± 0.0000
    full_matches: 0.0000 ± 0.0000
    exact_code_matches: 0.0000 ± 0.0000
    exact_code_match_rate: 0.0000 ± 0.0000
    pv_exact_matched: 0.0000 ± 0.0000
    pv_exact_mismatched: 20.0000 ± 0.0000
    pv_exact_match_rate: 0.0000 ± 0.0000
    average_pv_match_rate: 1.2500 ± 0.0000
    average_pv_mismatch_rate: 98.7500 ± 0.0000
    timing_matched: 1.0000 ± 0.0000
    timing_mismatched: 19.0000 ± 0.0000
    timing_match_rate: 5.0000 ± 0.0000
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 9.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 0.0000 ± 0.0000
    full_mismatched: 20.0000 ± 0.0000
    full_match_rate: 0.0000 ± 0.0000
    accuracy: 0.0000 ± 0.0000
    average_timing_score: 0.0500 ± 0.0000
    average_temp_score: 0.5500 ± 0.0000
    average_full_score: 0.0200 ± 0.0000
    average_best_codebleu:
      average_CodeBLEU: 0.2500 ± 0.0000
      comb_7_CodeBLEU: 0.4000 ± 0.0000
      dataflow_match_score: 0.0000 ± 0.0000
      ngram_match_score: 0.0000 ± 0.0000
      syntax_match_score: 0.0000 ± 0.0000
      weighted_ngram_match_score: 0.0000 ± 0.0000
    average_best_levenshtein: 256.0000 ± 0.0000
    average_best_normalized_levenshtein: 1.0000 ± 0.0000
    average_inference_time: 0.5094 ± 0.0012
==================================================


================================================================================

athene-v2
---------
Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 90.6863 ± 0.4245
total_entries: 136.0000 ± 0.0000
correct_matches: 123.3333 ± 0.5774
average_execution_time: 1.7377 ± 0.1315
metrics_by_complexity:
  simple:
    count: 116.0000 ± 0.0000
    full_matches: 115.0000 ± 0.0000
    exact_code_matches: 110.0000 ± 0.0000
    exact_code_match_rate: 94.8276 ± 0.0000
    pv_exact_matched: 115.0000 ± 0.0000
    pv_exact_mismatched: 1.0000 ± 0.0000
    pv_exact_match_rate: 99.1379 ± 0.0000
    average_pv_match_rate: 99.7126 ± 0.0000
    average_pv_mismatch_rate: 0.2874 ± 0.0000
    timing_matched: 115.0000 ± 0.0000
    timing_mismatched: 1.0000 ± 0.0000
    timing_match_rate: 99.1379 ± 0.0000
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 115.0000 ± 0.0000
    full_mismatched: 1.0000 ± 0.0000
    full_match_rate: 99.1379 ± 0.0000
    accuracy: 99.1379 ± 0.0000
    average_timing_score: 0.9953 ± 0.0000
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.9968 ± 0.0000
    average_best_codebleu:
      average_CodeBLEU: 0.6059 ± 0.0000
      comb_7_CodeBLEU: 0.8421 ± 0.0000
      dataflow_match_score: 0.2328 ± 0.0000
      ngram_match_score: 0.2626 ± 0.0000
      syntax_match_score: 0.9990 ± 0.0000
      weighted_ngram_match_score: 0.1620 ± 0.0000
    average_best_levenshtein: 0.2845 ± 0.0000
    average_best_normalized_levenshtein: 0.0101 ± 0.0000
    average_inference_time: 0.7948 ± 0.1440
  complex:
    count: 20.0000 ± 0.0000
    full_matches: 8.3333 ± 0.5774
    exact_code_matches: 0.0000 ± 0.0000
    exact_code_match_rate: 0.0000 ± 0.0000
    pv_exact_matched: 10.0000 ± 0.0000
    pv_exact_mismatched: 10.0000 ± 0.0000
    pv_exact_match_rate: 50.0000 ± 0.0000
    average_pv_match_rate: 73.2463 ± 0.0000
    average_pv_mismatch_rate: 26.7537 ± 0.0000
    timing_matched: 10.3333 ± 0.5774
    timing_mismatched: 9.6667 ± 0.5774
    timing_match_rate: 51.6667 ± 2.8868
    temp_matched: 6.0000 ± 0.0000
    temp_mismatched: 3.0000 ± 0.0000
    temp_match_rate: 66.6667 ± 0.0000
    full_matched: 8.3333 ± 0.5774
    full_mismatched: 11.6667 ± 0.5774
    full_match_rate: 41.6667 ± 2.8868
    accuracy: 41.6667 ± 2.8868
    average_timing_score: 0.7014 ± 0.0110
    average_temp_score: 0.9065 ± 0.0000
    average_full_score: 0.7217 ± 0.0022
    average_best_codebleu:
      average_CodeBLEU: 0.4108 ± 0.0010
      comb_7_CodeBLEU: 0.5303 ± 0.0004
      dataflow_match_score: 0.5105 ± 0.0000
      ngram_match_score: 0.1934 ± 0.0020
      syntax_match_score: 0.6093 ± 0.0000
      weighted_ngram_match_score: 0.2300 ± 0.0018
    average_best_levenshtein: 151.2500 ± 0.1732
    average_best_normalized_levenshtein: 0.4534 ± 0.0023
    average_inference_time: 7.2068 ± 0.1441
==================================================


================================================================================

athene-v2-agent
---------------
Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 91.1765 ± 0.0000
total_entries: 136.0000 ± 0.0000
correct_matches: 124.0000 ± 0.0000
average_execution_time: 1.6395 ± 0.1850
metrics_by_complexity:
  simple:
    count: 116.0000 ± 0.0000
    full_matches: 116.0000 ± 0.0000
    exact_code_matches: 111.0000 ± 0.0000
    exact_code_match_rate: 95.6897 ± 0.0000
    pv_exact_matched: 116.0000 ± 0.0000
    pv_exact_mismatched: 0.0000 ± 0.0000
    pv_exact_match_rate: 100.0000 ± 0.0000
    average_pv_match_rate: 100.0000 ± 0.0000
    average_pv_mismatch_rate: 0.0000 ± 0.0000
    timing_matched: 116.0000 ± 0.0000
    timing_mismatched: 0.0000 ± 0.0000
    timing_match_rate: 100.0000 ± 0.0000
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 116.0000 ± 0.0000
    full_mismatched: 0.0000 ± 0.0000
    full_match_rate: 100.0000 ± 0.0000
    accuracy: 100.0000 ± 0.0000
    average_timing_score: 1.0000 ± 0.0000
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 1.0000 ± 0.0000
    average_best_codebleu:
      average_CodeBLEU: 0.6056 ± 0.0000
      comb_7_CodeBLEU: 0.8422 ± 0.0000
      dataflow_match_score: 0.2241 ± 0.0000
      ngram_match_score: 0.2620 ± 0.0000
      syntax_match_score: 1.0000 ± 0.0000
      weighted_ngram_match_score: 0.1602 ± 0.0000
    average_best_levenshtein: 0.1293 ± 0.0000
    average_best_normalized_levenshtein: 0.0079 ± 0.0000
    average_inference_time: 0.7232 ± 0.1272
  complex:
    count: 20.0000 ± 0.0000
    full_matches: 8.0000 ± 0.0000
    exact_code_matches: 0.0000 ± 0.0000
    exact_code_match_rate: 0.0000 ± 0.0000
    pv_exact_matched: 9.0000 ± 0.0000
    pv_exact_mismatched: 11.0000 ± 0.0000
    pv_exact_match_rate: 45.0000 ± 0.0000
    average_pv_match_rate: 78.8771 ± 0.0000
    average_pv_mismatch_rate: 21.1229 ± 0.0000
    timing_matched: 11.0000 ± 0.0000
    timing_mismatched: 9.0000 ± 0.0000
    timing_match_rate: 55.0000 ± 0.0000
    temp_matched: 6.0000 ± 0.0000
    temp_mismatched: 3.0000 ± 0.0000
    temp_match_rate: 66.6667 ± 0.0000
    full_matched: 8.0000 ± 0.0000
    full_mismatched: 12.0000 ± 0.0000
    full_match_rate: 40.0000 ± 0.0000
    accuracy: 40.0000 ± 0.0000
    average_timing_score: 0.7942 ± 0.0001
    average_temp_score: 0.9517 ± 0.0000
    average_full_score: 0.7955 ± 0.0000
    average_best_codebleu:
      average_CodeBLEU: 0.3734 ± 0.0000
      comb_7_CodeBLEU: 0.4824 ± 0.0000
      dataflow_match_score: 0.4708 ± 0.0000
      ngram_match_score: 0.1856 ± 0.0000
      syntax_match_score: 0.5892 ± 0.0000
      weighted_ngram_match_score: 0.1983 ± 0.0000
    average_best_levenshtein: 147.7000 ± 0.0000
    average_best_normalized_levenshtein: 0.4606 ± 0.0000
    average_inference_time: 6.9540 ± 0.5282
==================================================


================================================================================

llama3.3
--------
Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 87.7451 ± 0.4245
total_entries: 136.0000 ± 0.0000
correct_matches: 119.3333 ± 0.5774
average_execution_time: 1.6217 ± 0.1783
metrics_by_complexity:
  simple:
    count: 116.0000 ± 0.0000
    full_matches: 114.3333 ± 0.5774
    exact_code_matches: 103.3333 ± 0.5774
    exact_code_match_rate: 89.0805 ± 0.4977
    pv_exact_matched: 114.3333 ± 0.5774
    pv_exact_mismatched: 1.6667 ± 0.5774
    pv_exact_match_rate: 98.5632 ± 0.4977
    average_pv_match_rate: 99.5977 ± 0.0995
    average_pv_mismatch_rate: 0.4023 ± 0.0995
    timing_matched: 115.0000 ± 0.0000
    timing_mismatched: 1.0000 ± 0.0000
    timing_match_rate: 99.1379 ± 0.0000
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 114.3333 ± 0.5774
    full_mismatched: 1.6667 ± 0.5774
    full_match_rate: 98.5632 ± 0.4977
    accuracy: 98.5632 ± 0.4977
    average_timing_score: 0.9914 ± 0.0001
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.9951 ± 0.0008
    average_best_codebleu:
      average_CodeBLEU: 0.6111 ± 0.0002
      comb_7_CodeBLEU: 0.8429 ± 0.0008
      dataflow_match_score: 0.2816 ± 0.0050
      ngram_match_score: 0.2671 ± 0.0008
      syntax_match_score: 0.9948 ± 0.0023
      weighted_ngram_match_score: 0.1827 ± 0.0005
    average_best_levenshtein: 1.8937 ± 0.1294
    average_best_normalized_levenshtein: 0.0251 ± 0.0016
    average_inference_time: 0.8581 ± 0.1307
  complex:
    count: 20.0000 ± 0.0000
    full_matches: 5.0000 ± 0.0000
    exact_code_matches: 0.0000 ± 0.0000
    exact_code_match_rate: 0.0000 ± 0.0000
    pv_exact_matched: 5.0000 ± 0.0000
    pv_exact_mismatched: 15.0000 ± 0.0000
    pv_exact_match_rate: 25.0000 ± 0.0000
    average_pv_match_rate: 68.9789 ± 0.0000
    average_pv_mismatch_rate: 31.0211 ± 0.0000
    timing_matched: 12.0000 ± 0.0000
    timing_mismatched: 8.0000 ± 0.0000
    timing_match_rate: 60.0000 ± 0.0000
    temp_matched: 7.0000 ± 0.0000
    temp_mismatched: 2.0000 ± 0.0000
    temp_match_rate: 77.7778 ± 0.0000
    full_matched: 5.0000 ± 0.0000
    full_mismatched: 15.0000 ± 0.0000
    full_match_rate: 25.0000 ± 0.0000
    accuracy: 25.0000 ± 0.0000
    average_timing_score: 0.7496 ± 0.0027
    average_temp_score: 0.9131 ± 0.0000
    average_full_score: 0.7131 ± 0.0005
    average_best_codebleu:
      average_CodeBLEU: 0.3646 ± 0.0004
      comb_7_CodeBLEU: 0.4996 ± 0.0002
      dataflow_match_score: 0.5310 ± 0.0000
      ngram_match_score: 0.1296 ± 0.0008
      syntax_match_score: 0.6482 ± 0.0000
      weighted_ngram_match_score: 0.1498 ± 0.0008
    average_best_levenshtein: 198.0500 ± 0.3464
    average_best_normalized_levenshtein: 0.4933 ± 0.0015
    average_inference_time: 6.0508 ± 0.4571
==================================================


================================================================================

claude-sonnet-4-20250514-thinking
---------------------------------
Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 90.1961 ± 0.4245
total_entries: 136.0000 ± 0.0000
correct_matches: 122.6667 ± 0.5774
average_execution_time: 5.1851 ± 0.2271
metrics_by_complexity:
  simple:
    count: 116.0000 ± 0.0000
    full_matches: 111.6667 ± 0.5774
    exact_code_matches: 107.3333 ± 1.5275
    exact_code_match_rate: 92.5287 ± 1.3168
    pv_exact_matched: 112.0000 ± 0.0000
    pv_exact_mismatched: 4.0000 ± 0.0000
    pv_exact_match_rate: 96.5517 ± 0.0000
    average_pv_match_rate: 97.5862 ± 0.3982
    average_pv_mismatch_rate: 2.4138 ± 0.3982
    timing_matched: 112.3333 ± 0.5774
    timing_mismatched: 3.6667 ± 0.5774
    timing_match_rate: 96.8391 ± 0.4977
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 111.6667 ± 0.5774
    full_mismatched: 4.3333 ± 0.5774
    full_match_rate: 96.2644 ± 0.4977
    accuracy: 96.2644 ± 0.4977
    average_timing_score: 0.9738 ± 0.0043
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.9755 ± 0.0040
    average_best_codebleu:
      average_CodeBLEU: 0.6023 ± 0.0019
      comb_7_CodeBLEU: 0.8356 ± 0.0008
      dataflow_match_score: 0.2414 ± 0.0086
      ngram_match_score: 0.2615 ± 0.0054
      syntax_match_score: 0.9824 ± 0.0027
      weighted_ngram_match_score: 0.1652 ± 0.0037
    average_best_levenshtein: 1.0575 ± 0.0782
    average_best_normalized_levenshtein: 0.0270 ± 0.0026
    average_inference_time: 4.4522 ± 0.2557
  complex:
    count: 20.0000 ± 0.0000
    full_matches: 11.0000 ± 1.0000
    exact_code_matches: 0.0000 ± 0.0000
    exact_code_match_rate: 0.0000 ± 0.0000
    pv_exact_matched: 13.3333 ± 1.5275
    pv_exact_mismatched: 6.6667 ± 1.5275
    pv_exact_match_rate: 66.6667 ± 7.6376
    average_pv_match_rate: 93.7051 ± 2.2358
    average_pv_mismatch_rate: 6.2949 ± 2.2358
    timing_matched: 15.0000 ± 0.0000
    timing_mismatched: 5.0000 ± 0.0000
    timing_match_rate: 75.0000 ± 0.0000
    temp_matched: 7.6667 ± 0.5774
    temp_mismatched: 1.3333 ± 0.5774
    temp_match_rate: 85.1852 ± 6.4150
    full_matched: 11.0000 ± 1.0000
    full_mismatched: 9.0000 ± 1.0000
    full_match_rate: 55.0000 ± 5.0000
    accuracy: 55.0000 ± 5.0000
    average_timing_score: 0.8506 ± 0.0136
    average_temp_score: 0.9700 ± 0.0213
    average_full_score: 0.9188 ± 0.0153
    average_best_codebleu:
      average_CodeBLEU: 0.4376 ± 0.0200
      comb_7_CodeBLEU: 0.5551 ± 0.0113
      dataflow_match_score: 0.5398 ± 0.0395
      ngram_match_score: 0.2243 ± 0.0414
      syntax_match_score: 0.6769 ± 0.0161
      weighted_ngram_match_score: 0.2594 ± 0.0329
    average_best_levenshtein: 137.1000 ± 15.2487
    average_best_normalized_levenshtein: 0.4197 ± 0.0334
    average_inference_time: 9.4360 ± 0.6643
==================================================


================================================================================
