Statistics Summary for op agent (mean ± std):
==================================================
accuracy: 51.5873 ± 0.7937
total_entries: 126.0000 ± 0.0000
correct_matches: 65.0000 ± 1.0000
average_execution_time: 0.8327 ± 0.0810
metrics_by_complexity:
  simple:
    count: 120.0000 ± 0.0000
    full_matches: 65.0000 ± 1.0000
    exact_code_matches: 48.0000 ± 0.0000
    exact_code_match_rate: 40.0000 ± 0.0000
    pv_exact_matched: 70.0000 ± 0.0000
    pv_exact_mismatched: 50.0000 ± 0.0000
    pv_exact_match_rate: 58.3333 ± 0.0000
    average_pv_match_rate: 65.4167 ± 0.0000
    average_pv_mismatch_rate: 34.5833 ± 0.0000
    timing_matched: 80.0000 ± 1.0000
    timing_mismatched: 40.0000 ± 1.0000
    timing_match_rate: 66.6667 ± 0.8333
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 65.0000 ± 1.0000
    full_mismatched: 55.0000 ± 1.0000
    full_match_rate: 54.1667 ± 0.8333
    accuracy: 54.1667 ± 0.8333
    average_timing_score: 0.7095 ± 0.0029
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.6652 ± 0.0006
    average_best_codebleu:
      average_CodeBLEU: 0.4867 ± 0.0000
      comb_7_CodeBLEU: 0.7235 ± 0.0000
      dataflow_match_score: 0.1396 ± 0.0000
      ngram_match_score: 0.1105 ± 0.0000
      syntax_match_score: 0.7646 ± 0.0000
      weighted_ngram_match_score: 0.0739 ± 0.0000
    average_best_levenshtein: 25.7556 ± 0.0096
    average_best_normalized_levenshtein: 0.2572 ± 0.0000
    average_inference_time: 0.7627 ± 0.0325
  complex:
    count: 6.0000 ± 0.0000
    full_matches: 0.0000 ± 0.0000
    exact_code_matches: 0.0000 ± 0.0000
    exact_code_match_rate: 0.0000 ± 0.0000
    pv_exact_matched: 0.0000 ± 0.0000
    pv_exact_mismatched: 6.0000 ± 0.0000
    pv_exact_match_rate: 0.0000 ± 0.0000
    average_pv_match_rate: 2.5253 ± 4.3739
    average_pv_mismatch_rate: 97.4747 ± 4.3739
    timing_matched: 0.0000 ± 0.0000
    timing_mismatched: 6.0000 ± 0.0000
    timing_match_rate: 0.0000 ± 0.0000
    temp_matched: 0.0000 ± 0.0000
    temp_mismatched: 0.0000 ± 0.0000
    temp_match_rate: 0.0000 ± 0.0000
    full_matched: 0.0000 ± 0.0000
    full_mismatched: 6.0000 ± 0.0000
    full_match_rate: 0.0000 ± 0.0000
    accuracy: 0.0000 ± 0.0000
    average_timing_score: 0.0316 ± 0.0548
    average_temp_score: 1.0000 ± 0.0000
    average_full_score: 0.0265 ± 0.0459
    average_best_codebleu:
      average_CodeBLEU: 0.4665 ± 0.0011
      comb_7_CodeBLEU: 0.5959 ± 0.0004
      dataflow_match_score: 0.6643 ± 0.0361
      ngram_match_score: 0.0894 ± 0.0093
      syntax_match_score: 0.7002 ± 0.0333
      weighted_ngram_match_score: 0.4121 ± 0.0164
    average_best_levenshtein: 301.0556 ± 32.5241
    average_best_normalized_levenshtein: 0.6934 ± 0.0100
    average_inference_time: 2.2322 ± 1.0517
==================================================
