[1] "----------------Iteration 1-----------------"
[1] "zero-shot"
[1] "z-test between codegemma and llama3"

	2-sample test for equality of proportions with continuity correction

data:  c(codegemma, llama3) out of c(100, 100)
X-squared = 21.856, df = 1, p-value = 2.94e-06
alternative hypothesis: two.sided
95 percent confidence interval:
 -0.4242509 -0.1757491
sample estimates:
prop 1 prop 2 
  0.11   0.41 

[1] "z-test between codegemma and mistral"

	2-sample test for equality of proportions with continuity correction

data:  c(codegemma, mistral) out of c(100, 100)
X-squared = 3.7634, df = 1, p-value = 0.05238
alternative hypothesis: two.sided
95 percent confidence interval:
 0.0001526471 0.1598473529
sample estimates:
prop 1 prop 2 
  0.11   0.03 

[1] "z-test between mistral and llama3"

	2-sample test for equality of proportions with continuity correction

data:  c(mistral, llama3) out of c(100, 100)
X-squared = 39.889, df = 1, p-value = 2.688e-10
alternative hypothesis: two.sided
95 percent confidence interval:
 -0.4920311 -0.2679689
sample estimates:
prop 1 prop 2 
  0.03   0.41 

[1] "few-shot 1"
[1] "z-test between codegemma and llama3"

	2-sample test for equality of proportions with continuity correction

data:  c(codegemma, llama3) out of c(100, 100)
X-squared = 3.7812, df = 1, p-value = 0.05183
alternative hypothesis: two.sided
95 percent confidence interval:
 0.0003821037 0.2396178963
sample estimates:
prop 1 prop 2 
  0.26   0.14 

[1] "z-test between codegemma and mistral"

	2-sample test for equality of proportions with continuity correction

data:  c(codegemma, mistral) out of c(100, 100)
X-squared = 0.026667, df = 1, p-value = 0.8703
alternative hypothesis: two.sided
95 percent confidence interval:
 -0.1099908  0.1499908
sample estimates:
prop 1 prop 2 
  0.26   0.24 

[1] "z-test between mistral and llama3"

	2-sample test for equality of proportions with continuity correction

data:  c(mistral, llama3) out of c(100, 100)
X-squared = 2.6316, df = 1, p-value = 0.1048
alternative hypothesis: two.sided
95 percent confidence interval:
 -0.01785146  0.21785146
sample estimates:
prop 1 prop 2 
  0.24   0.14 

[1] "few-shot 2"
[1] "z-test between codegemma and llama3"

	2-sample test for equality of proportions with continuity correction

data:  c(codegemma, llama3) out of c(100, 100)
X-squared = 32.409, df = 1, p-value = 1.249e-08
alternative hypothesis: two.sided
95 percent confidence interval:
 -0.5191394 -0.2608606
sample estimates:
prop 1 prop 2 
  0.14   0.53 

[1] "z-test between codegemma and mistral"

	2-sample test for equality of proportions with continuity correction

data:  c(codegemma, mistral) out of c(100, 100)
X-squared = 1.9154, df = 1, p-value = 0.1664
alternative hypothesis: two.sided
95 percent confidence interval:
 -0.02441508  0.16441508
sample estimates:
prop 1 prop 2 
  0.14   0.07 

[1] "z-test between mistral and llama3"

	2-sample test for equality of proportions with continuity correction

data:  c(mistral, llama3) out of c(100, 100)
X-squared = 48.214, df = 1, p-value = 3.821e-12
alternative hypothesis: two.sided
95 percent confidence interval:
 -0.5798629 -0.3401371
sample estimates:
prop 1 prop 2 
  0.07   0.53 

[1] "few-shot 3"
[1] "z-test between codegemma and llama3"

	2-sample test for equality of proportions with continuity correction

data:  c(codegemma, llama3) out of c(100, 100)
X-squared = 2.5026, df = 1, p-value = 0.1137
alternative hypothesis: two.sided
95 percent confidence interval:
 -0.1760154  0.0160154
sample estimates:
prop 1 prop 2 
  0.07   0.15 

[1] "z-test between codegemma and mistral"

	2-sample test for equality of proportions with continuity correction

data:  c(codegemma, mistral) out of c(100, 100)
X-squared = 0.9305, df = 1, p-value = 0.3347
alternative hypothesis: two.sided
95 percent confidence interval:
 -0.14097759  0.04097759
sample estimates:
prop 1 prop 2 
  0.07   0.12 

[1] "z-test between mistral and llama3"

	2-sample test for equality of proportions with continuity correction

data:  c(mistral, llama3) out of c(100, 100)
X-squared = 0.17127, df = 1, p-value = 0.679
alternative hypothesis: two.sided
95 percent confidence interval:
 -0.13462791  0.07462791
sample estimates:
prop 1 prop 2 
  0.12   0.15 

[1] "Comparing percentages of <semantic> vs <syntactic (+ semantic)> results per model, per approach"
[1] "codegemma: semantic vs syntactic"
[1] "zero-shot"

	2-sample test for equality of proportions with continuity correction

data:  c(11, 18) out of c(100, 100)
X-squared = 1.4519, df = 1, p-value = 0.2282
alternative hypothesis: two.sided
95 percent confidence interval:
 -0.17711221  0.03711221
sample estimates:
prop 1 prop 2 
  0.11   0.18 

[1] "few-shot 1"

	2-sample test for equality of proportions with continuity correction

data:  c(26, 39) out of c(100, 100)
X-squared = 3.2821, df = 1, p-value = 0.07004
alternative hypothesis: two.sided
95 percent confidence interval:
 -0.268568259  0.008568259
sample estimates:
prop 1 prop 2 
  0.26   0.39 

[1] "few-shot 2"

	2-sample test for equality of proportions with continuity correction

data:  c(14, 34) out of c(100, 100)
X-squared = 9.8958, df = 1, p-value = 0.001657
alternative hypothesis: two.sided
95 percent confidence interval:
 -0.32508844 -0.07491156
sample estimates:
prop 1 prop 2 
  0.14   0.34 

[1] "few-shot 3"

	2-sample test for equality of proportions with continuity correction

data:  c(7, 29) out of c(100, 100)
X-squared = 14.939, df = 1, p-value = 0.000111
alternative hypothesis: two.sided
95 percent confidence interval:
 -0.3320311 -0.1079689
sample estimates:
prop 1 prop 2 
  0.07   0.29 

[1] "llama3: semantic vs syntactic"
[1] "zero-shot"

	2-sample test for equality of proportions with continuity correction

data:  c(41, 53) out of c(100, 100)
X-squared = 2.4287, df = 1, p-value = 0.1191
alternative hypothesis: two.sided
95 percent confidence interval:
 -0.2673374  0.0273374
sample estimates:
prop 1 prop 2 
  0.41   0.53 

[1] "few-shot 1"

	2-sample test for equality of proportions with continuity correction

data:  c(14, 61) out of c(100, 100)
X-squared = 45.141, df = 1, p-value = 1.833e-11
alternative hypothesis: two.sided
95 percent confidence interval:
 -0.5973198 -0.3426802
sample estimates:
prop 1 prop 2 
  0.14   0.61 

[1] "few-shot 2"

	2-sample test for equality of proportions with continuity correction

data:  c(53, 69) out of c(100, 100)
X-squared = 4.7289, df = 1, p-value = 0.02966
alternative hypothesis: two.sided
95 percent confidence interval:
 -0.30336399 -0.01663601
sample estimates:
prop 1 prop 2 
  0.53   0.69 

[1] "few-shot 3"

	2-sample test for equality of proportions with continuity correction

data:  c(15, 35) out of c(100, 100)
X-squared = 9.6267, df = 1, p-value = 0.001918
alternative hypothesis: two.sided
95 percent confidence interval:
 -0.32677833 -0.07322167
sample estimates:
prop 1 prop 2 
  0.15   0.35 

[1] "mistral: semantic vs syntactic"
[1] "zero-shot"

	2-sample test for equality of proportions with continuity correction

data:  c(3, 27) out of c(100, 100)
X-squared = 20.745, df = 1, p-value = 5.247e-06
alternative hypothesis: two.sided
95 percent confidence interval:
 -0.3432168 -0.1367832
sample estimates:
prop 1 prop 2 
  0.03   0.27 

[1] "few-shot 1"

	2-sample test for equality of proportions with continuity correction

data:  c(6, 25) out of c(100, 100)
X-squared = 12.369, df = 1, p-value = 0.0004366
alternative hypothesis: two.sided
95 percent confidence interval:
 -0.29679524 -0.08320476
sample estimates:
prop 1 prop 2 
  0.06   0.25 

[1] "few-shot 2"

	2-sample test for equality of proportions with continuity correction

data:  c(7, 10) out of c(100, 100)
X-squared = 0.25715, df = 1, p-value = 0.6121
alternative hypothesis: two.sided
95 percent confidence interval:
 -0.11718875  0.05718875
sample estimates:
prop 1 prop 2 
  0.07   0.10 

[1] "few-shot 3"

	2-sample test for equality of proportions with continuity correction

data:  c(12, 28) out of c(100, 100)
X-squared = 7.0312, df = 1, p-value = 0.00801
alternative hypothesis: two.sided
95 percent confidence interval:
 -0.27863223 -0.04136777
sample estimates:
prop 1 prop 2 
  0.12   0.28 

[1] "----------------Iteration 2-----------------"
[1] "zero-shot"
[1] "z-test between codegemma and llama3"

	2-sample test for equality of proportions with continuity correction

data:  c(codegemma, llama3) out of c(100, 100)
X-squared = 56.533, df = 1, p-value = 5.526e-14
alternative hypothesis: two.sided
95 percent confidence interval:
 0.3749086 0.6050914
sample estimates:
prop 1 prop 2 
  0.96   0.47 

[1] "z-test between codegemma and mistral"

	2-sample test for equality of proportions with continuity correction

data:  c(codegemma, mistral) out of c(100, 100)
X-squared = 151.62, df = 1, p-value < 2.2e-16
alternative hypothesis: two.sided
95 percent confidence interval:
 0.8044071 0.9555929
sample estimates:
prop 1 prop 2 
  0.96   0.08 

[1] "z-test between mistral and llama3"

	2-sample test for equality of proportions with continuity correction

data:  c(mistral, llama3) out of c(100, 100)
X-squared = 36.213, df = 1, p-value = 1.769e-09
alternative hypothesis: two.sided
95 percent confidence interval:
 -0.5113391 -0.2686609
sample estimates:
prop 1 prop 2 
  0.08   0.47 

[1] "few-shot 1"
[1] "z-test between codegemma and llama3"

	2-sample test for equality of proportions with continuity correction

data:  c(codegemma, llama3) out of c(100, 100)
X-squared = 0, df = 1, p-value = 1
alternative hypothesis: two.sided
95 percent confidence interval:
 -0.0195014  0.0395014
sample estimates:
prop 1 prop 2 
  0.01   0.00 

[1] "z-test between codegemma and mistral"

	2-sample test for equality of proportions with continuity correction

data:  c(codegemma, mistral) out of c(100, 100)
X-squared = 24.663, df = 1, p-value = 6.829e-07
alternative hypothesis: two.sided
95 percent confidence interval:
 -0.3481548 -0.1518452
sample estimates:
prop 1 prop 2 
  0.01   0.26 

[1] "z-test between mistral and llama3"

	2-sample test for equality of proportions with continuity correction

data:  c(mistral, llama3) out of c(100, 100)
X-squared = 27.63, df = 1, p-value = 1.469e-07
alternative hypothesis: two.sided
95 percent confidence interval:
 0.1640293 0.3559707
sample estimates:
prop 1 prop 2 
  0.26   0.00 

[1] "few-shot 2"
[1] "z-test between codegemma and llama3"

	2-sample test for equality of proportions with continuity correction

data:  c(codegemma, llama3) out of c(100, 100)
X-squared = 23.23, df = 1, p-value = 1.437e-06
alternative hypothesis: two.sided
95 percent confidence interval:
 0.1487616 0.3512384
sample estimates:
prop 1 prop 2 
  0.27   0.02 

[1] "z-test between codegemma and mistral"

	2-sample test for equality of proportions with continuity correction

data:  c(codegemma, mistral) out of c(100, 100)
X-squared = 7.7435, df = 1, p-value = 0.005391
alternative hypothesis: two.sided
95 percent confidence interval:
 -0.34092207 -0.05907793
sample estimates:
prop 1 prop 2 
  0.27   0.47 

[1] "z-test between mistral and llama3"

	2-sample test for equality of proportions with continuity correction

data:  c(mistral, llama3) out of c(100, 100)
X-squared = 52.331, df = 1, p-value = 4.688e-13
alternative hypothesis: two.sided
95 percent confidence interval:
 0.3384028 0.5615972
sample estimates:
prop 1 prop 2 
  0.47   0.02 

[1] "few-shot 3"
[1] "z-test between codegemma and llama3"

	2-sample test for equality of proportions with continuity correction

data:  c(codegemma, llama3) out of c(100, 100)
X-squared = 0.12272, df = 1, p-value = 0.7261
alternative hypothesis: two.sided
95 percent confidence interval:
 -0.15182106  0.09182106
sample estimates:
prop 1 prop 2 
  0.19   0.22 

[1] "z-test between codegemma and mistral"

	2-sample test for equality of proportions with continuity correction

data:  c(codegemma, mistral) out of c(100, 100)
X-squared = 1.3834, df = 1, p-value = 0.2395
alternative hypothesis: two.sided
95 percent confidence interval:
 -0.20611856  0.04611856
sample estimates:
prop 1 prop 2 
  0.19   0.27 

[1] "z-test between mistral and llama3"

	2-sample test for equality of proportions with continuity correction

data:  c(mistral, llama3) out of c(100, 100)
X-squared = 0.43249, df = 1, p-value = 0.5108
alternative hypothesis: two.sided
95 percent confidence interval:
 -0.07901033  0.17901033
sample estimates:
prop 1 prop 2 
  0.27   0.22 

[1] "Comparing percentages of <semantic> vs <syntactic (+ semantic)> results per model, per approach"
[1] "codegemma: semantic vs syntactic"
[1] "zero-shot"

	2-sample test for equality of proportions with continuity correction

data:  c(96, 97) out of c(100, 100)
X-squared = 5.522e-30, df = 1, p-value = 1
alternative hypothesis: two.sided
95 percent confidence interval:
 -0.07092136  0.05092136
sample estimates:
prop 1 prop 2 
  0.96   0.97 

[1] "few-shot 1"

	2-sample test for equality of proportions with continuity correction

data:  c(1, 2) out of c(100, 100)
X-squared = 0, df = 1, p-value = 1
alternative hypothesis: two.sided
95 percent confidence interval:
 -0.05366349  0.03366349
sample estimates:
prop 1 prop 2 
  0.01   0.02 

[1] "few-shot 2"

	2-sample test for equality of proportions with continuity correction

data:  c(27, 31) out of c(100, 100)
X-squared = 0.21855, df = 1, p-value = 0.6401
alternative hypothesis: two.sided
95 percent confidence interval:
 -0.17565188  0.09565188
sample estimates:
prop 1 prop 2 
  0.27   0.31 

[1] "few-shot 3"

	2-sample test for equality of proportions with continuity correction

data:  c(19, 35) out of c(100, 100)
X-squared = 5.7078, df = 1, p-value = 0.01689
alternative hypothesis: two.sided
95 percent confidence interval:
 -0.29104265 -0.02895735
sample estimates:
prop 1 prop 2 
  0.19   0.35 

[1] "llama3: semantic vs syntactic"
[1] "zero-shot"

	2-sample test for equality of proportions with continuity correction

data:  c(47, 49) out of c(100, 100)
X-squared = 0.020032, df = 1, p-value = 0.8874
alternative hypothesis: two.sided
95 percent confidence interval:
 -0.1684517  0.1284517
sample estimates:
prop 1 prop 2 
  0.47   0.49 

[1] "few-shot 1"

	2-sample test for equality of proportions with continuity correction

data:  c(0, 37) out of c(100, 100)
X-squared = 42.978, df = 1, p-value = 5.536e-11
alternative hypothesis: two.sided
95 percent confidence interval:
 -0.4746279 -0.2653721
sample estimates:
prop 1 prop 2 
  0.00   0.37 

[1] "few-shot 2"

	2-sample test for equality of proportions with continuity correction

data:  c(2, 20) out of c(100, 100)
X-squared = 14.76, df = 1, p-value = 0.0001221
alternative hypothesis: two.sided
95 percent confidence interval:
 -0.27306178 -0.08693822
sample estimates:
prop 1 prop 2 
  0.02   0.20 

[1] "few-shot 3"

	2-sample test for equality of proportions with continuity correction

data:  c(22, 32) out of c(100, 100)
X-squared = 2.0548, df = 1, p-value = 0.1517
alternative hypothesis: two.sided
95 percent confidence interval:
 -0.23227411  0.03227411
sample estimates:
prop 1 prop 2 
  0.22   0.32 

[1] "mistral: semantic vs syntactic"
[1] "zero-shot"

	2-sample test for equality of proportions with continuity correction

data:  c(8, 11) out of c(100, 100)
X-squared = 0.23263, df = 1, p-value = 0.6296
alternative hypothesis: two.sided
95 percent confidence interval:
 -0.12116712  0.06116712
sample estimates:
prop 1 prop 2 
  0.08   0.11 

[1] "few-shot 1"

	2-sample test for equality of proportions with continuity correction

data:  c(26, 30) out of c(100, 100)
X-squared = 0.22321, df = 1, p-value = 0.6366
alternative hypothesis: two.sided
95 percent confidence interval:
 -0.17433033  0.09433033
sample estimates:
prop 1 prop 2 
  0.26   0.30 

[1] "few-shot 2"

	2-sample test for equality of proportions with continuity correction

data:  c(47, 48) out of c(100, 100)
X-squared = 0, df = 1, p-value = 1
alternative hypothesis: two.sided
95 percent confidence interval:
 -0.1584101  0.1384101
sample estimates:
prop 1 prop 2 
  0.47   0.48 

[1] "few-shot 3"

	2-sample test for equality of proportions without continuity correction

data:  c(17, 17) out of c(100, 100)
X-squared = 0, df = 1, p-value = 1
alternative hypothesis: two.sided
95 percent confidence interval:
 -0.1041182  0.1041182
sample estimates:
prop 1 prop 2 
  0.17   0.17 

