cell_label,trace_id,mas_name,llm_name,benchmark_name,mast_all_zero,coordination_overhead_proxy_value,task_retry_rate_value,n_lines,n_distinct_role_tags,n_role_transitions,n_retry_hits ChatDev x ProgramDev,5,ChatDev,GPT-4o,ProgramDev,False,0.03119461183977313,0.0007089684509039348,2821,12,76,2 ChatDev x ProgramDev,0,ChatDev,GPT-4o,ProgramDev,True,0.01652285661832201,0.0,5447,12,78,0 ChatDev x ProgramDev,46,ChatDev,GPT-4o,ProgramDev,False,0.028409090909090908,0.032512626262626264,3168,12,78,103 ChatDev x ProgramDev,67,ChatDev,GPT-4o,ProgramDev,False,0.02891844997108155,0.03383458646616541,3458,13,87,117 ChatDev x ProgramDev,1,ChatDev,GPT-4o,ProgramDev,True,0.018491884117526197,0.0,4867,12,78,0 ChatDev x ProgramDev,89,ChatDev,GPT-4o,ProgramDev,False,0.022448979591836733,0.0,3920,12,76,0 ChatDev x ProgramDev,6,ChatDev,GPT-4o,ProgramDev,False,0.024888143176733782,0.0,3576,12,77,0 ChatDev x ProgramDev,91,ChatDev,GPT-4o,ProgramDev,False,0.023547880690737835,0.0020931449502878076,3822,12,78,8 ChatDev x ProgramDev,76,ChatDev,GPT-4o,ProgramDev,False,0.029343114371457153,0.0,2999,12,76,0 ChatDev x ProgramDev,2,ChatDev,GPT-4o,ProgramDev,True,0.020828511918537376,0.00277713492247165,4321,12,78,12 ChatDev x ProgramDev,12,ChatDev,GPT-4o,ProgramDev,False,0.01822231220894918,0.0,4939,12,78,0 ChatDev x ProgramDev,98,ChatDev,GPT-4o,ProgramDev,False,0.027177269919703522,0.028412600370599134,3238,12,76,92 ChatDev x ProgramDev,86,ChatDev,GPT-4o,ProgramDev,False,0.027363940407418668,0.0012161751292186075,3289,12,78,4 ChatDev x ProgramDev,8,ChatDev,GPT-4o,ProgramDev,False,0.016378358483621642,0.0,5434,12,77,0 ChatDev x ProgramDev,20,ChatDev,GPT-4o,ProgramDev,True,0.02027027027027027,0.00472972972972973,4440,12,78,21 ChatDev x ProgramDev,38,ChatDev,GPT-4o,ProgramDev,False,0.027838598686268378,0.02658742571160463,3197,12,77,85 ChatDev x ProgramDev,41,ChatDev,GPT-4o,ProgramDev,False,0.02788150360965895,0.008215085884988798,4017,12,100,33 ChatDev x ProgramDev,48,ChatDev,GPT-4o,ProgramDev,False,0.02945069490403706,0.0026472534745201853,3022,12,77,8 ChatDev x ProgramDev,66,ChatDev,GPT-4o,ProgramDev,False,0.031537916371367825,0.012756909992912827,2822,12,77,36 ChatDev x ProgramDev,54,ChatDev,GPT-4o,ProgramDev,False,0.023156899810964082,0.003544423440453686,4232,13,85,15 ChatDev x ProgramDev,31,ChatDev,GPT-4o,ProgramDev,False,0.0354695687222894,0.03305118903667876,2481,12,76,82 ChatDev x ProgramDev,87,ChatDev,GPT-4o,ProgramDev,False,0.02538787023977433,0.02679830747531735,3545,12,78,95 ChatDev x ProgramDev,92,ChatDev,GPT-4o,ProgramDev,False,0.031496062992125984,0.03221188260558339,2794,12,76,90 ChatDev x ProgramDev,55,ChatDev,GPT-4o,ProgramDev,False,0.022667971298108283,0.01451402478799739,6132,12,127,89 ChatDev x ProgramDev,24,ChatDev,GPT-4o,ProgramDev,True,0.017928286852589643,0.00099601593625498,5020,12,78,5 ChatDev x ProgramDev,58,ChatDev,GPT-4o,ProgramDev,False,0.026886648334860985,0.0,3273,12,76,0 ChatDev x ProgramDev,70,ChatDev,GPT-4o,ProgramDev,False,0.030229746070133012,0.03688029020556227,3308,13,87,122 ChatDev x ProgramDev,39,ChatDev,GPT-4o,ProgramDev,False,0.024484181568088032,0.029436038514442917,3635,12,77,107 ChatDev x ProgramDev,88,ChatDev,GPT-4o,ProgramDev,False,0.0247732802477328,0.0,4521,12,100,0 ChatDev x ProgramDev,26,ChatDev,GPT-4o,ProgramDev,True,0.014258555133079848,0.0,6312,12,78,0 MetaGPT x ProgramDev,76,MetaGPT,GPT-4o,ProgramDev,False,0.0,0.006578947368421052,152,0,0,1 MetaGPT x ProgramDev,27,MetaGPT,GPT-4o,ProgramDev,False,0.0,0.0,166,0,0,0 MetaGPT x ProgramDev,1,MetaGPT,GPT-4o,ProgramDev,True,0.0,0.008620689655172414,232,0,0,2 MetaGPT x ProgramDev,52,MetaGPT,GPT-4o,ProgramDev,False,0.0,0.0,115,0,0,0 MetaGPT x ProgramDev,98,MetaGPT,GPT-4o,ProgramDev,False,0.0,0.007246376811594203,138,0,0,1 MetaGPT x ProgramDev,4,MetaGPT,GPT-4o,ProgramDev,True,0.0,0.0234375,128,0,0,3 MetaGPT x ProgramDev,19,MetaGPT,GPT-4o,ProgramDev,False,0.0,0.0,316,0,0,0 MetaGPT x ProgramDev,83,MetaGPT,GPT-4o,ProgramDev,False,0.0,0.05263157894736842,190,0,0,10 MetaGPT x ProgramDev,23,MetaGPT,GPT-4o,ProgramDev,True,0.0,0.0,186,0,0,0 MetaGPT x ProgramDev,9,MetaGPT,GPT-4o,ProgramDev,True,0.0,0.0,197,0,0,0 MetaGPT x ProgramDev,34,MetaGPT,GPT-4o,ProgramDev,True,0.0,0.0,151,0,0,0 MetaGPT x ProgramDev,12,MetaGPT,GPT-4o,ProgramDev,False,0.0,0.0,206,0,0,0 MetaGPT x ProgramDev,29,MetaGPT,GPT-4o,ProgramDev,True,0.0,0.0,159,0,0,0 MetaGPT x ProgramDev,61,MetaGPT,GPT-4o,ProgramDev,False,0.0,0.01282051282051282,156,0,0,2 MetaGPT x ProgramDev,92,MetaGPT,GPT-4o,ProgramDev,False,0.0,0.013513513513513514,148,0,0,2 MetaGPT x ProgramDev,62,MetaGPT,GPT-4o,ProgramDev,False,0.0,0.05921052631578947,152,0,0,9 MetaGPT x ProgramDev,17,MetaGPT,GPT-4o,ProgramDev,False,0.0,0.0058823529411764705,170,0,0,1 MetaGPT x ProgramDev,97,MetaGPT,GPT-4o,ProgramDev,False,0.0,0.015306122448979591,196,0,0,3 MetaGPT x ProgramDev,26,MetaGPT,GPT-4o,ProgramDev,True,0.0,0.0,218,0,0,0 MetaGPT x ProgramDev,85,MetaGPT,GPT-4o,ProgramDev,False,0.0,0.003663003663003663,273,0,0,1 MetaGPT x ProgramDev,20,MetaGPT,GPT-4o,ProgramDev,True,0.0,0.0,185,0,0,0 MetaGPT x ProgramDev,21,MetaGPT,GPT-4o,ProgramDev,False,0.0,0.0,132,0,0,0 MetaGPT x ProgramDev,11,MetaGPT,GPT-4o,ProgramDev,False,0.0,0.0,145,0,0,0 MetaGPT x ProgramDev,72,MetaGPT,GPT-4o,ProgramDev,False,0.0,0.007575757575757576,264,0,0,2 MetaGPT x ProgramDev,18,MetaGPT,GPT-4o,ProgramDev,False,0.0,0.0,211,0,0,0 MetaGPT x ProgramDev,50,MetaGPT,GPT-4o,ProgramDev,True,0.0,0.03164556962025317,158,0,0,5 MetaGPT x ProgramDev,65,MetaGPT,GPT-4o,ProgramDev,False,0.0,0.018292682926829267,164,0,0,3 MetaGPT x ProgramDev,57,MetaGPT,GPT-4o,ProgramDev,False,0.0,0.013422818791946308,149,0,0,2 MetaGPT x ProgramDev,90,MetaGPT,GPT-4o,ProgramDev,True,0.0,0.0,187,0,0,0 MetaGPT x ProgramDev,15,MetaGPT,GPT-4o,ProgramDev,False,0.0,0.0,201,0,0,0 Magentic x GAIA,5,Magentic,GPT-4o,GAIA,False,0.0,0.006257822277847309,799,0,0,5 Magentic x GAIA,139,Magentic,GPT-4o,GAIA,False,0.0,0.0013324450366422385,1501,0,0,2 Magentic x GAIA,119,Magentic,GPT-4o,GAIA,False,0.0,0.0013722126929674098,2915,0,0,4 Magentic x GAIA,47,Magentic,GPT-4o,GAIA,False,0.0,0.0,894,0,0,0 Magentic x GAIA,30,Magentic,GPT-4o,GAIA,False,0.0,0.0,2720,0,0,0 Magentic x GAIA,142,Magentic,GPT-4o,GAIA,False,0.0,0.030303030303030304,1188,0,0,36 Magentic x GAIA,96,Magentic,GPT-4o,GAIA,False,0.0,0.0025396825396825397,3150,0,0,8 Magentic x GAIA,101,Magentic,GPT-4o,GAIA,True,0.0,0.0,441,0,0,0 Magentic x GAIA,158,Magentic,GPT-4o,GAIA,False,0.0,0.009283819628647215,1508,0,0,14 Magentic x GAIA,122,Magentic,GPT-4o,GAIA,False,0.0,0.0038535645472061657,519,0,0,2 Magentic x GAIA,65,Magentic,GPT-4o,GAIA,False,0.0,0.0,359,0,0,0 Magentic x GAIA,59,Magentic,GPT-4o,GAIA,False,0.0,0.0,279,0,0,0 Magentic x GAIA,100,Magentic,GPT-4o,GAIA,False,0.0,0.007858546168958742,509,0,0,4 Magentic x GAIA,129,Magentic,GPT-4o,GAIA,False,0.0,0.0,483,0,0,0 Magentic x GAIA,50,Magentic,GPT-4o,GAIA,True,0.0,0.0,1218,0,0,0 Magentic x GAIA,111,Magentic,GPT-4o,GAIA,False,0.0,0.0,545,0,0,0 Magentic x GAIA,11,Magentic,GPT-4o,GAIA,False,0.0,0.0038535645472061657,519,0,0,2 Magentic x GAIA,68,Magentic,GPT-4o,GAIA,False,0.0,0.001447178002894356,2073,0,0,3 Magentic x GAIA,12,Magentic,GPT-4o,GAIA,False,0.0,0.0109375,640,0,0,7 Magentic x GAIA,144,Magentic,GPT-4o,GAIA,False,0.0,0.017355371900826446,1210,0,0,21 Magentic x GAIA,81,Magentic,GPT-4o,GAIA,False,0.0,0.004106776180698152,1461,0,0,6 Magentic x GAIA,102,Magentic,GPT-4o,GAIA,True,0.0,0.0,455,0,0,0 Magentic x GAIA,10,Magentic,GPT-4o,GAIA,False,0.0,0.00040192926045016077,2488,0,0,1 Magentic x GAIA,84,Magentic,GPT-4o,GAIA,False,0.0,0.0,482,0,0,0 Magentic x GAIA,149,Magentic,GPT-4o,GAIA,True,0.0,0.0,674,0,0,0 Magentic x GAIA,137,Magentic,GPT-4o,GAIA,False,0.0,0.0,772,0,0,0 Magentic x GAIA,8,Magentic,GPT-4o,GAIA,False,0.0,0.0005830903790087463,3430,0,0,2 Magentic x GAIA,105,Magentic,GPT-4o,GAIA,False,0.0,0.00040192926045016077,2488,0,0,1 Magentic x GAIA,32,Magentic,GPT-4o,GAIA,False,0.0,0.0,724,0,0,0 Magentic x GAIA,159,Magentic,GPT-4o,GAIA,False,0.0,0.005979073243647235,669,0,0,4 AG2 x GSM,151,AG2,Claude,GSM,True,0.0,0.0,1,0,0,0 AG2 x GSM,123,AG2,Claude,GSM,False,0.0,0.0,1,0,0,0 AG2 x GSM,17,AG2,GPT-4o,GSM,False,0.0,0.0,89,0,0,0 AG2 x GSM,13,AG2,GPT-4o,GSM,True,0.0,0.0,105,0,0,0 AG2 x GSM,92,AG2,Claude,GSM,False,0.0,1.0,1,0,0,1 AG2 x GSM,187,AG2,Claude,GSM,False,0.0,0.0,1,0,0,0 AG2 x GSM,1,AG2,GPT-4o,GSM,True,0.0,0.0,87,0,0,0 AG2 x GSM,75,AG2,Claude,GSM,False,0.0,0.0,1,0,0,0 AG2 x GSM,163,AG2,Claude,GSM,True,0.0,0.0,1,0,0,0 AG2 x GSM,104,AG2,Claude,GSM,False,0.0,0.0,1,0,0,0 AG2 x GSM,120,AG2,Claude,GSM,False,0.0,0.0,1,0,0,0 AG2 x GSM,34,AG2,Claude,GSM,True,0.0,0.0,1,0,0,0 AG2 x GSM,148,AG2,Claude,GSM,False,0.0,0.0,1,0,0,0 AG2 x GSM,93,AG2,Claude,GSM,False,0.0,0.0,1,0,0,0 AG2 x GSM,42,AG2,Claude,GSM,False,0.0,0.0,1,0,0,0 AG2 x GSM,142,AG2,Claude,GSM,False,0.0,0.0,1,0,0,0 AG2 x GSM,22,AG2,GPT-4o,GSM,True,0.0,0.0,88,0,0,0 AG2 x GSM,19,AG2,GPT-4o,GSM,False,0.0,0.0,71,0,0,0 AG2 x GSM,132,AG2,Claude,GSM,False,0.0,0.0,1,0,0,0 AG2 x GSM,179,AG2,Claude,GSM,False,0.0,0.0,1,0,0,0 AG2 x GSM,72,AG2,Claude,GSM,False,0.0,0.0,1,0,0,0 AG2 x GSM,118,AG2,Claude,GSM,False,0.0,0.0,1,0,0,0 AG2 x GSM,122,AG2,Claude,GSM,False,0.0,1.0,1,0,0,1 AG2 x GSM,105,AG2,Claude,GSM,False,0.0,0.0,1,0,0,0 AG2 x GSM,4,AG2,GPT-4o,GSM,True,0.0,0.011111111111111112,90,0,0,1 AG2 x GSM,129,AG2,Claude,GSM,False,0.0,0.0,1,0,0,0 AG2 x GSM,169,AG2,Claude,GSM,False,0.0,0.0,1,0,0,0 AG2 x GSM,164,AG2,Claude,GSM,False,0.0,0.0,1,0,0,0 AG2 x GSM,98,AG2,Claude,GSM,False,0.0,0.0,1,0,0,0 AG2 x GSM,30,AG2,Claude,GSM,False,0.0,0.0,1,0,0,0 AppWorld x Test-C,17,AppWorld,GPT-4o,Test-C,False,0.0,0.02656042496679947,753,0,0,20 AppWorld x Test-C,8,AppWorld,GPT-4o,Test-C,False,0.0,0.009523809523809525,840,0,0,8 AppWorld x Test-C,23,AppWorld,GPT-4o,Test-C,True,0.0,0.016813048933500628,3985,0,0,67 AppWorld x Test-C,12,AppWorld,GPT-4o,Test-C,False,0.0,0.010940919037199124,914,0,0,10 AppWorld x Test-C,11,AppWorld,GPT-4o,Test-C,False,0.0,0.0005162622612287042,1937,0,0,1 AppWorld x Test-C,5,AppWorld,GPT-4o,Test-C,False,0.0,0.0018050541516245488,3324,0,0,6 AppWorld x Test-C,0,AppWorld,GPT-4o,Test-C,True,0.0,0.0,1108,0,0,0 AppWorld x Test-C,14,AppWorld,GPT-4o,Test-C,True,0.0,0.001182033096926714,1692,0,0,2 AppWorld x Test-C,2,AppWorld,GPT-4o,Test-C,True,0.0,0.00832639467110741,1201,0,0,10 AppWorld x Test-C,7,AppWorld,GPT-4o,Test-C,False,0.0,0.017719568567026195,1298,0,0,23 AppWorld x Test-C,20,AppWorld,GPT-4o,Test-C,True,0.0,0.007760141093474427,2835,0,0,22 AppWorld x Test-C,9,AppWorld,GPT-4o,Test-C,True,0.0,0.02656042496679947,753,0,0,20 AppWorld x Test-C,19,AppWorld,GPT-4o,Test-C,False,0.0,0.018436578171091445,1356,0,0,25 AppWorld x Test-C,3,AppWorld,GPT-4o,Test-C,False,0.0,0.013258897418004187,1433,0,0,19 AppWorld x Test-C,13,AppWorld,GPT-4o,Test-C,True,0.0,0.0,1179,0,0,0 AppWorld x Test-C,25,AppWorld,GPT-4o,Test-C,True,0.0,0.009626955475330927,831,0,0,8 AppWorld x Test-C,6,AppWorld,GPT-4o,Test-C,False,0.0,0.006149116064565719,1301,0,0,8 AppWorld x Test-C,15,AppWorld,GPT-4o,Test-C,False,0.0,0.009937888198757764,805,0,0,8 AppWorld x Test-C,24,AppWorld,GPT-4o,Test-C,True,0.0,0.012345679012345678,810,0,0,10 AppWorld x Test-C,21,AppWorld,GPT-4o,Test-C,False,0.0,0.005256241787122208,761,0,0,4 AppWorld x Test-C,16,AppWorld,GPT-4o,Test-C,True,0.0,0.019902518277822908,2462,0,0,49 AppWorld x Test-C,26,AppWorld,GPT-4o,Test-C,True,0.0,0.02099236641221374,1048,0,0,22 AppWorld x Test-C,1,AppWorld,GPT-4o,Test-C,True,0.0,0.0,927,0,0,0 AppWorld x Test-C,18,AppWorld,GPT-4o,Test-C,False,0.0,0.029045643153526972,482,0,0,14 AppWorld x Test-C,10,AppWorld,GPT-4o,Test-C,False,0.0,0.02560819462227913,781,0,0,20 AppWorld x Test-C,4,AppWorld,GPT-4o,Test-C,True,0.0,0.019455252918287938,1285,0,0,25 AppWorld x Test-C,29,AppWorld,GPT-4o,Test-C,True,0.0,0.022533800701051578,1997,0,0,45 AppWorld x Test-C,22,AppWorld,GPT-4o,Test-C,True,0.0,0.007761966364812419,773,0,0,6 AppWorld x Test-C,27,AppWorld,GPT-4o,Test-C,False,0.0,0.014358974358974359,975,0,0,14 AppWorld x Test-C,28,AppWorld,GPT-4o,Test-C,True,0.0,0.01918158567774936,782,0,0,15