@@ -17,11 +17,15 @@ def update_model_info(model_info):
         if "https://huggingface.co/" in info["link"]:
             hf_model = info["link"].split("https://huggingface.co/")[-1]
             print(hf_model)
-            tokenizer = AutoTokenizer.from_pretrained(hf_model, trust_remote_code=True)
-            if tokenizer.chat_template is None:
+            try:
+                tokenizer = AutoTokenizer.from_pretrained(hf_model, trust_remote_code=True)
+
+                if tokenizer.chat_template is None:
+                    model_info[model]["direct_complete"] = True
+                else:
+                    model_info[model]["direct_complete"] = False
+            except:
                 model_info[model]["direct_complete"] = True
-            else:
-                model_info[model]["direct_complete"] = False
         else:
             model_info[model]["direct_complete"] = False
 
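The `try`/`except` added above treats any tokenizer that fails to load (gated or private repo, missing tokenizer files, remote-code errors) the same as one without a chat template. A minimal standalone sketch of that check, not part of the commit; the helper name is hypothetical and the commented model id is only an example:

```python
from transformers import AutoTokenizer

def needs_direct_complete(hf_model: str) -> bool:
    """True when the model has no chat template or its tokenizer
    cannot be loaded at all, mirroring the logic in the hunk above."""
    try:
        tokenizer = AutoTokenizer.from_pretrained(hf_model, trust_remote_code=True)
    except Exception:
        # Gated/private repos or missing tokenizer files land here.
        return True
    return tokenizer.chat_template is None

# Example usage (model id is illustrative):
# print(needs_direct_complete("bigcode/starcoder2-15b"))
```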
@@ -44,7 +48,7 @@ def get_results(tids):
             "moe": info["moe"],
             "size": info["size"],
             "act_param": info["act_param"],
-            "direct_complete": info["direct_complete"],
+            # "direct_complete": info["direct_complete"],
         }
 
     for model, info in model_info.items():
@@ -53,10 +57,16 @@ def get_results(tids):
         files = glob(f"results/{model}--bigcodebench-*.json")
         assert files, f"No files found for results/{model}--bigcodebench-*.json"
         for file in files:
-            _, suffix = os.path.basename(file).split("--bigcodebench-")
+            try:
+                _, suffix = os.path.basename(file).split("--bigcodebench-hard-")
+                with open("results/" + model + "--bigcodebench-hard-" + suffix, "r") as f:
+                    data = json.load(f)
+            except:
+                _, suffix = os.path.basename(file).split("--bigcodebench-")
+                with open("results/" + model + "--bigcodebench-" + suffix, "r") as f:
+                    data = json.load(f)
             status = []
-            with open("results/" + model + "--bigcodebench-" + suffix, "r") as f:
-                data = json.load(f)
+
             for key, value in data["eval"].items():
                 if key not in tids:
                     continue
@@ -76,22 +86,22 @@ def get_results(tids):
                 mode = "-cal"
 
             results[info["name"]][f"pass@1"][f"{task}{mode}"] = round(mean(status)*100, 1)
-            if not info["prompted"] or info["direct_complete"]:
+            if not info["prompted"]:  # or info["direct_complete"]:
                 results[info["name"]][f"pass@1"][f"{task}-cal"] = round(mean(status)*100, 1)
 
     for model, result in results.items():
         for task in ["complete"]:
             origin = result["pass@1"].pop(task)
-            assert origin, f"Missing original complete results for {model}"
+            # assert origin, f"Missing original complete results for {model}"
             calibrate = result["pass@1"].pop(f"{task}-cal")
             if calibrate:
-                if calibrate - origin > 1:
-                    results[model]["lazy"] = True
-                else:
-                    results[model]["lazy"] = False
+                # if calibrate - origin > 1:
+                #     results[model]["lazy"] = True
+                # else:
+                #     results[model]["lazy"] = False
                 results[model]["pass@1"][task] = calibrate
             else:
-                results[model]["lazy"] = False
+                # results[model]["lazy"] = False
                 results[model]["pass@1"][task] = origin
             calibrate_instruct = result["pass@1"].pop(f"instruct-cal")
             result["pass@1"]["instruct"] = calibrate_instruct
@@ -151,14 +161,44 @@ def read_task_perf(tids, task="complete"):
         task_perf = dict()
         model = model.replace("/", "--")
         try:
-            if info["prompted"] and not info["direct_complete"]:
-                files = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized-calibrated_eval_results.json")
-                if files:
-                    file = files[0]
-                else:
-                    file = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized_eval_results.json")[0]
-            else:
-                file = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized_eval_results.json")[0]
+            try:
+                try:
+                    if info["prompted"]:  # and not info["direct_complete"]:
+                        files = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized-calibrated_hard_eval_results.json")
+                        if files:
+                            file = files[0]
+                        else:
+                            file = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized_hard_eval_results.json")[0]
+                    else:
+                        file = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized_hard_eval_results.json")[0]
+                except:
+                    if info["prompted"]:
+                        files = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized-calibrated_eval_results.json")
+                        if files:
+                            file = files[0]
+                        else:
+                            file = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized_eval_results.json")[0]
+                    else:
+                        file = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized_eval_results.json")[0]
+            except:
+                try:
+                    if info["prompted"]:  # and not info["direct_complete"]:
+                        files = glob(f"results/{model}--bigcodebench-hard-{task}*-0-1-sanitized-calibrated_hard_eval_results.json")
+                        if files:
+                            file = files[0]
+                        else:
+                            file = glob(f"results/{model}--bigcodebench-hard-{task}*-0-1-sanitized_hard_eval_results.json")[0]
+                    else:
+                        file = glob(f"results/{model}--bigcodebench-hard-{task}*-0-1-sanitized_hard_eval_results.json")[0]
+                except:
+                    if info["prompted"]:
+                        files = glob(f"results/{model}--bigcodebench-hard-{task}*-0-1-sanitized-calibrated_eval_results.json")
+                        if files:
+                            file = files[0]
+                        else:
+                            file = glob(f"results/{model}--bigcodebench-hard-{task}*-0-1-sanitized_eval_results.json")[0]
+                    else:
+                        file = glob(f"results/{model}--bigcodebench-hard-{task}*-0-1-sanitized_eval_results.json")[0]
         except:
             continue
 
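The nested `try`/`except` blocks above probe four filename variants in priority order: the plain `--bigcodebench-` prefix before the `--bigcodebench-hard-` prefix, `_hard_eval_results` before `_eval_results`, and calibrated files before uncalibrated ones for prompted models. A flatter sketch of the same lookup, assuming the same on-disk naming; the helper name is hypothetical and this is not part of the commit:

```python
from glob import glob

def find_result_file(model, task, prompted):
    """Return the first result file matching the candidate patterns,
    or None if nothing matches (the caller then skips the model)."""
    prefixes = [f"results/{model}--bigcodebench-{task}",
                f"results/{model}--bigcodebench-hard-{task}"]
    suffixes = ["-calibrated_hard_eval_results.json",
                "_hard_eval_results.json",
                "-calibrated_eval_results.json",
                "_eval_results.json"]
    for prefix in prefixes:
        for suffix in suffixes:
            if "calibrated" in suffix and not prompted:
                continue  # base models only have uncalibrated results
            files = glob(f"{prefix}*-0-1-sanitized{suffix}")
            if files:
                return files[0]
    return None
```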
@@ -255,8 +295,9 @@ def get_elo_mle(df, SCALE=400, BASE=10, INIT_RATING=1000):
 def update_elo_rating(results, elo_dict):
     for model, info in model_info.items():
         if info["name"] not in elo_dict:
-            continue
-        results[info["name"]]["elo_mle"] = elo_dict[info["name"]]
+            results[info["name"]]["elo_mle"] = None
+        else:
+            results[info["name"]]["elo_mle"] = elo_dict[info["name"]]
     return results
 
 
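For reference, the same behaviour as the new `if`/`else` (models missing from the Elo dictionary get `None`) can be expressed with `dict.get`, which defaults to `None`. A sketch only, with `model_info` passed explicitly here rather than read from the module-level global as in the script:

```python
def update_elo_rating(results, elo_dict, model_info):
    # dict.get returns None for models absent from elo_dict,
    # matching the explicit if/else in the hunk above.
    for model, info in model_info.items():
        results[info["name"]]["elo_mle"] = elo_dict.get(info["name"])
    return results
```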
@@ -296,7 +337,7 @@ def get_solve_rate(data_dict, task="complete"):
 
 
 def get_hf_ds(results):
-    hf_dataset = {"model": [], "link": [], "moe": [], "size": [], "act_param": [], "type": [], "lazy": [], "direct_complete": [],
+    hf_dataset = {"model": [], "link": [], "moe": [], "size": [], "act_param": [], "type": [],  # "lazy": [], "direct_complete": [],
                   "complete": [], "instruct": [], "elo_mle": []}
 
     for model, result in results.items():
@@ -306,10 +347,10 @@ def get_hf_ds(results):
         hf_dataset["size"].append(result["size"])
         hf_dataset["act_param"].append(result["act_param"])
         hf_dataset["type"].append("🔶" if result["prompted"] else "🟢")
-        hf_dataset["lazy"].append(result["lazy"])
+        # hf_dataset["lazy"].append(result["lazy"])
         hf_dataset["complete"].append(result["pass@1"]["complete"])
         hf_dataset["instruct"].append(result["pass@1"]["instruct"])
-        hf_dataset["direct_complete"].append(result["direct_complete"])
+        # hf_dataset["direct_complete"].append(result["direct_complete"])
         hf_dataset["elo_mle"].append(result["elo_mle"])
 
     return Dataset.from_dict(hf_dataset)
@@ -335,19 +376,20 @@ def push_ds(ds, path, local=False):
 
 if __name__ == "__main__":
 
-    bcb_orig = load_dataset("bigcode/bigcodebench", split="v0.1.0_hf")
+    # bcb_orig = load_dataset("bigcode/bigcodebench", split="v0.1.0_hf")
     bcb_hard = load_dataset("bigcode/bigcodebench-hard", split="v0.1.0_hf")
-    model_info = update_model_info(model_info)
+    # model_info = update_model_info(model_info)
     bcb_config = {
-        "": bcb_orig,
+        # "": bcb_orig,
         "-hard": bcb_hard,
     }
     for suffix, bcb in bcb_config.items():
         results = get_results(bcb["task_id"])
         files = []
         complete_data, complete_files = read_task_perf(bcb["task_id"], "complete")
         instruct_data, instruct_files = read_task_perf(bcb["task_id"], "instruct")
-        assert len(model_info) == len(complete_data)
+        assert len(model_info) == len(complete_data), \
+            f"Missing results for {set([val['name'] for val in model_info.values()]) - set([model for model in complete_data.keys()])}"
         with open("task2domain.json", "r") as f:
             task2domain = json.load(f)
         domain_complete = get_domain_perf(complete_data, task2domain)
@@ -372,7 +414,10 @@ def push_ds(ds, path, local=False):
         }
         elo_ds = dict()
         for config, (task_level, no_tie) in elo_config.items():
-            battles = get_winner_df(complete_data, bcb["task_id"], "complete", task_level=task_level, no_tie=no_tie)
+            filter_complete_data = {model: task_perf for model, task_perf in complete_data.items() if model in instruct_data}
+            complete_battles = get_winner_df(filter_complete_data, bcb["task_id"], "complete", task_level=task_level, no_tie=no_tie)
+            instruct_battles = get_winner_df(instruct_data, bcb["task_id"], "instruct", task_level=task_level, no_tie=no_tie)
+            battles = pd.concat([complete_battles, instruct_battles])
             elo_mle_bootstrap = get_bootstrap_result(battles, get_elo_mle, 500)
             bootstrap_lu_median = elo_mle_bootstrap.median().reset_index().set_axis(["model", "Elo rating"], axis=1)
             bootstrap_lu_median["Elo rating"] = (bootstrap_lu_median["Elo rating"] + 0.5).astype(int)