
Commit e61bda0

fix: compute hard subset only
1 parent 80913b0

File tree

1 file changed (+78, -33 lines)

analysis/get_results.py

Lines changed: 78 additions & 33 deletions
@@ -17,11 +17,15 @@ def update_model_info(model_info):
         if "https://huggingface.co/" in info["link"]:
             hf_model = info["link"].split("https://huggingface.co/")[-1]
             print(hf_model)
-            tokenizer = AutoTokenizer.from_pretrained(hf_model, trust_remote_code=True)
-            if tokenizer.chat_template is None:
+            try:
+                tokenizer = AutoTokenizer.from_pretrained(hf_model, trust_remote_code=True)
+
+                if tokenizer.chat_template is None:
+                    model_info[model]["direct_complete"] = True
+                else:
+                    model_info[model]["direct_complete"] = False
+            except:
                 model_info[model]["direct_complete"] = True
-            else:
-                model_info[model]["direct_complete"] = False
         else:
             model_info[model]["direct_complete"] = False

@@ -44,7 +48,7 @@ def get_results(tids):
             "moe": info["moe"],
             "size": info["size"],
             "act_param": info["act_param"],
-            "direct_complete": info["direct_complete"],
+            # "direct_complete": info["direct_complete"],
         }
 
     for model, info in model_info.items():
@@ -53,10 +57,16 @@ def get_results(tids):
         files = glob(f"results/{model}--bigcodebench-*.json")
         assert files, f"No files found for results/{model}--bigcodebench-*.json"
         for file in files:
-            _, suffix = os.path.basename(file).split("--bigcodebench-")
+            try:
+                _, suffix = os.path.basename(file).split("--bigcodebench-hard-")
+                with open("results/"+model+"--bigcodebench-hard-"+suffix, "r") as f:
+                    data = json.load(f)
+            except:
+                _, suffix = os.path.basename(file).split("--bigcodebench-")
+                with open("results/"+model+"--bigcodebench-"+suffix, "r") as f:
+                    data = json.load(f)
             status = []
-            with open("results/"+model+"--bigcodebench-"+suffix, "r") as f:
-                data = json.load(f)
+
             for key, value in data["eval"].items():
                 if key not in tids:
                     continue
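
Editorial note (not part of the commit): the dispatch above works because str.split returns a single piece when the marker is absent, so the two-name unpacking raises ValueError and the bare except falls back to the full-set filename. A minimal sketch with hypothetical toy filenames:

    name_hard = "model--bigcodebench-hard-complete.json"      # toy names for illustration
    name_full = "model--bigcodebench-complete.json"

    _, suffix = name_hard.split("--bigcodebench-hard-")       # suffix == "complete.json"
    try:
        _, suffix = name_full.split("--bigcodebench-hard-")   # single piece -> ValueError
    except ValueError:
        _, suffix = name_full.split("--bigcodebench-")        # suffix == "complete.json"
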
@@ -76,22 +86,22 @@ def get_results(tids):
                 mode = "-cal"
 
             results[info["name"]][f"pass@1"][f"{task}{mode}"] = round(mean(status)*100,1)
-            if not info["prompted"] or info["direct_complete"]:
+            if not info["prompted"]:# or info["direct_complete"]:
                 results[info["name"]][f"pass@1"][f"{task}-cal"] = round(mean(status)*100,1)
 
     for model, result in results.items():
         for task in ["complete"]:
             origin = result["pass@1"].pop(task)
-            assert origin, f"Missing original complete results for {model}"
+            # assert origin, f"Missing original complete results for {model}"
             calibrate = result["pass@1"].pop(f"{task}-cal")
             if calibrate:
-                if calibrate - origin > 1:
-                    results[model]["lazy"] = True
-                else:
-                    results[model]["lazy"] = False
+                # if calibrate - origin > 1:
+                # results[model]["lazy"] = True
+                # else:
+                # results[model]["lazy"] = False
                 results[model]["pass@1"][task] = calibrate
             else:
-                results[model]["lazy"] = False
+                # results[model]["lazy"] = False
                 results[model]["pass@1"][task] = origin
             calibrate_instruct = result["pass@1"].pop(f"instruct-cal")
             result["pass@1"]["instruct"] = calibrate_instruct
@@ -151,14 +161,44 @@ def read_task_perf(tids, task="complete"):
         task_perf = dict()
         model = model.replace("/", "--")
         try:
-            if info["prompted"] and not info["direct_complete"]:
-                files = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized-calibrated_eval_results.json")
-                if files:
-                    file = files[0]
-                else:
-                    file = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized_eval_results.json")[0]
-            else:
-                file = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized_eval_results.json")[0]
+            try:
+                try:
+                    if info["prompted"]:# and not info["direct_complete"]:
+                        files = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized-calibrated_hard_eval_results.json")
+                        if files:
+                            file = files[0]
+                        else:
+                            file = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized_hard_eval_results.json")[0]
+                    else:
+                        file = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized_hard_eval_results.json")[0]
+                except:
+                    if info["prompted"]:
+                        files = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized-calibrated_eval_results.json")
+                        if files:
+                            file = files[0]
+                        else:
+                            file = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized_eval_results.json")[0]
+                    else:
+                        file = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized_eval_results.json")[0]
+            except:
+                try:
+                    if info["prompted"]:# and not info["direct_complete"]:
+                        files = glob(f"results/{model}--bigcodebench-hard-{task}*-0-1-sanitized-calibrated_hard_eval_results.json")
+                        if files:
+                            file = files[0]
+                        else:
+                            file = glob(f"results/{model}--bigcodebench-hard-{task}*-0-1-sanitized_hard_eval_results.json")[0]
+                    else:
+                        file = glob(f"results/{model}--bigcodebench-hard-{task}*-0-1-sanitized_hard_eval_results.json")[0]
+                except:
+                    if info["prompted"]:
+                        files = glob(f"results/{model}--bigcodebench-hard-{task}*-0-1-sanitized-calibrated_eval_results.json")
+                        if files:
+                            file = files[0]
+                        else:
+                            file = glob(f"results/{model}--bigcodebench-hard-{task}*-0-1-sanitized_eval_results.json")[0]
+                    else:
+                        file = glob(f"results/{model}--bigcodebench-hard-{task}*-0-1-sanitized_eval_results.json")[0]
         except:
             continue
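
Editorial sketch (not the commit's code): the nested try/except above amounts to a fixed search order for evaluation files: plain-named result files before "-hard"-named ones, hard-subset eval results before full-set ones, and calibrated files before uncalibrated ones for prompted models. A loop over candidate glob patterns expresses the same priority; the helper name resolve_result_file is hypothetical.

    from glob import glob

    def resolve_result_file(model, task, prompted):
        # Candidate patterns in the same priority order as the nested try/except.
        patterns = []
        for prefix in (f"results/{model}--bigcodebench-{task}",
                       f"results/{model}--bigcodebench-hard-{task}"):
            for eval_suffix in ("_hard_eval_results.json", "_eval_results.json"):
                if prompted:
                    patterns.append(f"{prefix}*-0-1-sanitized-calibrated{eval_suffix}")
                patterns.append(f"{prefix}*-0-1-sanitized{eval_suffix}")
        for pattern in patterns:
            files = glob(pattern)
            if files:
                return files[0]
        return None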

@@ -255,8 +295,9 @@ def get_elo_mle(df, SCALE=400, BASE=10, INIT_RATING=1000):
 def update_elo_rating(results, elo_dict):
     for model, info in model_info.items():
         if info["name"] not in elo_dict:
-            continue
-        results[info["name"]]["elo_mle"] = elo_dict[info["name"]]
+            results[info["name"]]["elo_mle"] = None
+        else:
+            results[info["name"]]["elo_mle"] = elo_dict[info["name"]]
     return results
 
 
@@ -296,7 +337,7 @@ def get_solve_rate(data_dict, task="complete"):
 
 
 def get_hf_ds(results):
-    hf_dataset = {"model": [], "link": [], "moe": [], "size": [], "act_param": [], "type": [], "lazy": [], "direct_complete": [],
+    hf_dataset = {"model": [], "link": [], "moe": [], "size": [], "act_param": [], "type": [], #"lazy": [],# "direct_complete": [],
                   "complete": [], "instruct": [], "elo_mle": []}
 
     for model, result in results.items():
@@ -306,10 +347,10 @@ def get_hf_ds(results):
         hf_dataset["size"].append(result["size"])
         hf_dataset["act_param"].append(result["act_param"])
         hf_dataset["type"].append("🔶" if result["prompted"] else "🟢")
-        hf_dataset["lazy"].append(result["lazy"])
+        # hf_dataset["lazy"].append(result["lazy"])
         hf_dataset["complete"].append(result["pass@1"]["complete"])
         hf_dataset["instruct"].append(result["pass@1"]["instruct"])
-        hf_dataset["direct_complete"].append(result["direct_complete"])
+        # hf_dataset["direct_complete"].append(result["direct_complete"])
         hf_dataset["elo_mle"].append(result["elo_mle"])
 
     return Dataset.from_dict(hf_dataset)
@@ -335,19 +376,20 @@ def push_ds(ds, path, local=False):
 
 if __name__ == "__main__":
 
-    bcb_orig = load_dataset("bigcode/bigcodebench", split="v0.1.0_hf")
+    # bcb_orig = load_dataset("bigcode/bigcodebench", split="v0.1.0_hf")
     bcb_hard = load_dataset("bigcode/bigcodebench-hard", split="v0.1.0_hf")
-    model_info = update_model_info(model_info)
+    # model_info = update_model_info(model_info)
     bcb_config = {
-        "": bcb_orig,
+        # "": bcb_orig,
         "-hard": bcb_hard,
     }
     for suffix, bcb in bcb_config.items():
         results = get_results(bcb["task_id"])
         files = []
         complete_data, complete_files = read_task_perf(bcb["task_id"], "complete")
         instruct_data, instruct_files = read_task_perf(bcb["task_id"], "instruct")
-        assert len(model_info) == len(complete_data)
+        assert len(model_info) == len(complete_data),\
+            f"Missing results for {set([val['name'] for val in model_info.values()]) - set([model for model in complete_data.keys()])}"
         with open("task2domain.json", "r") as f:
             task2domain = json.load(f)
         domain_complete = get_domain_perf(complete_data, task2domain)
@@ -372,7 +414,10 @@ def push_ds(ds, path, local=False):
         }
         elo_ds = dict()
         for config, (task_level, no_tie) in elo_config.items():
-            battles = get_winner_df(complete_data, bcb["task_id"], "complete", task_level=task_level, no_tie=no_tie)
+            filter_complete_data = {model: task_perf for model, task_perf in complete_data.items() if model in instruct_data}
+            complete_battles = get_winner_df(filter_complete_data, bcb["task_id"], "complete", task_level=task_level, no_tie=no_tie)
+            instruct_battles = get_winner_df(instruct_data, bcb["task_id"], "instruct", task_level=task_level, no_tie=no_tie)
+            battles = pd.concat([complete_battles, instruct_battles])
             elo_mle_bootstrap = get_bootstrap_result(battles, get_elo_mle, 500)
             bootstrap_lu_median = elo_mle_bootstrap.median().reset_index().set_axis(["model", "Elo rating"], axis=1)
             bootstrap_lu_median["Elo rating"] = (bootstrap_lu_median["Elo rating"] + 0.5).astype(int)
