
Commit 8fa95f8

feat(evaluate): do calibration by default
1 parent: 06437ab

2 files changed (+27 −25 lines)


ADVANCED_USAGE.md

Lines changed: 1 addition & 0 deletions
```diff
@@ -69,6 +69,7 @@ Below are all the arguments for `bigcodebench.evaluate` for the remote evaluatio
 - `--local_execute`: Whether to execute the samples locally, default to `False`
 - `--remote_execute_api`: The API endpoint for remote execution, default to `https://bigcode-bigcodebench-evaluator.hf.space/`, you can also use your own Gradio API endpoint by cloning the [bigcodebench-evaluator](https://huggingface.co/spaces/bigcode/bigcodebench-evaluator) repo and check `Use via API` at the bottom of the HF space page.
 - `--pass_k`: The `k` in `Pass@k`, default to `[1, 5, 10]`, e.g. `--pass_k 1,5,10` will evaluate `Pass@1`, `Pass@5` and `Pass@10`
+- `--calibrated`: Whether to use the calibrated samples, default to `True`
 - `--save_pass_rate`: Whether to save the pass rate to a file, default to `True`
 - `--parallel`: The number of parallel processes, default to `-1`, e.g. `--parallel 10` will evaluate 10 samples in parallel
 - `--min_time_limit`: The minimum time limit for the execution, default to `1`, e.g. `--min_time_limit 10` will evaluate the samples with at least 10 seconds
```
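With this commit, calibration is governed by the `--calibrated` flag (on by default) rather than by whether the samples path contains `sanitized_calibrated`. A minimal sketch of overriding the default programmatically, assuming the CLI flags map onto keyword arguments of `bigcodebench.evaluate.evaluate` and that `split`, `subset`, and `samples` are accepted the same way (the path and the split/subset values below are placeholders, not taken from this commit):

```python
# Hypothetical invocation; the samples path and split/subset values are placeholders.
from bigcodebench.evaluate import evaluate

evaluate(
    split="complete",                                    # assumed: split name as exposed by the CLI
    subset="full",                                       # assumed: subset name as exposed by the CLI
    samples="model--bigcodebench-complete--samples.jsonl",  # placeholder path to the generated samples
    calibrated=False,                                    # override the new default of True
    pass_k="1,5,10",                                     # evaluates Pass@1, Pass@5 and Pass@10
    save_pass_rate=True,
)
```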

bigcodebench/evaluate.py

Lines changed: 26 additions & 25 deletions
```diff
@@ -122,6 +122,7 @@ def evaluate(
     remote_execute_api: str = "https://bigcode-bigcodebench-evaluator.hf.space/",
     pass_k: str = "1,5,10",
     save_pass_rate: bool = True,
+    calibrated: bool = True,
     parallel: int = -1,
     min_time_limit: float = 1,
     max_as_limit: int = 30*1024,
@@ -245,7 +246,7 @@ def evaluate(
                 if "solution" in sample
                 else problems[task_id]["complete_prompt"] + sample["completion"]
             )
-            if "sanitized_calibrated" in samples:
+            if calibrated:
                 solution = problems[task_id]["code_prompt"] + "\n    pass\n" + solution
             remainings.add(sample["_identifier"])
             args = (
@@ -266,15 +267,15 @@ def evaluate(
         assert n_samples == len(remainings), "Missing problems in unfinished"
         assert len(completion_id) == len(problems), "Missing problems in samples"

-        def stucking_checker():
-            while remainings:
-                last_size = len(remainings)
-                time.sleep(240)
-                if last_size != len(remainings) or len(remainings) == 0:
-                    continue
-                # Potential stucking
-                warn("No samples had finished testing in the last 240s")
-                warn(f"{len(remainings)} samples to be tested: {remainings}")
+        def stucking_checker():
+            while remainings:
+                last_size = len(remainings)
+                time.sleep(240)
+                if last_size != len(remainings) or len(remainings) == 0:
+                    continue
+                # Potential stucking
+                warn("No samples had finished testing in the last 240s")
+                warn(f"{len(remainings)} samples to be tested: {remainings}")

         threading.Thread(target=stucking_checker).start()

@@ -283,20 +284,20 @@ def stucking_checker():
             remainings.remove(result["_identifier"])
             eval_results[result["task_id"]].append(result)

-        # sort the results for each problem by completion_id
-        for task_id, task_results in eval_results.items():
-            task_results.sort(key=lambda x: x["completion_id"])
-            results["eval"][task_id] = []
-            for res in task_results:
-                stat, details = res["base"]
-                results["eval"][task_id].append(
-                    {
-                        "task_id": task_id,
-                        "solution": res["solution"],
-                        "status": stat,
-                        "details": details,
-                    }
-                )
+        # sort the results for each problem by completion_id
+        for task_id, task_results in eval_results.items():
+            task_results.sort(key=lambda x: x["completion_id"])
+            results["eval"][task_id] = []
+            for res in task_results:
+                stat, details = res["base"]
+                results["eval"][task_id].append(
+                    {
+                        "task_id": task_id,
+                        "solution": res["solution"],
+                        "status": stat,
+                        "details": details,
+                    }
+                )

    # Calculate pass@k.
    total = np.array([len(r) for k, r in results["eval"].items() if k in problems])
@@ -319,7 +320,7 @@ def stucking_checker():
    pass_at_k["model"] = os.path.basename(samples).split("--bigcodebench-")[0]
    pass_at_k["split"] = split
    pass_at_k["subset"] = subset
-    pass_at_k["calibrated"] = "sanitized_calibrated" in samples
+    pass_at_k["calibrated"] = calibrated
    pass_at_k["gt_pass_rate"] = gt_pass_rate
    pass_at_k["failed_tasks"] = failed_tasks
```
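The substantive change is that calibration is now an explicit argument rather than being inferred from the samples filename (`"sanitized_calibrated" in samples`); when enabled, the task's `code_prompt` plus an indented `pass` is prepended to each solution before execution. A minimal, self-contained sketch of that assembly step, with `problems` and `sample` as hypothetical stand-ins for a loaded BigCodeBench task and one generated sample record:

```python
# Sketch of the solution-assembly step changed by this commit.
# `problems` and `sample` are hypothetical stand-ins, not real dataset entries.
problems = {
    "BigCodeBench/0": {
        "complete_prompt": 'def task_func():\n    """Return 42."""\n',
        "code_prompt": "def task_func():\n",
    }
}
sample = {"task_id": "BigCodeBench/0", "solution": "    return 42\n"}

task_id = sample["task_id"]
calibrated = True  # the new default introduced by this commit

# Use the provided solution if present, otherwise stitch prompt + completion.
solution = (
    sample["solution"]
    if "solution" in sample
    else problems[task_id]["complete_prompt"] + sample["completion"]
)
if calibrated:
    # Mirror of the change: prepend the task's code prompt and a `pass`
    # placeholder to the solution string before it is executed.
    solution = problems[task_id]["code_prompt"] + "\n    pass\n" + solution

print(solution)
```

Previously the same prefixing only happened when the samples path contained `sanitized_calibrated`; now it applies whenever `calibrated` is true, and the reported `pass_at_k["calibrated"]` field records that argument directly.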
