
Commit 8fa95f8

feat(evaluate): do calibration by default
1 parent: 06437ab

2 files changed (+27 −25 lines)


ADVANCED_USAGE.md

Lines changed: 1 addition & 0 deletions
```diff
@@ -69,6 +69,7 @@ Below are all the arguments for `bigcodebench.evaluate` for the remote evaluatio
 - `--local_execute`: Whether to execute the samples locally, default to `False`
 - `--remote_execute_api`: The API endpoint for remote execution, default to `https://bigcode-bigcodebench-evaluator.hf.space/`, you can also use your own Gradio API endpoint by cloning the [bigcodebench-evaluator](https://huggingface.co/spaces/bigcode/bigcodebench-evaluator) repo and check `Use via API` at the bottom of the HF space page.
 - `--pass_k`: The `k` in `Pass@k`, default to `[1, 5, 10]`, e.g. `--pass_k 1,5,10` will evaluate `Pass@1`, `Pass@5` and `Pass@10`
+- `--calibrated`: Whether to use the calibrated samples, default to `True`
 - `--save_pass_rate`: Whether to save the pass rate to a file, default to `True`
 - `--parallel`: The number of parallel processes, default to `-1`, e.g. `--parallel 10` will evaluate 10 samples in parallel
 - `--min_time_limit`: The minimum time limit for the execution, default to `1`, e.g. `--min_time_limit 10` will evaluate the samples with at least 10 seconds
```
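With this commit, calibration is governed by the `--calibrated` flag (on by default) rather than by whether the samples path contains `sanitized_calibrated`. A minimal sketch of overriding the default programmatically, assuming the CLI flags map onto keyword arguments of `bigcodebench.evaluate.evaluate` and that `split`, `subset`, and `samples` are accepted the same way (the path and the split/subset values below are placeholders, not taken from this commit):

```python
# Hypothetical invocation; the samples path and split/subset values are placeholders.
from bigcodebench.evaluate import evaluate

evaluate(
    split="complete",                                    # assumed: split name as exposed by the CLI
    subset="full",                                       # assumed: subset name as exposed by the CLI
    samples="model--bigcodebench-complete--samples.jsonl",  # placeholder path to the generated samples
    calibrated=False,                                    # override the new default of True
    pass_k="1,5,10",                                     # evaluates Pass@1, Pass@5 and Pass@10
    save_pass_rate=True,
)
```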

bigcodebench/evaluate.py

Lines changed: 26 additions & 25 deletions
```diff
@@ -122,6 +122,7 @@ def evaluate(
     remote_execute_api: str = "https://bigcode-bigcodebench-evaluator.hf.space/",
     pass_k: str = "1,5,10",
     save_pass_rate: bool = True,
+    calibrated: bool = True,
     parallel: int = -1,
     min_time_limit: float = 1,
     max_as_limit: int = 30*1024,
@@ -245,7 +246,7 @@ def evaluate(
                 if "solution" in sample
                 else problems[task_id]["complete_prompt"] + sample["completion"]
             )
-            if "sanitized_calibrated" in samples:
+            if calibrated:
                 solution = problems[task_id]["code_prompt"] + "\n    pass\n" + solution
             remainings.add(sample["_identifier"])
             args = (
@@ -266,15 +267,15 @@ def evaluate(
         assert n_samples == len(remainings), "Missing problems in unfinished"
         assert len(completion_id) == len(problems), "Missing problems in samples"

-        def stucking_checker():
-            while remainings:
-                last_size = len(remainings)
-                time.sleep(240)
-                if last_size != len(remainings) or len(remainings) == 0:
-                    continue
-                # Potential stucking
-                warn("No samples had finished testing in the last 240s")
-                warn(f"{len(remainings)} samples to be tested: {remainings}")
+        def stucking_checker():
+            while remainings:
+                last_size = len(remainings)
+                time.sleep(240)
+                if last_size != len(remainings) or len(remainings) == 0:
+                    continue
+                # Potential stucking
+                warn("No samples had finished testing in the last 240s")
+                warn(f"{len(remainings)} samples to be tested: {remainings}")

         threading.Thread(target=stucking_checker).start()

@@ -283,20 +284,20 @@ def stucking_checker():
             remainings.remove(result["_identifier"])
             eval_results[result["task_id"]].append(result)

-        # sort the results for each problem by completion_id
-        for task_id, task_results in eval_results.items():
-            task_results.sort(key=lambda x: x["completion_id"])
-            results["eval"][task_id] = []
-            for res in task_results:
-                stat, details = res["base"]
-                results["eval"][task_id].append(
-                    {
-                        "task_id": task_id,
-                        "solution": res["solution"],
-                        "status": stat,
-                        "details": details,
-                    }
-                )
+        # sort the results for each problem by completion_id
+        for task_id, task_results in eval_results.items():
+            task_results.sort(key=lambda x: x["completion_id"])
+            results["eval"][task_id] = []
+            for res in task_results:
+                stat, details = res["base"]
+                results["eval"][task_id].append(
+                    {
+                        "task_id": task_id,
+                        "solution": res["solution"],
+                        "status": stat,
+                        "details": details,
+                    }
+                )

    # Calculate pass@k.
    total = np.array([len(r) for k, r in results["eval"].items() if k in problems])
@@ -319,7 +320,7 @@ def stucking_checker():
    pass_at_k["model"] = os.path.basename(samples).split("--bigcodebench-")[0]
    pass_at_k["split"] = split
    pass_at_k["subset"] = subset
-    pass_at_k["calibrated"] = "sanitized_calibrated" in samples
+    pass_at_k["calibrated"] = calibrated
    pass_at_k["gt_pass_rate"] = gt_pass_rate
    pass_at_k["failed_tasks"] = failed_tasks
```
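The substantive change is that calibration is now an explicit argument rather than being inferred from the samples filename (`"sanitized_calibrated" in samples`); when enabled, the task's `code_prompt` plus an indented `pass` is prepended to each solution before execution. A minimal, self-contained sketch of that assembly step, with `problems` and `sample` as hypothetical stand-ins for a loaded BigCodeBench task and one generated sample record:

```python
# Sketch of the solution-assembly step changed by this commit.
# `problems` and `sample` are hypothetical stand-ins, not real dataset entries.
problems = {
    "BigCodeBench/0": {
        "complete_prompt": 'def task_func():\n    """Return 42."""\n',
        "code_prompt": "def task_func():\n",
    }
}
sample = {"task_id": "BigCodeBench/0", "solution": "    return 42\n"}

task_id = sample["task_id"]
calibrated = True  # the new default introduced by this commit

# Use the provided solution if present, otherwise stitch prompt + completion.
solution = (
    sample["solution"]
    if "solution" in sample
    else problems[task_id]["complete_prompt"] + sample["completion"]
)
if calibrated:
    # Mirror of the change: prepend the task's code prompt and a `pass`
    # placeholder to the solution string before it is executed.
    solution = problems[task_id]["code_prompt"] + "\n    pass\n" + solution

print(solution)
```

Previously the same prefixing only happened when the samples path contained `sanitized_calibrated`; now it applies whenever `calibrated` is true, and the reported `pass_at_k["calibrated"]` field records that argument directly.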
