@@ -96,16 +96,19 @@ def evaluate(flags):
     if flags.check_gt_only:
         # bypass the samples
        flags.samples = "__dummy__.jsonl"
-
+
+    if flags.calibrate:
+        assert "calibrate" in flags.samples, "Calibration is only supported for calibrated samples"
+
     if os.path.isdir(flags.samples):
-        if flags.reprompt:
-            result_path = os.path.join(flags.samples, "reprompt_eval_results.json")
+        if flags.calibrate:
+            result_path = os.path.join(flags.samples, "calibrate_eval_results.json")
         else:
             result_path = os.path.join(flags.samples, "eval_results.json")
     else:
         assert flags.samples.endswith(".jsonl")
-        if flags.reprompt:
-            result_path = flags.samples.replace(".jsonl", "_reprompt_eval_results.json")
+        if flags.calibrate:
+            result_path = flags.samples.replace(".jsonl", "_calibrate_eval_results.json")
         else:
             result_path = flags.samples.replace(".jsonl", "_eval_results.json")
 
@@ -150,7 +153,7 @@ def evaluate(flags):
                 if "solution" in sample
                 else problems[task_id]["prompt"] + sample["completion"]
             )
-            if flags.reprompt:
+            if flags.calibrate:
                 solution = problems[task_id]["prompt_wo_doc"] + "\n    pass\n" + solution
             remainings.add(sample["_identifier"])
             args = (
@@ -249,7 +252,7 @@ def main():
     parser.add_argument("--parallel", default=None, type=int)
     parser.add_argument("--min-time-limit", default=1, type=float)
     parser.add_argument(
-        "--reprompt", action="store_true", help="Prepend the prompt again"
+        "--calibrate", action="store_true", help="Calibrate the evaluation results"
     )
     parser.add_argument(
         "--check-gt-only", action="store_true", help="Check the groundtruth"
0 commit comments