Merge branch 'main' of https://github.com/bigcode-project/bigcodebench

terryyz · terryyz · commit 0e0968f222c4 · 2024-07-18T01:57:25.000+08:00
diff --git a/README.md b/README.md
@@ -347,7 +347,7 @@ We share pre-generated code samples from LLMs we have [evaluated](https://huggin
 
 ## 🐞 Known Issues
 
-- [ ] Due to [the Hugging Face tokenizer update](https://github.com/huggingface/transformers/pull/31305), some tokenizer may be broken and will degrade the performance of the evaluation. Therefore, we set up with `legacy=False` for the initialization. If you notice the unexpected change, please try `--tokenizer_legacy` during the generation.
+- [ ] Due to [the Hugging Face tokenizer update](https://github.com/huggingface/transformers/pull/31305), some tokenizers may be broken, which will degrade the performance of the evaluation. Therefore, we initialize the tokenizer with `legacy=False`. If you notice any unexpected behavior, please try `--tokenizer_legacy` during the generation.
 
 - [ ] Due to the flakes in the evaluation, the execution results may vary slightly (~0.2%) between runs. We are working on improving the evaluation stability.
 
diff --git a/analysis/bcb_subset.py b/analysis/bcb_subset.py
@@ -75,8 +75,6 @@ def read_task_perf(tids, task="complete"):
             continue
         task_perf = {f"BigCodeBench/{task_id}": 0 for task_id in range(1140)}
         model = model.replace("/", "--")
-        # if info["link"].startswith("https://huggingface.co/"):
-        #     model = info["link"].split("https://huggingface.co/")[-1].replace("/", "--")
         try:
             if info["prompted"] and not info["direct_complete"]:
                 files = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized-calibrated_eval_results.json")
diff --git a/analysis/get_results.py b/analysis/get_results.py
@@ -52,9 +52,6 @@ def get_results(tids):
         hf_model = ""
         files = glob(f"results/{model}--bigcodebench-*.json")
         assert files, f"No files found for results/{model}--bigcodebench-*.json"
-        # if "https://huggingface.co/" in info["link"]:
-        #     hf_model = info["link"].split("https://huggingface.co/")[-1]
-        #     model = hf_model.replace("/", "--")
         for file in files:
             _, suffix = os.path.basename(file).split("--bigcodebench-")
             status = []
@@ -153,8 +150,6 @@ def read_task_perf(tids, task="complete"):
 
         task_perf = dict()
         model = model.replace("/", "--")
-        # if info["link"].startswith("https://huggingface.co/"):
-        #     model = info["link"].split("https://huggingface.co/")[-1].replace("/", "--")
         try:
             if info["prompted"] and not info["direct_complete"]:
                 files = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized-calibrated_eval_results.json")
diff --git a/bigcodebench/evaluate.py b/bigcodebench/evaluate.py
@@ -283,7 +283,8 @@ def stucking_checker():
             json.dump(results, f, indent=2)
     
     pass_at_k_path = result_path.replace("_eval_results.json", "_pass_at_k.json")
-    pass_at_k["model"] = flags.samples.split("/")[-1].replace(".jsonl", "")
+    pass_at_k["model"] = os.path.basename(flags.samples).split("--bigcodebench-")[0]
+    pass_at_k["calibrated"] = "sanitized-calibrated" in flags.samples
     pass_at_k["subset"] = flags.subset
 
     def save_pass_at_k():