
Commit afac95e

Merged from 2 parents: aa31b60 + 21af836

File tree

5 files changed (+17, -8 lines)


README.md

Lines changed: 6 additions & 3 deletions
@@ -235,9 +235,10 @@ You are strongly recommended to use a sandbox such as [docker](https://docs.dock
 
 ```bash
 # Mount the current directory to the container
-# If you want to change the RAM address space limit (in MB, 128 GB by default): `--max-as-limit XXX`
-# If you want to change the RAM data segment limit (in MB, 4 GB by default): `--max-data-limit`
-# If you want to change the RAM stack limit (in MB, 4 MB by default): `--max-stack-limit`
+# If you want to change the RAM address space limit (in MB, 30 GB by default): `--max-as-limit XXX`
+# If you want to change the RAM data segment limit (in MB, 30 GB by default): `--max-data-limit`
+# If you want to change the RAM stack limit (in MB, 10 MB by default): `--max-stack-limit`
+# If you want to increase the execution time limit (in seconds, 240 seconds by default): `--min-time-limit`
 docker run -v $(pwd):/app bigcodebench/bigcodebench-evaluate:latest --split [complete|instruct] --subset [full|hard] --samples samples-sanitized-calibrated.jsonl
 
 # If you only want to check the ground truths
@@ -259,6 +260,8 @@ Then, run the evaluation:
 bigcodebench.evaluate --split [complete|instruct] --subset [full|hard] --samples samples-sanitized-calibrated.jsonl
 # ...If you really don't want to check the ground truths
 bigcodebench.evaluate --split [complete|instruct] --subset [full|hard] --samples samples-sanitized-calibrated.jsonl --no-gt
+# If you want to save the pass rate to a file
+bigcodebench.evaluate --split [complete|instruct] --subset [full|hard] --samples samples-sanitized-calibrated.jsonl --save_pass_rate
 
 # You are strongly recommended to use the following command to clean up the environment after evaluation:
 pids=$(ps -u $(id -u) -o pid,comm | grep 'bigcodebench' | awk '{print $1}'); if [ -n \"$pids\" ]; then echo $pids | xargs -r kill; fi;
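Putting the documented flags together, a minimal sketch of a full run; the numeric limits below (61440 MB, 480 s) are placeholders chosen only for illustration, and the commands simply combine the flags shown above:

```bash
# Sketch only: the numeric limits are illustrative placeholders, not recommendations.
# Containerized run with a raised address-space limit and a longer per-task time limit:
docker run -v $(pwd):/app bigcodebench/bigcodebench-evaluate:latest \
    --split complete --subset hard \
    --samples samples-sanitized-calibrated.jsonl \
    --max-as-limit 61440 --min-time-limit 480

# Local run that also saves the pass rate to a file:
bigcodebench.evaluate --split complete --subset hard \
    --samples samples-sanitized-calibrated.jsonl --save_pass_rate
```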

bigcodebench/eval/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -178,8 +178,8 @@ def untrusted_check(
     min_time_limit: float = 10,
     gt_time_limit: float = 60
 ) -> Tuple[str, np.ndarray]:
-    time_limit = max(min_time_limit, gt_time_limit)
-    timeout = max(os.getenv("BIGCODEBENCH_TIMEOUT_PER_TASK", TIMEOUT_LIMIT), time_limit) + 1
+    min_time_limit = max(min_time_limit, gt_time_limit)
+    timeout = max(os.getenv("BIGCODEBENCH_TIMEOUT_PER_TASK", TIMEOUT_LIMIT), min_time_limit) + 1
     # shared memory objects
     stat = Value("i", _UNKNOWN)
     manager = Manager()
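With the change above, gt_time_limit is folded into min_time_limit before the hard timeout is computed as max(BIGCODEBENCH_TIMEOUT_PER_TASK or TIMEOUT_LIMIT, min_time_limit) + 1. A minimal usage sketch, assuming the local bigcodebench.evaluate entry point accepts the same --min-time-limit flag the README documents for the Docker image (480 s is an arbitrary value):

```bash
# Assumption: --min-time-limit feeds untrusted_check's min_time_limit parameter.
# Per the lines above, the hard timeout then becomes
#   max(BIGCODEBENCH_TIMEOUT_PER_TASK or TIMEOUT_LIMIT, max(480, gt_time_limit)) + 1 seconds.
bigcodebench.evaluate --split complete --subset full \
    --samples samples-sanitized-calibrated.jsonl --min-time-limit 480
```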

bigcodebench/evaluate.py

Lines changed: 2 additions & 1 deletion
@@ -150,10 +150,11 @@ def evaluate(flags):
             cprint(f"Groundtruth pass rate: {gt_pass_rate:.3f}", "green")
         else:
             cprint(f"Groundtruth pass rate: {gt_pass_rate:.3f}\nPlease be cautious!", "red")
-            return
 
         if len(failed_tasks) > 0:
             cprint(f"Failed tasks: {failed_tasks}", "red")
+
+        return
 
     results = {
         "date": datetime.now().strftime("%Y-%m-%d %H:%M"),

bigcodebench/gen/util/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -87,7 +87,7 @@ def trusted_check(
     max_stack_limit: float,
     min_time_limit: float = 10,
 ):
-    timeout = max(os.getenv("BIGCODEBENCH_TIMEOUT_PER_TASK", TIMEOUT_LIMIT), time_limit) + 1
+    timeout = max(os.getenv("BIGCODEBENCH_TIMEOUT_PER_TASK", TIMEOUT_LIMIT), min_time_limit) + 1
     # shared memory objects
     times = Value("d", -1)
     manager = Manager()

release_docker.sh

Lines changed: 6 additions & 1 deletion
@@ -28,4 +28,9 @@ docker push bigcodebench/bigcodebench-evaluate:latest
 docker build -f Docker/Generate.Dockerfile . -t bigcodebench/bigcodebench-generate:$version
 docker tag bigcodebench/bigcodebench-generate:$version bigcodebench/bigcodebench-generate:latest
 docker push bigcodebench/bigcodebench-generate:$version
-docker push bigcodebench/bigcodebench-generate:latest
+docker push bigcodebench/bigcodebench-generate:latest
+
+docker build -f Docker/Gradio.Dockerfile . -t bigcodebench/bigcodebench-gradio:$version
+docker tag bigcodebench/bigcodebench-gradio:$version bigcodebench/bigcodebench-gradio:latest
+docker push bigcodebench/bigcodebench-gradio:$version
+docker push bigcodebench/bigcodebench-gradio:latest
