
Commit afac95e

Merged from 2 parents: aa31b60 + 21af836

File tree

5 files changed (+17, -8 lines)


README.md

Lines changed: 6 additions & 3 deletions
@@ -235,9 +235,10 @@ You are strongly recommended to use a sandbox such as [docker](https://docs.dock
 
 ```bash
 # Mount the current directory to the container
-# If you want to change the RAM address space limit (in MB, 128 GB by default): `--max-as-limit XXX`
-# If you want to change the RAM data segment limit (in MB, 4 GB by default): `--max-data-limit`
-# If you want to change the RAM stack limit (in MB, 4 MB by default): `--max-stack-limit`
+# If you want to change the RAM address space limit (in MB, 30 GB by default): `--max-as-limit XXX`
+# If you want to change the RAM data segment limit (in MB, 30 GB by default): `--max-data-limit`
+# If you want to change the RAM stack limit (in MB, 10 MB by default): `--max-stack-limit`
+# If you want to increase the execution time limit (in seconds, 240 seconds by default): `--min-time-limit`
 docker run -v $(pwd):/app bigcodebench/bigcodebench-evaluate:latest --split [complete|instruct] --subset [full|hard] --samples samples-sanitized-calibrated.jsonl
 
 # If you only want to check the ground truths
@@ -259,6 +260,8 @@ Then, run the evaluation:
 bigcodebench.evaluate --split [complete|instruct] --subset [full|hard] --samples samples-sanitized-calibrated.jsonl
 # ...If you really don't want to check the ground truths
 bigcodebench.evaluate --split [complete|instruct] --subset [full|hard] --samples samples-sanitized-calibrated.jsonl --no-gt
+# If you want to save the pass rate to a file
+bigcodebench.evaluate --split [complete|instruct] --subset [full|hard] --samples samples-sanitized-calibrated.jsonl --save_pass_rate
 
 # You are strongly recommended to use the following command to clean up the environment after evaluation:
 pids=$(ps -u $(id -u) -o pid,comm | grep 'bigcodebench' | awk '{print $1}'); if [ -n \"$pids\" ]; then echo $pids | xargs -r kill; fi;
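Putting the documented flags together, a minimal sketch of a full run; the numeric limits below (61440 MB, 480 s) are placeholders chosen only for illustration, and the commands simply combine the flags shown above:

```bash
# Sketch only: the numeric limits are illustrative placeholders, not recommendations.
# Containerized run with a raised address-space limit and a longer per-task time limit:
docker run -v $(pwd):/app bigcodebench/bigcodebench-evaluate:latest \
    --split complete --subset hard \
    --samples samples-sanitized-calibrated.jsonl \
    --max-as-limit 61440 --min-time-limit 480

# Local run that also saves the pass rate to a file:
bigcodebench.evaluate --split complete --subset hard \
    --samples samples-sanitized-calibrated.jsonl --save_pass_rate
```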

bigcodebench/eval/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -178,8 +178,8 @@ def untrusted_check(
     min_time_limit: float = 10,
     gt_time_limit: float = 60
 ) -> Tuple[str, np.ndarray]:
-    time_limit = max(min_time_limit, gt_time_limit)
-    timeout = max(os.getenv("BIGCODEBENCH_TIMEOUT_PER_TASK", TIMEOUT_LIMIT), time_limit) + 1
+    min_time_limit = max(min_time_limit, gt_time_limit)
+    timeout = max(os.getenv("BIGCODEBENCH_TIMEOUT_PER_TASK", TIMEOUT_LIMIT), min_time_limit) + 1
     # shared memory objects
     stat = Value("i", _UNKNOWN)
     manager = Manager()
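With the change above, gt_time_limit is folded into min_time_limit before the hard timeout is computed as max(BIGCODEBENCH_TIMEOUT_PER_TASK or TIMEOUT_LIMIT, min_time_limit) + 1. A minimal usage sketch, assuming the local bigcodebench.evaluate entry point accepts the same --min-time-limit flag the README documents for the Docker image (480 s is an arbitrary value):

```bash
# Assumption: --min-time-limit feeds untrusted_check's min_time_limit parameter.
# Per the lines above, the hard timeout then becomes
#   max(BIGCODEBENCH_TIMEOUT_PER_TASK or TIMEOUT_LIMIT, max(480, gt_time_limit)) + 1 seconds.
bigcodebench.evaluate --split complete --subset full \
    --samples samples-sanitized-calibrated.jsonl --min-time-limit 480
```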

bigcodebench/evaluate.py

Lines changed: 2 additions & 1 deletion
@@ -150,10 +150,11 @@ def evaluate(flags):
             cprint(f"Groundtruth pass rate: {gt_pass_rate:.3f}", "green")
         else:
             cprint(f"Groundtruth pass rate: {gt_pass_rate:.3f}\nPlease be cautious!", "red")
-            return
 
         if len(failed_tasks) > 0:
             cprint(f"Failed tasks: {failed_tasks}", "red")
+
+        return
 
     results = {
         "date": datetime.now().strftime("%Y-%m-%d %H:%M"),

bigcodebench/gen/util/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -87,7 +87,7 @@ def trusted_check(
     max_stack_limit: float,
     min_time_limit: float = 10,
 ):
-    timeout = max(os.getenv("BIGCODEBENCH_TIMEOUT_PER_TASK", TIMEOUT_LIMIT), time_limit) + 1
+    timeout = max(os.getenv("BIGCODEBENCH_TIMEOUT_PER_TASK", TIMEOUT_LIMIT), min_time_limit) + 1
     # shared memory objects
     times = Value("d", -1)
     manager = Manager()

release_docker.sh

Lines changed: 6 additions & 1 deletion
@@ -28,4 +28,9 @@ docker push bigcodebench/bigcodebench-evaluate:latest
 docker build -f Docker/Generate.Dockerfile . -t bigcodebench/bigcodebench-generate:$version
 docker tag bigcodebench/bigcodebench-generate:$version bigcodebench/bigcodebench-generate:latest
 docker push bigcodebench/bigcodebench-generate:$version
-docker push bigcodebench/bigcodebench-generate:latest
+docker push bigcodebench/bigcodebench-generate:latest
+
+docker build -f Docker/Gradio.Dockerfile . -t bigcodebench/bigcodebench-gradio:$version
+docker tag bigcodebench/bigcodebench-gradio:$version bigcodebench/bigcodebench-gradio:latest
+docker push bigcodebench/bigcodebench-gradio:$version
+docker push bigcodebench/bigcodebench-gradio:latest
