Skip to content

Commit 63ea730

Browse files
committed
Support --limit option for all tasks
1 parent f7e2539 commit 63ea730

File tree

8 files changed

+19
-14
lines changed

8 files changed

+19
-14
lines changed

astabench/evals/inspect_eval_wrappers/core_bench.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -287,18 +287,18 @@ def score_with_stderr():
287287

288288

289289
@task
290-
def core_bench_test() -> Task:
290+
def core_bench_test(**kwargs) -> Task:
291291
"""The CORE-bench eval from inspect_evals (test split), using our
292292
astabench-oriented defaults for the task settings."""
293-
return core_bench(split="test")
293+
return core_bench(split="test", **kwargs)
294294

295295

296296
@task
297-
def core_bench_validation() -> Task:
297+
def core_bench_validation(**kwargs) -> Task:
298298
"""The CORE-bench eval from inspect_evals (train split, which we use as
299299
a 'dev' split), using our astabench-oriented defaults for the task
300300
settings."""
301-
return core_bench(split="train")
301+
return core_bench(split="train", **kwargs)
302302

303303

304304
__all__ = [

astabench/evals/sqa/task.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -541,10 +541,10 @@ def sqa(
541541

542542

543543
@task
544-
def sqa_dev() -> Task:
545-
return sqa(split="dev")
544+
def sqa_dev(**kwargs) -> Task:
545+
return sqa(split="dev", **kwargs)
546546

547547

548548
@task
549-
def sqa_test() -> Task:
550-
return sqa(split="test")
549+
def sqa_test(**kwargs) -> Task:
550+
return sqa(split="test", **kwargs)

astabench/evals/super/task.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -429,15 +429,15 @@ def super(split: str = "Expert", sample_limit: int | None = None) -> Task:
429429

430430

431431
@task
432-
def super_validation() -> Task:
432+
def super_validation(limit: int = 50) -> Task:
433433
"""Runs the super task on the validation ("Auto") split."""
434434
# There are hundreds of samples in "Auto", so we limit it for validation by
435435
# default; they can only be evaluated if the solver returns trajectory data
436436
# in the scorer's format
437-
return super(split="Auto", sample_limit=50)
437+
return super(split="Auto", sample_limit=limit if limit > 0 else None)
438438

439439

440440
@task
441-
def super_test() -> Task:
441+
def super_test(limit: int = 0) -> Task:
442442
"""Runs the super task on the test ("Expert") split."""
443-
return super(split="Expert")
443+
return super(split="Expert", sample_limit=limit if limit > 0 else None)

solvers/futurehouse/demo.sh

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,9 +4,10 @@
44
set -euo pipefail
55

66
uv run inspect eval \
7-
--solver astabench/solvers/futurehouse/futurehouse_solver.py@futurehouse_solver
7+
--solver astabench/solvers/futurehouse/futurehouse_solver.py@futurehouse_solver \
88
--model openai/gpt-4.1-nano \
99
--limit 1 \
10+
-T limit=1 \
1011
$* \
1112
astabench/sqa_dev
1213

solvers/react/demo.sh

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -10,6 +10,7 @@ uv run astabench eval \
1010
--solver astabench/solvers/react/basic_agent.py@instantiated_basic_agent \
1111
--model openai/gpt-4.1-nano \
1212
--limit 1 \
13+
-T limit=1 \
1314
-S max_steps=10 \
1415
-S with_search_tools=0 -S with_table_editor=0 -S with_report_editor=0 -S with_thinking_tool=0 \
1516
$*

solvers/sqa/demo.sh

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -9,5 +9,6 @@ uv run inspect eval \
99
--solver astabench/solvers/sqa/sqa.py@sqa_solver \
1010
--model openai/gpt-4.1-nano \
1111
--limit 1 \
12+
-T limit=1 \
1213
$* \
1314
astabench/sqa_dev

solvers/storm/demo.sh

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,9 +4,10 @@
44
set -euo pipefail
55

66
uv run inspect eval \
7-
--solver astabench/solvers/sqa/storm_solver.py@storm_solver \
7+
--solver astabench/solvers/sqa/storm_solver.py@storm_solver \
88
--model openai/gpt-4.1-nano \
99
--limit 1 \
10+
-T limit=1 \
1011
$* \
1112
astabench/sqa_dev
1213

solvers/super/demo.sh

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,7 @@ uv run inspect eval \
77
--solver astabench/solvers/code_agent/agent.py@code_agent \
88
--model openai/gpt-4.1-nano \
99
--limit 1 \
10+
-T limit=1 \
1011
$* \
1112
astabench/super_validation
1213

0 commit comments

Comments
 (0)