Skip to content

Commit 942d780

Browse files
refactor tests
1 parent 8b2e75a commit 942d780

11 files changed

Lines changed: 692 additions & 500 deletions

File tree

.github/workflows/paperbench_tests.yml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ on:
55
paths: [ 'project/paperbench/**']
66
branches: [ '**' ]
77
types: [opened, synchronize, reopened, ready_for_review]
8+
89
workflow_dispatch:
910
inputs:
1011
test_branch:
@@ -43,7 +44,7 @@ jobs:
4344
driver: docker
4445
install: true
4546

46-
- name: Build pb-env (local)
47+
- name: Build pb-env
4748
working-directory: ./project/paperbench
4849
run: |
4950
docker buildx build \
@@ -56,4 +57,4 @@ jobs:
5657
- name: Run tests
5758
working-directory: ./project/paperbench
5859
run: |
59-
uv run pytest -n 10 -v -rs
60+
uv run pytest -n 10 -vrs
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
lca-on-the-line

project/paperbench/paperbench/nano/eval.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -391,9 +391,9 @@ class PaperBench(PythonCodingEval):
391391
judge: JudgeConfig = chz.field(default_factory=JudgeConfig)
392392

393393
# task args
394-
paper_split: Literal["debug", "dev", "human", "all"] = chz.field(
394+
paper_split: Literal["debug", "dev", "human", "testing", "all"] = chz.field(
395395
default="all",
396-
doc="Paper split to use. One of 'debug' (rice only), 'dev' (two papers), 'human' (papers used in human baseline), 'all' (full set)",
396+
doc="Paper split to use. One of 'testing' (lca-on-the-line only), 'debug' (rice only), 'dev' (two papers), 'human' (papers used in human baseline), 'all' (full set)",
397397
# should match what is in experiments/splits/
398398
)
399399
resume_run_group_id: str | None = chz.field(default=None)

project/paperbench/paperbench/scripts/alcatraz_services.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,10 @@
1010

1111
from paperbench.infra.alcatraz import put_file_in_computer, tar_and_extract_from_computer
1212
from paperbench.scripts.run_reproduce import reproduce
13-
from paperbench.utils import get_dotenv
13+
from paperbench.utils import find_dotenv
1414

1515
logger = structlog.stdlib.get_logger(component=__name__)
16-
load_dotenv(get_dotenv())
16+
load_dotenv(find_dotenv())
1717

1818

1919
async def put_submission_in_computer(

project/paperbench/paperbench/utils.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import io
22
import logging
33
import os
4+
import subprocess
45
import tarfile
56
import time
67
import uuid
@@ -82,7 +83,7 @@ def get_experiments_dir() -> Path:
8283
return get_root().parent / "experiments"
8384

8485

85-
def get_dotenv() -> Path:
86+
def find_dotenv() -> Path:
8687
"""Returns an absolute path to the .env file."""
8788

8889
return get_root().parent / ".env"
@@ -94,9 +95,13 @@ def get_timestamp() -> str:
9495
return time.strftime("%Y-%m-%dT%H-%M-%S-%Z", time.gmtime())
9596

9697

97-
def create_run_id(
98-
paper_id: str,
99-
) -> str:
98+
def get_commit_hash() -> str:
99+
"""Returns the hash of the commit that Git's HEAD currently points to."""
100+
101+
return subprocess.check_output(["git", "rev-parse", "HEAD"]).strip().decode("ascii")
102+
103+
104+
def create_run_id(paper_id: str) -> str:
100105
"""Creates a unique run ID of the form `<paper_id>_<uuid4>`."""
101106

102107
return f"{paper_id}_{str(uuid.uuid4())}"

project/paperbench/tests/conftest.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,6 @@
44

55

66
def pytest_addoption(parser: pytest.Parser) -> None:
7-
parser.addoption(
8-
"--no-cache",
9-
action="store_true",
10-
default=False,
11-
help="Download and prepare all datasets from scratch.",
12-
)
137
parser.addoption(
148
"--slow",
159
action="store_true",
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
runs
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
import os
2+
3+
from dotenv import load_dotenv
4+
5+
from paperbench.utils import find_dotenv
6+
7+
load_dotenv(find_dotenv())
8+
9+
from alcatraz.clusters.local import LocalConfig
10+
from preparedness_turn_completer.oai_turn_completer import OpenAITurnCompleter
11+
12+
from paperbench.nano.eval import (
13+
ExternalPythonCodingSolver,
14+
)
15+
from paperbench.nano.structs import (
16+
JudgeConfig,
17+
ReproductionConfig,
18+
)
19+
from paperbench.solvers.dummy.solver import PaperBenchDummySolver
20+
21+
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", None)
22+
GRADER_OPENAI_API_KEY = os.getenv("GRADER_OPENAI_API_KEY", OPENAI_API_KEY)
23+
DEFAULT_AZURE_VM_SKU = "Standard_D2as_v4"
24+
25+
LOCAL_DUMMY_SOLVER_CONFIG = PaperBenchDummySolver()
26+
27+
LOCAL_AISI_SOLVER_CONFIG = ExternalPythonCodingSolver(
28+
agent_id="aisi-basic-agent-openai-dev",
29+
cluster_config=LocalConfig(
30+
image="aisi-basic-agent:latest",
31+
pull_from_registry=False,
32+
),
33+
)
34+
35+
LOCAL_REPRODUCTION_CONFIG = ReproductionConfig(
36+
timeout=100 * 3600,
37+
retry_threshold=600,
38+
overwrite_existing_output=False,
39+
skip_reproduction=True,
40+
cluster_config=LocalConfig(
41+
image="pb-reproducer:latest",
42+
pull_from_registry=False,
43+
),
44+
)
45+
46+
LOCAL_REPRODUCTION_WITHOUT_SKIP_CONFIG = ReproductionConfig(
47+
timeout=100 * 3600,
48+
retry_threshold=600,
49+
overwrite_existing_output=False,
50+
skip_reproduction=False,
51+
cluster_config=LocalConfig(
52+
image="pb-reproducer:latest",
53+
pull_from_registry=False,
54+
),
55+
)
56+
57+
LOCAL_JUDGE_CONFIG = JudgeConfig(
58+
grade=True,
59+
grade_locally=True,
60+
grade_id=0,
61+
overwrite_existing_output=False,
62+
scaffold="dummy",
63+
completer_config=OpenAITurnCompleter.Config(
64+
model="gpt-4o-mini",
65+
),
66+
code_only=False,
67+
resources_provided=False,
68+
cluster_config=LocalConfig(
69+
image="pb-env:latest",
70+
pull_from_registry=False,
71+
environment={"OPENAI_API_KEY": GRADER_OPENAI_API_KEY},
72+
),
73+
)

0 commit comments

Comments
 (0)