Skip to content

Commit ec5f7b4

Browse files
Small fixes to the turn-completer integration in PaperBench (pb)
1 parent b6072a4 commit ec5f7b4

3 files changed

Lines changed: 34 additions & 16 deletions

File tree

project/paperbench/paperbench/judge/simple.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -445,9 +445,7 @@ async def _prepare_relevant_files(
445445

446446
file_content_tasks = [
447447
read_file_content(
448-
(self.submission_dir / rel_path.strip().strip("/")).relative_to(
449-
self.submission_dir
450-
),
448+
self.submission_dir / rel_path.strip().strip("/"),
451449
self.computer,
452450
)
453451
for rel_path in selected_files.split("\n")[: max_files or None]

project/paperbench/tests/unit/test_judge.py

Lines changed: 25 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,29 @@
11
import json
22
import math
3+
import os
34
import shutil
45
from pathlib import Path
56
from tempfile import NamedTemporaryFile, TemporaryDirectory
67
from typing import Callable, Generator
78

89
import pytest
10+
from dotenv import load_dotenv
11+
from preparedness_turn_completer.oai_turn_completer import OpenAITurnCompleter
912

1013
from paperbench.judge.base import Judge
1114
from paperbench.judge.dummyrandom import DummyJudge
1215
from paperbench.judge.simple import SimpleJudge
1316
from paperbench.rubric.tasks import TaskNode
14-
from paperbench.utils import in_ci
17+
from paperbench.utils import find_dotenv, in_ci
18+
19+
load_dotenv(find_dotenv())
1520

1621
non_dummy_judges = [SimpleJudge]
1722

1823

24+
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
25+
26+
1927
def get_ancestor(name: str) -> Path:
2028
"""Returns the path to an ancestor directory with `name`, starting from the current file."""
2129

@@ -119,7 +127,7 @@ async def test_all_gold_submissions_achieve_a_perfect_score_on_a_trivial_rubric(
119127
addendum=None,
120128
judge_addendum=None,
121129
submission_dir=gold_submission,
122-
model="gpt-4o",
130+
completer_config=OpenAITurnCompleter.Config(model="gpt-4o"),
123131
paper_md=empty_markdown,
124132
)
125133

@@ -134,7 +142,8 @@ async def test_all_gold_submissions_achieve_a_perfect_score_on_a_trivial_rubric(
134142

135143
@pytest.mark.asyncio
136144
@pytest.mark.parametrize("JudgeClass", non_dummy_judges) # Skip DummyJudge
137-
@pytest.mark.skipif(in_ci(), reason="Test does not apply to DummyJudge")
145+
@pytest.mark.skipif(in_ci(), reason="Skip non-dummy judge in CI.")
146+
@pytest.mark.skipif(OPENAI_API_KEY is None, reason="OPENAI_API_KEY is not set.")
138147
@pytest.mark.parametrize("task", ["empty", "hex_flag", "hex_flags", "nested_hex_flags"])
139148
async def test_all_gold_submissions_achieve_a_null_score_on_an_impossible_rubric(
140149
task: str,
@@ -153,7 +162,7 @@ async def test_all_gold_submissions_achieve_a_null_score_on_an_impossible_rubric
153162
addendum=None,
154163
judge_addendum=None,
155164
submission_dir=gold_submission,
156-
model="gpt-4o",
165+
completer_config=OpenAITurnCompleter.Config(model="gpt-4o"),
157166
paper_md=empty_markdown,
158167
)
159168

@@ -168,7 +177,8 @@ async def test_all_gold_submissions_achieve_a_null_score_on_an_impossible_rubric
168177

169178
@pytest.mark.asyncio
170179
@pytest.mark.parametrize("JudgeClass", non_dummy_judges) # Skip DummyJudge
171-
@pytest.mark.skipif(in_ci(), reason="Test does not apply to DummyJudge")
180+
@pytest.mark.skipif(in_ci(), reason="Skip non-dummy judge in CI.")
181+
@pytest.mark.skipif(OPENAI_API_KEY is None, reason="OPENAI_API_KEY is not set.")
172182
@pytest.mark.parametrize("task", ["empty", "hex_flag", "hex_flags", "nested_hex_flags"])
173183
async def test_all_gold_submissions_achieve_a_perfect_score_on_their_corresponding_rubric(
174184
task: str,
@@ -187,7 +197,7 @@ async def test_all_gold_submissions_achieve_a_perfect_score_on_their_correspondi
187197
addendum=None,
188198
judge_addendum=None,
189199
submission_dir=gold_submission,
190-
model="gpt-4o",
200+
completer_config=OpenAITurnCompleter.Config(model="gpt-4o"),
191201
paper_md=empty_markdown,
192202
)
193203

@@ -202,7 +212,8 @@ async def test_all_gold_submissions_achieve_a_perfect_score_on_their_correspondi
202212

203213
@pytest.mark.asyncio
204214
@pytest.mark.parametrize("JudgeClass", non_dummy_judges) # Skip DummyJudge
205-
@pytest.mark.skipif(in_ci(), reason="Test does not apply to DummyJudge")
215+
@pytest.mark.skipif(in_ci(), reason="Skip non-dummy judge in CI.")
216+
@pytest.mark.skipif(OPENAI_API_KEY is None, reason="OPENAI_API_KEY is not set.")
206217
@pytest.mark.parametrize("task", ["hex_flag", "hex_flags", "nested_hex_flags"])
207218
async def test_empty_submission_achieves_a_null_score_on_all_non_trvial_rubrics(
208219
task: str,
@@ -221,7 +232,7 @@ async def test_empty_submission_achieves_a_null_score_on_all_non_trvial_rubrics(
221232
addendum=None,
222233
judge_addendum=None,
223234
submission_dir=empty_submission,
224-
model="gpt-4o",
235+
completer_config=OpenAITurnCompleter.Config(model="gpt-4o"),
225236
paper_md=empty_markdown,
226237
)
227238

@@ -236,7 +247,8 @@ async def test_empty_submission_achieves_a_null_score_on_all_non_trvial_rubrics(
236247

237248
@pytest.mark.asyncio
238249
@pytest.mark.parametrize("JudgeClass", non_dummy_judges) # Skip DummyJudge
239-
@pytest.mark.skipif(in_ci(), reason="Test does not apply to DummyJudge")
250+
@pytest.mark.skipif(in_ci(), reason="Skip non-dummy judge in CI.")
251+
@pytest.mark.skipif(OPENAI_API_KEY is None, reason="OPENAI_API_KEY is not set.")
240252
@pytest.mark.parametrize(
241253
"n_missing",
242254
[
@@ -268,7 +280,7 @@ async def test_submission_with_n_missing_files_to_the_hex_flags_task_achieves_a_
268280
addendum=None,
269281
judge_addendum=None,
270282
submission_dir=submission,
271-
model="gpt-4o",
283+
completer_config=OpenAITurnCompleter.Config(model="gpt-4o"),
272284
paper_md=empty_markdown,
273285
)
274286

@@ -284,7 +296,8 @@ async def test_submission_with_n_missing_files_to_the_hex_flags_task_achieves_a_
284296

285297
@pytest.mark.asyncio
286298
@pytest.mark.parametrize("JudgeClass", non_dummy_judges) # Skip DummyJudge
287-
@pytest.mark.skipif(in_ci(), reason="Test does not apply to DummyJudge")
299+
@pytest.mark.skipif(in_ci(), reason="Skip non-dummy judge in CI.")
300+
@pytest.mark.skipif(OPENAI_API_KEY is None, reason="OPENAI_API_KEY is not set.")
288301
async def test_nested_context_preserved_in_grading(
289302
JudgeClass: type[Judge],
290303
empty_pdf: Path,
@@ -308,7 +321,7 @@ async def test_nested_context_preserved_in_grading(
308321
addendum=None,
309322
judge_addendum=None,
310323
submission_dir=submission,
311-
model="gpt-4o",
324+
completer_config=OpenAITurnCompleter.Config(model="gpt-4o"),
312325
paper_md=empty_markdown,
313326
)
314327

project/preparedness_turn_completer/preparedness_turn_completer/oai_turn_completer.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,12 @@ class OpenAITurnCompleter(TurnCompleter):
3030
def __init__(self, model: str, reasoning_effort: str | None = None):
3131
self.model: str = model
3232
self.reasoning_effort: str | None = reasoning_effort
33-
self.encoding_name: str = tiktoken.encoding_name_for_model(model)
33+
try:
34+
self.encoding_name: str = tiktoken.encoding_name_for_model(model)
35+
except KeyError:
36+
# Fallback to o200k_base
37+
logger.warning(f"Model {model} not found in tiktoken, using o200k_base")
38+
self.encoding_name: str = "o200k_base"
3439
self.n_ctx: int = get_model_context_window_length(model)
3540

3641
class Config(TurnCompleter.Config):
@@ -180,9 +185,11 @@ def get_model_context_window_length(model: str | None) -> int:
180185
"o1-mini-2024-09-12": 128000,
181186
"o1": 200000,
182187
"o1-2024-12-17": 200000,
188+
"o3": 200000,
183189
"o3-mini-2024-12-17": 128000,
184190
"o3-mini-2025-01-31": 200000,
185191
"o3-mini": 200000,
192+
"o4-mini": 200000,
186193
"o1-preview": 128000,
187194
"gpt-4-turbo": 128000,
188195
}

0 commit comments

Comments (0)