11import json
22import math
3+ import os
34import shutil
45from pathlib import Path
56from tempfile import NamedTemporaryFile , TemporaryDirectory
67from typing import Callable , Generator
78
89import pytest
10+ from dotenv import load_dotenv
11+ from preparedness_turn_completer .oai_turn_completer import OpenAITurnCompleter
912
1013from paperbench .judge .base import Judge
1114from paperbench .judge .dummyrandom import DummyJudge
1215from paperbench .judge .simple import SimpleJudge
1316from paperbench .rubric .tasks import TaskNode
14- from paperbench .utils import in_ci
17+ from paperbench .utils import find_dotenv , in_ci
18+
# Load environment variables from the dotenv file located by find_dotenv()
# at import time, so they are in place before OPENAI_API_KEY is read from
# os.environ further down in this module.
# NOTE(review): find_dotenv comes from paperbench.utils; how it locates the
# file is not visible here - confirm.
19+ load_dotenv (find_dotenv ())
1520
# Judge classes used to parametrize "JudgeClass" in the tests of this module;
# DummyJudge is deliberately left out of this list.
1621non_dummy_judges = [SimpleJudge ]
1722
1823
# None when the OPENAI_API_KEY environment variable is unset; the skipif
# markers on the non-dummy judge tests check this value to skip those tests.
24+ OPENAI_API_KEY = os .environ .get ("OPENAI_API_KEY" )
25+
26+
1927def get_ancestor (name : str ) -> Path :
2028 """Returns the path to an ancestor directory with `name`, starting from the current file."""
2129
@@ -119,7 +127,7 @@ async def test_all_gold_submissions_achieve_a_perfect_score_on_a_trivial_rubric(
119127 addendum = None ,
120128 judge_addendum = None ,
121129 submission_dir = gold_submission ,
122- model = "gpt-4o" ,
130+ completer_config = OpenAITurnCompleter . Config ( model = "gpt-4o" ) ,
123131 paper_md = empty_markdown ,
124132 )
125133
@@ -134,7 +142,8 @@ async def test_all_gold_submissions_achieve_a_perfect_score_on_a_trivial_rubric(
134142
135143@pytest .mark .asyncio
136144@pytest .mark .parametrize ("JudgeClass" , non_dummy_judges ) # Skip DummyJudge
137- @pytest .mark .skipif (in_ci (), reason = "Test does not apply to DummyJudge" )
145+ @pytest .mark .skipif (in_ci (), reason = "Skip non-dummy judge in CI." )
146+ @pytest .mark .skipif (OPENAI_API_KEY is None , reason = "OPENAI_API_KEY is not set." )
138147@pytest .mark .parametrize ("task" , ["empty" , "hex_flag" , "hex_flags" , "nested_hex_flags" ])
139148async def test_all_gold_submissions_achieve_a_null_score_on_an_impossible_rubric (
140149 task : str ,
@@ -153,7 +162,7 @@ async def test_all_gold_submissions_achieve_a_null_score_on_an_impossible_rubric
153162 addendum = None ,
154163 judge_addendum = None ,
155164 submission_dir = gold_submission ,
156- model = "gpt-4o" ,
165+ completer_config = OpenAITurnCompleter . Config ( model = "gpt-4o" ) ,
157166 paper_md = empty_markdown ,
158167 )
159168
@@ -168,7 +177,8 @@ async def test_all_gold_submissions_achieve_a_null_score_on_an_impossible_rubric
168177
169178@pytest .mark .asyncio
170179@pytest .mark .parametrize ("JudgeClass" , non_dummy_judges ) # Skip DummyJudge
171- @pytest .mark .skipif (in_ci (), reason = "Test does not apply to DummyJudge" )
180+ @pytest .mark .skipif (in_ci (), reason = "Skip non-dummy judge in CI." )
181+ @pytest .mark .skipif (OPENAI_API_KEY is None , reason = "OPENAI_API_KEY is not set." )
172182@pytest .mark .parametrize ("task" , ["empty" , "hex_flag" , "hex_flags" , "nested_hex_flags" ])
173183async def test_all_gold_submissions_achieve_a_perfect_score_on_their_corresponding_rubric (
174184 task : str ,
@@ -187,7 +197,7 @@ async def test_all_gold_submissions_achieve_a_perfect_score_on_their_correspondi
187197 addendum = None ,
188198 judge_addendum = None ,
189199 submission_dir = gold_submission ,
190- model = "gpt-4o" ,
200+ completer_config = OpenAITurnCompleter . Config ( model = "gpt-4o" ) ,
191201 paper_md = empty_markdown ,
192202 )
193203
@@ -202,7 +212,8 @@ async def test_all_gold_submissions_achieve_a_perfect_score_on_their_correspondi
202212
203213@pytest .mark .asyncio
204214@pytest .mark .parametrize ("JudgeClass" , non_dummy_judges ) # Skip DummyJudge
205- @pytest .mark .skipif (in_ci (), reason = "Test does not apply to DummyJudge" )
215+ @pytest .mark .skipif (in_ci (), reason = "Skip non-dummy judge in CI." )
216+ @pytest .mark .skipif (OPENAI_API_KEY is None , reason = "OPENAI_API_KEY is not set." )
206217@pytest .mark .parametrize ("task" , ["hex_flag" , "hex_flags" , "nested_hex_flags" ])
207218async def test_empty_submission_achieves_a_null_score_on_all_non_trvial_rubrics (
208219 task : str ,
@@ -221,7 +232,7 @@ async def test_empty_submission_achieves_a_null_score_on_all_non_trvial_rubrics(
221232 addendum = None ,
222233 judge_addendum = None ,
223234 submission_dir = empty_submission ,
224- model = "gpt-4o" ,
235+ completer_config = OpenAITurnCompleter . Config ( model = "gpt-4o" ) ,
225236 paper_md = empty_markdown ,
226237 )
227238
@@ -236,7 +247,8 @@ async def test_empty_submission_achieves_a_null_score_on_all_non_trvial_rubrics(
236247
237248@pytest .mark .asyncio
238249@pytest .mark .parametrize ("JudgeClass" , non_dummy_judges ) # Skip DummyJudge
239- @pytest .mark .skipif (in_ci (), reason = "Test does not apply to DummyJudge" )
250+ @pytest .mark .skipif (in_ci (), reason = "Skip non-dummy judge in CI." )
251+ @pytest .mark .skipif (OPENAI_API_KEY is None , reason = "OPENAI_API_KEY is not set." )
240252@pytest .mark .parametrize (
241253 "n_missing" ,
242254 [
@@ -268,7 +280,7 @@ async def test_submission_with_n_missing_files_to_the_hex_flags_task_achieves_a_
268280 addendum = None ,
269281 judge_addendum = None ,
270282 submission_dir = submission ,
271- model = "gpt-4o" ,
283+ completer_config = OpenAITurnCompleter . Config ( model = "gpt-4o" ) ,
272284 paper_md = empty_markdown ,
273285 )
274286
@@ -284,7 +296,8 @@ async def test_submission_with_n_missing_files_to_the_hex_flags_task_achieves_a_
284296
285297@pytest .mark .asyncio
286298@pytest .mark .parametrize ("JudgeClass" , non_dummy_judges ) # Skip DummyJudge
287- @pytest .mark .skipif (in_ci (), reason = "Test does not apply to DummyJudge" )
299+ @pytest .mark .skipif (in_ci (), reason = "Skip non-dummy judge in CI." )
300+ @pytest .mark .skipif (OPENAI_API_KEY is None , reason = "OPENAI_API_KEY is not set." )
288301async def test_nested_context_preserved_in_grading (
289302 JudgeClass : type [Judge ],
290303 empty_pdf : Path ,
@@ -308,7 +321,7 @@ async def test_nested_context_preserved_in_grading(
308321 addendum = None ,
309322 judge_addendum = None ,
310323 submission_dir = submission ,
311- model = "gpt-4o" ,
324+ completer_config = OpenAITurnCompleter . Config ( model = "gpt-4o" ) ,
312325 paper_md = empty_markdown ,
313326 )
314327
0 commit comments