55import tempfile
66import time
77from contextlib import asynccontextmanager , nullcontext
8- from dataclasses import asdict
8+ from dataclasses import asdict , replace
99from datetime import timedelta
1010from pathlib import Path
1111from typing import Any , AsyncGenerator
4242 PaperBenchGrade ,
4343 PaperBenchResult ,
4444 ReproductionConfig ,
45- ReproductionOutput ,
4645)
4746from paperbench .nano .utils import get_file_at_duration
4847from paperbench .paper_registry import paper_registry
49- from paperbench .scripts .alcatraz_services import reproduce_on_computer
50- from paperbench .scripts .run_reproduce import ReproductionMetadata
48+ from paperbench .scripts .run_reproduce import ReproductionMetadata , reproduce_on_computer
5149from paperbench .utils import purple
5250
5351GRADER_OPENAI_API_KEY = os .getenv ("GRADER_OPENAI_API_KEY" ) or os .getenv ("OPENAI_API_KEY" )
@@ -197,7 +195,7 @@ def _early_exit_grade(
197195 resources_provided = self .judge .resources_provided ,
198196 agent_output = None ,
199197 judge_output = None ,
200- reproduction_output = None ,
198+ reproduction_metadata = None ,
201199 monitor_result = monitor_result ,
202200 monitor_ran = monitor_ran ,
203201 ),
@@ -208,12 +206,12 @@ def _early_exit_grade(
208206 self ._save_grade (grade )
209207 return grade
210208
211- def _should_grade (self , reproduction_output : ReproductionOutput | None ) -> bool :
209+ def _should_grade (self , reproduction_metadata : ReproductionMetadata | None ) -> bool :
212210 """
213211 We can proceed with grading if reproduction was successful
214212 OR we are in a reproduction-free setup
215213 """
216- return (reproduction_output and reproduction_output . success ) or (
214+ return (reproduction_metadata is not None ) or (
217215 self .reproduction .skip_reproduction or self .judge .code_only
218216 )
219217
@@ -277,17 +275,17 @@ async def grade(
277275 )
278276
279277 # 3. run reproduction
280- repro_output = None
278+ repro_metadata = None
281279 submission_to_grade_path = path_to_submission
282280 if self ._should_reproduce ():
283- repro_output = await self ._run_reproduce (path_to_submission )
284- repro_metadata = repro_output . metadata . to_dict () if repro_output . metadata else {}
281+ repro_metadata = await self ._run_reproduce (path_to_submission )
282+ repro_metadata_dict = repro_metadata . to_dict () if repro_metadata else {}
285283 submission_to_grade_path = path_to_executed_submission
286- self ._record_extra ({"repro_metadata" : repro_metadata })
284+ self ._record_extra ({"repro_metadata" : repro_metadata_dict })
287285
288286 # 4. run judge
289287 judge_output = None
290- if self ._should_grade (repro_output ):
288+ if self ._should_grade (repro_metadata ):
291289 judge_output = await self ._run_judge (submission_to_grade_path , self .paper_id )
292290 self ._record_extra ({"judge_output" : judge_output .to_dict () if judge_output else None })
293291
@@ -301,7 +299,7 @@ async def grade(
301299 code_only = self .judge .code_only ,
302300 resources_provided = self .judge .resources_provided ,
303301 judge_output = judge_output ,
304- reproduction_output = repro_output ,
302+ reproduction_metadata = repro_metadata ,
305303 monitor_ran = mon_ran ,
306304 monitor_result = mon_result ,
307305 ),
@@ -346,24 +344,23 @@ def _run_monitor(self, log_file_path: str) -> MonitorResult:
346344 monitor_result = monitor .check_log (log_file_path )
347345 return monitor_result
348346
349- async def _run_reproduce (self , submission : str ) -> ReproductionOutput :
347+ async def _run_reproduce (self , submission : str ) -> ReproductionMetadata | None :
350348 """Runs the reproduction process for the submission associated with the PBTask."""
351349 ctx_logger = logger .bind (
352350 run_group_id = self .run_group_id ,
353351 run_id = self .run_id ,
354352 runs_dir = self .runs_dir ,
355353 )
356354 ctx_logger .info (
357- f"Starting the reproduction process for `{ self .question_id } . { self . attempt_id } `..." ,
355+ f"Starting the reproduction process for `{ self .run_id } `..." ,
358356 destinations = ["group" , "run" ],
359357 _print = True ,
360358 )
361359
360+ metadata : ReproductionMetadata | None = None
362361 reproduce_output_path = submission .replace (".tar.gz" , "_executed.tar.gz" )
363362 repro_metadata_path = submission .replace (".tar.gz" , "_executed_metadata.json" )
364363
365- ctx_logger .info (f"Reproducing submission { reproduce_output_path } ..." , destinations = ["run" ])
366-
367364 # If the reproduction output already exists, we can skip reproduction
368365 if not self .reproduction .overwrite_existing_output :
369366 repro_output_exists = bf .exists (reproduce_output_path )
@@ -374,62 +371,37 @@ async def _run_reproduce(self, submission: str) -> ReproductionOutput:
374371 destinations = ["run" ],
375372 )
376373 with bf .BlobFile (repro_metadata_path , "r" ) as f :
377- data = json .loads (f .read ())
378- metadata = ReproductionMetadata .from_dict (data )
379- return ReproductionOutput (
380- executed_submission = reproduce_output_path ,
381- metadata = metadata ,
382- )
383-
384- # Reproduce on alcatraz
385- async with self ._start_computer (self .reproduction .cluster_config ) as computer :
386- await reproduce_on_computer (
387- computer = computer ,
374+ metadata = ReproductionMetadata .from_dict (json .loads (f .read ()))
375+ metadata = replace (metadata , executed_submission = reproduce_output_path )
376+ return metadata
377+
378+ # Reproduce on alcatraz and collect metadata
379+ try :
380+ metadata = await reproduce_on_computer (
381+ cluster_config = self .reproduction .cluster_config ,
388382 submission_path = submission ,
389383 logger = ctx_logger .bind (destinations = ["run" ]),
390384 run_dir = self .run_dir ,
391385 timeout = self .reproduction .timeout ,
392386 retry_threshold = self .reproduction .retry_threshold ,
393387 )
388+ except Exception as e :
389+ logger .exception (f"Reproduction failed with error:\n { str (e )} " )
394390
395- # Now the result should exist
396- repro_output_exists = bf .exists (reproduce_output_path )
397- repro_metadata_exists = bf .exists (repro_metadata_path )
398- if not repro_output_exists :
399- ctx_logger .exception (
400- f"Reproduction failed to produce output: { reproduce_output_path } " ,
401- destinations = ["group" , "run" ],
402- _print = True ,
403- )
404- return ReproductionOutput (
405- executed_submission = reproduce_output_path ,
406- metadata = None ,
407- )
408- if not repro_metadata_exists :
391+ if metadata is None :
409392 ctx_logger .exception (
410393 f"Reproduction failed to produce metadata: { repro_metadata_path } " ,
411394 destinations = ["group" , "run" ],
412395 _print = True ,
413396 )
414- return ReproductionOutput (
415- executed_submission = reproduce_output_path ,
416- metadata = None ,
417- )
418-
419- with bf .BlobFile (repro_metadata_path , "r" ) as f :
420- data = json .loads (f .read ())
421- metadata = ReproductionMetadata .from_dict (data )
422397
423398 ctx_logger .info (
424- f"The reproduction process for { self .question_id } . { self . attempt_id } has finished!" ,
399+ f"The reproduction process for { self .run_id } has finished!" ,
425400 destinations = ["group" , "run" ],
426401 _print = True ,
427402 )
428403
429- return ReproductionOutput (
430- executed_submission = reproduce_output_path ,
431- metadata = metadata ,
432- )
404+ return metadata
433405
434406 async def _select_checkpoint (self ) -> tuple [str , timedelta ] | None :
435407 """Identifies the submission tarball to use for reproduction/grading."""
0 commit comments