Skip to content

Commit efbf04e

Browse files
Clean up the reproduce flow
1 parent 109fc4c commit efbf04e

8 files changed

Lines changed: 200 additions & 226 deletions

File tree

project/paperbench/paperbench/agents/registry.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,3 +108,19 @@ def get_agent(self, agent_id: str) -> Agent:
108108

109109

110110
registry = AgentRegistry()
111+
112+
113+
def get_agents_env_vars(registry: AgentRegistry) -> dict[str, str]:
    """Parse the registry's ``agent.env`` file of KEY=VALUE lines into a dict.

    Blank lines and comment lines (starting with ``#``, even when indented)
    are ignored, as are malformed lines with no ``=`` separator. The value
    keeps everything after the first ``=``, so values may themselves
    contain ``=``.

    Returns an empty dict (with a warning) if ``agent.env`` does not exist.
    """
    agent_env_path = registry.get_agents_dir() / "agent.env"

    if not agent_env_path.exists():
        logger.warning(f"agent.env not found in {agent_env_path}")
        return {}

    env_vars: dict[str, str] = {}
    with open(agent_env_path, "r") as f:
        for raw_line in f:
            # Strip BEFORE the comment check so indented comments are skipped too
            # (the naive startswith("#") on the raw line would miss them).
            line = raw_line.strip()
            if not line or line.startswith("#"):
                continue
            # Skip malformed lines with no separator instead of raising
            # ValueError on the 2-tuple unpack.
            if "=" not in line:
                logger.warning(f"Skipping malformed line in {agent_env_path}: {line!r}")
                continue
            key, value = line.split("=", 1)  # maxsplit=1: value may contain '='
            env_vars[key] = value
    return env_vars

project/paperbench/paperbench/nano/eval.py

Lines changed: 9 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,7 @@ async def run(self, task: ComputerTask) -> AsyncGenerator[Step | FinalResult, No
193193
code_only=task.judge.code_only,
194194
resources_provided=task.judge.resources_provided,
195195
judge_output=None,
196-
reproduction_output=None,
196+
reproduction_metadata=None,
197197
monitor_result=grade.paperbench_result.monitor_result,
198198
monitor_ran=grade.paperbench_result.monitor_ran,
199199
),
@@ -569,11 +569,7 @@ async def get_full_summary(
569569
[r for r in results_clean if not r.agent_output or not r.submission_exists]
570570
),
571571
"n_reproductions_failed": len(
572-
[
573-
r
574-
for r in results_clean
575-
if not r.reproduction_output or not r.reproduction_output.success
576-
]
572+
[r for r in results_clean if not r.reproduction_metadata]
577573
),
578574
"n_gradings_failed": len(
579575
[r for r in results_clean if not r.judge_output or not r.judge_output.success]
@@ -595,36 +591,32 @@ async def get_full_summary(
595591
other_stats = {
596592
"repro_mean_time": safe_mean(
597593
[
598-
r.reproduction_output.metadata.repro_execution_time # type: ignore
594+
r.reproduction_metadata.repro_execution_time
599595
for r in results_clean
600-
if r.reproduction_output and r.reproduction_output.success
596+
if r.reproduction_metadata and r.reproduction_metadata.repro_execution_time
601597
]
602598
),
603599
"n_is_valid_git_repo": len(
604600
[
605601
r
606602
for r in results_clean
607-
if r.reproduction_output
608-
and r.reproduction_output.success
609-
and r.reproduction_output.metadata.is_valid_git_repo # type: ignore
603+
if r.reproduction_metadata and r.reproduction_metadata.is_valid_git_repo
610604
]
611605
),
612606
"n_nontrivial_git_log": len(
613607
[
614608
r
615609
for r in results_clean
616-
if r.reproduction_output
617-
and r.reproduction_output.success
618-
and len(r.reproduction_output.metadata.git_log.strip().splitlines()) > 1 # type: ignore
610+
if r.reproduction_metadata
611+
and r.reproduction_metadata.git_log is not None
612+
and len(r.reproduction_metadata.git_log.strip().splitlines()) > 1
619613
]
620614
),
621615
"n_repro_script_exists": len(
622616
[
623617
r
624618
for r in results_clean
625-
if r.reproduction_output
626-
and r.reproduction_output.success
627-
and r.reproduction_output.metadata.repro_script_exists # type: ignore
619+
if r.reproduction_metadata and r.reproduction_metadata.repro_script_exists
628620
]
629621
),
630622
}

project/paperbench/paperbench/nano/structs.py

Lines changed: 15 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,7 @@
22

33
import os
44
from dataclasses import dataclass
5-
from pathlib import Path
6-
from typing import Any
5+
from typing import Any, Self
76

87
from dotenv import load_dotenv
98

@@ -15,7 +14,7 @@
1514
from nanoeval.solvers.computer_tasks.task import Grade
1615
from preparedness_turn_completer.oai_turn_completer import OpenAITurnCompleter
1716
from preparedness_turn_completer.turn_completer import TurnCompleter
18-
from pydantic import BaseModel
17+
from pydantic import BaseModel, model_validator
1918

2019
from paperbench.agents.utils import (
2120
AgentOutput,
@@ -28,38 +27,6 @@
2827
logger = structlog.stdlib.get_logger(component=__name__)
2928

3029

31-
class ReproductionOutput(BaseModel):
32-
executed_submission: Path | str | None = None
33-
metadata: ReproductionMetadata | None = None
34-
35-
@classmethod
36-
def from_dict(cls, data: dict[str, Any]) -> ReproductionOutput:
37-
metadata_exists = data.get("metadata") is not None
38-
39-
if metadata_exists:
40-
metadata = ReproductionMetadata.from_dict(data["metadata"])
41-
else:
42-
metadata = None
43-
44-
try:
45-
return cls(
46-
executed_submission=data.get("executed_submission"),
47-
metadata=metadata,
48-
)
49-
except KeyError as e:
50-
raise ValueError("Missing required field in reproduction output") from e
51-
52-
def to_dict(self) -> dict[str, Any]:
53-
return {
54-
"executed_submission": self.executed_submission,
55-
"metadata": self.metadata.to_dict() if self.metadata else None,
56-
}
57-
58-
@property
59-
def success(self) -> bool:
60-
return self.metadata is not None
61-
62-
6330
@dataclass(frozen=False)
6431
class PaperBenchResult:
6532
paper_id: str
@@ -70,7 +37,7 @@ class PaperBenchResult:
7037
resources_provided: bool
7138
agent_output: AgentOutput | None = None
7239
judge_output: JudgeOutput | None = None
73-
reproduction_output: ReproductionOutput | None = None
40+
reproduction_metadata: ReproductionMetadata | None = None
7441
monitor_result: MonitorResult | None = None
7542
monitor_ran: bool = False
7643

@@ -84,7 +51,7 @@ def to_dict(self) -> dict[str, Any]:
8451
"resources_provided": self.resources_provided,
8552
"agent_output": None,
8653
"judge_output": None,
87-
"reproduction_output": None,
54+
"reproduction_metadata": None,
8855
"monitor_result": None,
8956
"monitor_ran": self.monitor_ran,
9057
}
@@ -95,8 +62,8 @@ def to_dict(self) -> dict[str, Any]:
9562
if self.judge_output:
9663
data["judge_output"] = self.judge_output.to_dict()
9764

98-
if self.reproduction_output:
99-
data["reproduction_output"] = self.reproduction_output.to_dict()
65+
if self.reproduction_metadata:
66+
data["reproduction_metadata"] = self.reproduction_metadata.to_dict()
10067

10168
if self.monitor_result:
10269
data["monitor_result"] = self.monitor_result.to_dict()
@@ -106,6 +73,7 @@ def to_dict(self) -> dict[str, Any]:
10673

10774
class ReproductionConfig(BaseModel):
10875
timeout: int = 100 * 3600
76+
# if the reproduce.sh runs for less than this, it will be retried with salvaging fixes
10977
retry_threshold: float = 600
11078
overwrite_existing_output: bool = False
11179
skip_reproduction: bool = False
@@ -114,6 +82,14 @@ class ReproductionConfig(BaseModel):
11482
pull_from_registry=False,
11583
)
11684

85+
@model_validator(mode="after")
86+
def _validate_timeout_and_retry_threshold(self) -> Self:
87+
if self.retry_threshold >= self.timeout:
88+
logger.warning(
89+
"ReproductionConfig.retry_threshold >= ReproductionConfig.timeout, so reproduce.sh salvaging is disabled.",
90+
)
91+
return self
92+
11793

11894
class JudgeConfig(BaseModel):
11995
grade: bool = True

project/paperbench/paperbench/nano/task.py

Lines changed: 27 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import tempfile
66
import time
77
from contextlib import asynccontextmanager, nullcontext
8-
from dataclasses import asdict
8+
from dataclasses import asdict, replace
99
from datetime import timedelta
1010
from pathlib import Path
1111
from typing import Any, AsyncGenerator
@@ -42,12 +42,10 @@
4242
PaperBenchGrade,
4343
PaperBenchResult,
4444
ReproductionConfig,
45-
ReproductionOutput,
4645
)
4746
from paperbench.nano.utils import get_file_at_duration
4847
from paperbench.paper_registry import paper_registry
49-
from paperbench.scripts.alcatraz_services import reproduce_on_computer
50-
from paperbench.scripts.run_reproduce import ReproductionMetadata
48+
from paperbench.scripts.run_reproduce import ReproductionMetadata, reproduce_on_computer
5149
from paperbench.utils import purple
5250

5351
GRADER_OPENAI_API_KEY = os.getenv("GRADER_OPENAI_API_KEY") or os.getenv("OPENAI_API_KEY")
@@ -197,7 +195,7 @@ def _early_exit_grade(
197195
resources_provided=self.judge.resources_provided,
198196
agent_output=None,
199197
judge_output=None,
200-
reproduction_output=None,
198+
reproduction_metadata=None,
201199
monitor_result=monitor_result,
202200
monitor_ran=monitor_ran,
203201
),
@@ -208,12 +206,12 @@ def _early_exit_grade(
208206
self._save_grade(grade)
209207
return grade
210208

211-
def _should_grade(self, reproduction_output: ReproductionOutput | None) -> bool:
209+
def _should_grade(self, reproduction_metadata: ReproductionMetadata | None) -> bool:
212210
"""
213211
We can proceed with grading if reproduction was successful
214212
OR we are in a reproduction-free setup
215213
"""
216-
return (reproduction_output and reproduction_output.success) or (
214+
return (reproduction_metadata is not None) or (
217215
self.reproduction.skip_reproduction or self.judge.code_only
218216
)
219217

@@ -277,17 +275,17 @@ async def grade(
277275
)
278276

279277
# 3. run reproduction
280-
repro_output = None
278+
repro_metadata = None
281279
submission_to_grade_path = path_to_submission
282280
if self._should_reproduce():
283-
repro_output = await self._run_reproduce(path_to_submission)
284-
repro_metadata = repro_output.metadata.to_dict() if repro_output.metadata else {}
281+
repro_metadata = await self._run_reproduce(path_to_submission)
282+
repro_metadata_dict = repro_metadata.to_dict() if repro_metadata else {}
285283
submission_to_grade_path = path_to_executed_submission
286-
self._record_extra({"repro_metadata": repro_metadata})
284+
self._record_extra({"repro_metadata": repro_metadata_dict})
287285

288286
# 4. run judge
289287
judge_output = None
290-
if self._should_grade(repro_output):
288+
if self._should_grade(repro_metadata):
291289
judge_output = await self._run_judge(submission_to_grade_path, self.paper_id)
292290
self._record_extra({"judge_output": judge_output.to_dict() if judge_output else None})
293291

@@ -301,7 +299,7 @@ async def grade(
301299
code_only=self.judge.code_only,
302300
resources_provided=self.judge.resources_provided,
303301
judge_output=judge_output,
304-
reproduction_output=repro_output,
302+
reproduction_metadata=repro_metadata,
305303
monitor_ran=mon_ran,
306304
monitor_result=mon_result,
307305
),
@@ -346,24 +344,23 @@ def _run_monitor(self, log_file_path: str) -> MonitorResult:
346344
monitor_result = monitor.check_log(log_file_path)
347345
return monitor_result
348346

349-
async def _run_reproduce(self, submission: str) -> ReproductionOutput:
347+
async def _run_reproduce(self, submission: str) -> ReproductionMetadata | None:
350348
"""Runs the reproduction process for the submission associated with the PBTask."""
351349
ctx_logger = logger.bind(
352350
run_group_id=self.run_group_id,
353351
run_id=self.run_id,
354352
runs_dir=self.runs_dir,
355353
)
356354
ctx_logger.info(
357-
f"Starting the reproduction process for `{self.question_id}.{self.attempt_id}`...",
355+
f"Starting the reproduction process for `{self.run_id}`...",
358356
destinations=["group", "run"],
359357
_print=True,
360358
)
361359

360+
metadata: ReproductionMetadata | None = None
362361
reproduce_output_path = submission.replace(".tar.gz", "_executed.tar.gz")
363362
repro_metadata_path = submission.replace(".tar.gz", "_executed_metadata.json")
364363

365-
ctx_logger.info(f"Reproducing submission {reproduce_output_path}...", destinations=["run"])
366-
367364
# If the reproduction output already exists, we can skip reproduction
368365
if not self.reproduction.overwrite_existing_output:
369366
repro_output_exists = bf.exists(reproduce_output_path)
@@ -374,62 +371,37 @@ async def _run_reproduce(self, submission: str) -> ReproductionOutput:
374371
destinations=["run"],
375372
)
376373
with bf.BlobFile(repro_metadata_path, "r") as f:
377-
data = json.loads(f.read())
378-
metadata = ReproductionMetadata.from_dict(data)
379-
return ReproductionOutput(
380-
executed_submission=reproduce_output_path,
381-
metadata=metadata,
382-
)
383-
384-
# Reproduce on alcatraz
385-
async with self._start_computer(self.reproduction.cluster_config) as computer:
386-
await reproduce_on_computer(
387-
computer=computer,
374+
metadata = ReproductionMetadata.from_dict(json.loads(f.read()))
375+
metadata = replace(metadata, executed_submission=reproduce_output_path)
376+
return metadata
377+
378+
# Reproduce on alcatraz and collect metadata
379+
try:
380+
metadata = await reproduce_on_computer(
381+
cluster_config=self.reproduction.cluster_config,
388382
submission_path=submission,
389383
logger=ctx_logger.bind(destinations=["run"]),
390384
run_dir=self.run_dir,
391385
timeout=self.reproduction.timeout,
392386
retry_threshold=self.reproduction.retry_threshold,
393387
)
388+
except Exception as e:
389+
logger.exception(f"Reproduction failed with error:\n{str(e)}")
394390

395-
# Now the result should exist
396-
repro_output_exists = bf.exists(reproduce_output_path)
397-
repro_metadata_exists = bf.exists(repro_metadata_path)
398-
if not repro_output_exists:
399-
ctx_logger.exception(
400-
f"Reproduction failed to produce output: {reproduce_output_path}",
401-
destinations=["group", "run"],
402-
_print=True,
403-
)
404-
return ReproductionOutput(
405-
executed_submission=reproduce_output_path,
406-
metadata=None,
407-
)
408-
if not repro_metadata_exists:
391+
if metadata is None:
409392
ctx_logger.exception(
410393
f"Reproduction failed to produce metadata: {repro_metadata_path}",
411394
destinations=["group", "run"],
412395
_print=True,
413396
)
414-
return ReproductionOutput(
415-
executed_submission=reproduce_output_path,
416-
metadata=None,
417-
)
418-
419-
with bf.BlobFile(repro_metadata_path, "r") as f:
420-
data = json.loads(f.read())
421-
metadata = ReproductionMetadata.from_dict(data)
422397

423398
ctx_logger.info(
424-
f"The reproduction process for {self.question_id}.{self.attempt_id} has finished!",
399+
f"The reproduction process for {self.run_id} has finished!",
425400
destinations=["group", "run"],
426401
_print=True,
427402
)
428403

429-
return ReproductionOutput(
430-
executed_submission=reproduce_output_path,
431-
metadata=metadata,
432-
)
404+
return metadata
433405

434406
async def _select_checkpoint(self) -> tuple[str, timedelta] | None:
435407
"""Identifies the submission tarball to use for reproduction/grading."""

project/paperbench/paperbench/reproducer.Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ ENV DEBIAN_FRONTEND=noninteractive
88
RUN apt-get update && \
99
apt-get install -y \
1010
software-properties-common \
11-
wget curl unzip \
11+
wget curl unzip sudo \
1212
build-essential git cmake \
1313
libatlas-base-dev libblas-dev liblapack-dev libopenblas-dev \
1414
gfortran libsm6 libxext6 libxrender-dev && \

0 commit comments

Comments
 (0)