Skip to content

Commit 7b7e9c3

Browse files
add retries when starting alcatraz computer
1 parent afc5e3a commit 7b7e9c3

3 files changed

Lines changed: 28 additions & 6 deletions

File tree

project/paperbench/paperbench/nano/eval.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
from dotenv import load_dotenv
1212

1313
load_dotenv()
14-
from nanoeval_alcatraz.alcatraz_computer_interface import AlcatrazComputerInterface
1514
from nanoeval_alcatraz.task_to_alcatraz_config import task_to_alcatraz_config
1615
from typing_extensions import override
1716

@@ -43,6 +42,7 @@
4342
from paperbench.nano.task import PBTask
4443
from paperbench.nano.utils import SPLIT_TO_EXPECTED_PAPERS, gather_eval_runs
4544
from paperbench.paper_registry import paper_registry
45+
from paperbench.scripts.alcatraz_services import start_alcatraz_computer
4646
from paperbench.utils import (
4747
create_run_dir,
4848
create_run_id,
@@ -57,6 +57,7 @@
5757

5858
MIN_UPLOAD_INTERVAL_MESSAGES = 5
5959
MIN_UPLOAD_INTERVAL_SECONDS = 1800
60+
MAX_CLUSTER_START_ATTEMPTS = 5
6061

6162

6263
GRADER_OPENAI_API_KEY = os.getenv("GRADER_OPENAI_API_KEY") or os.getenv("OPENAI_API_KEY")
@@ -166,8 +167,11 @@ async def _start_computer(self, task: PBTask) -> AsyncGenerator[ComputerInterfac
166167
is_nvidia_gpu_env=self.is_nvidia_gpu_env,
167168
)
168169

169-
async with alcatraz_config.build() as cluster:
170-
yield AlcatrazComputerInterface(cluster_value=cluster)
170+
async with start_alcatraz_computer(
171+
cluster_config=alcatraz_config,
172+
max_attempts=MAX_CLUSTER_START_ATTEMPTS,
173+
) as computer:
174+
yield computer
171175

172176
@override
173177
async def run(self, task: ComputerTask) -> AsyncGenerator[Step | FinalResult, None]:

project/paperbench/paperbench/nano/task.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -383,7 +383,6 @@ async def _run_reproduce(self, submission: str) -> ReproductionMetadata | None:
383383
metadata = replace(metadata, executed_submission=reproduce_output_path)
384384
return metadata
385385

386-
# Reproduce on alcatraz and collect metadata
387386
try:
388387
metadata = await reproduce_on_computer_with_salvaging(
389388
cluster_config=self.reproduction.cluster_config,

project/paperbench/paperbench/scripts/alcatraz_services.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,9 @@
44
import structlog
55
from dotenv import load_dotenv
66
from nanoeval_alcatraz.alcatraz_computer_interface import AlcatrazComputerInterface
7+
from tenacity import RetryCallState, Retrying, retry_if_exception_type, stop_after_attempt
78

9+
from alcatraz.clusters.interface import AlcatrazException
810
from alcatraz.clusters.local import ClusterConfig
911
from nanoeval.solvers.computer_tasks.code_execution_interface import ComputerInterface
1012
from paperbench.infra.alcatraz import put_file_in_computer
@@ -58,7 +60,24 @@ async def put_submission_in_computer(
5860
@asynccontextmanager
async def start_alcatraz_computer(
    cluster_config: ClusterConfig,
    max_attempts: int = 5,
) -> AsyncGenerator[ComputerInterface, None]:
    """Start an AlcatrazComputerInterface from a ClusterConfig, retrying failed starts.

    Only the cluster *startup* (entering ``cluster_config.build()``) is retried.
    Exceptions raised by the caller while the computer is in use, or by the
    cluster teardown, propagate unchanged and never trigger another start.

    Args:
        cluster_config: Configuration whose ``build()`` async context manager
            provides the cluster.
        max_attempts: Maximum number of startup attempts before giving up.

    Yields:
        An ``AlcatrazComputerInterface`` wrapping the started cluster. The
        cluster is torn down when the context exits.

    Raises:
        tenacity.RetryError: If every startup attempt fails with an
            ``AlcatrazException`` (tenacity's default when ``reraise`` is unset).
        Exception: Any non-``AlcatrazException`` startup error is raised
            immediately without retrying.
    """
    # Imported locally so this function does not depend on the module's
    # top-level import list being updated.
    from contextlib import AsyncExitStack

    def before_sleep(state: RetryCallState) -> None:
        # Invoked by tenacity between a failed attempt and the next one.
        exception = state.outcome.exception() if state.outcome else None
        logger.warning(
            f"Cluster start failed on attempt {state.attempt_number} out of {max_attempts}; "
            f"retrying due to '{exception}'"
        )

    retrying = Retrying(
        stop=stop_after_attempt(max_attempts),
        retry=retry_if_exception_type(AlcatrazException),
        before_sleep=before_sleep,
    )

    async with AsyncExitStack() as stack:
        for attempt in retrying:
            with attempt:
                # If entering the cluster context fails, nothing is registered
                # on the stack, so a failed attempt leaves nothing to clean up.
                cluster = await stack.enter_async_context(cluster_config.build())

        # Yield OUTSIDE the retry loop: an asynccontextmanager generator must
        # yield exactly once. Yielding inside `with attempt:` would let an
        # AlcatrazException thrown in by the caller be swallowed and retried,
        # producing a second yield and a RuntimeError from contextlib.
        yield AlcatrazComputerInterface(cluster_value=cluster)

0 commit comments

Comments
 (0)