Skip to content

Commit f050409

Browse files
fix salvaging logic
1 parent 5c39f95 commit f050409

2 files changed

Lines changed: 109 additions & 41 deletions

File tree

project/paperbench/paperbench/nano/task.py

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -45,7 +45,10 @@
4545
)
4646
from paperbench.nano.utils import get_file_at_duration
4747
from paperbench.paper_registry import paper_registry
48-
from paperbench.scripts.run_reproduce import ReproductionMetadata, reproduce_on_computer
48+
from paperbench.scripts.run_reproduce import (
49+
ReproductionMetadata,
50+
reproduce_on_computer_with_salvaging,
51+
)
4952
from paperbench.utils import purple
5053

5154
GRADER_OPENAI_API_KEY = os.getenv("GRADER_OPENAI_API_KEY") or os.getenv("OPENAI_API_KEY")
@@ -377,7 +380,7 @@ async def _run_reproduce(self, submission: str) -> ReproductionMetadata | None:
377380

378381
# Reproduce on alcatraz and collect metadata
379382
try:
380-
metadata = await reproduce_on_computer(
383+
metadata = await reproduce_on_computer_with_salvaging(
381384
cluster_config=self.reproduction.cluster_config,
382385
submission_path=submission,
383386
logger=ctx_logger.bind(destinations=["run"]),

project/paperbench/paperbench/scripts/run_reproduce.py

Lines changed: 104 additions & 39 deletions
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,7 @@
33
import asyncio
44
import json
55
import time
6-
from dataclasses import asdict, dataclass, field
6+
from dataclasses import asdict, dataclass, field, replace
77
from pathlib import Path
88
from typing import Any
99

@@ -129,16 +129,17 @@ async def reproduce(
129129
submission_path: Path,
130130
logger: BoundLogger,
131131
timeout: float | None = None,
132-
retry_threshold: float = 0,
132+
use_py3_11: bool = False,
133+
make_venv: bool = False,
133134
) -> ReproductionMetadata:
134135
"""
135136
args:
136137
computer: ComputerInterface on which the reproduce.sh script will be run
137138
submission_path: Path to the submission directory
138139
logger: Logger object to log messages
139140
timeout: (optional) Timeout for the reproduce.sh script
140-
retry_threshold: (optional) If greater than 0 and timeout, when the reproduce.sh runs for
141-
less than this threshold of seconds, it is retried with series of arbitrary/generic fixes
141+
use_py3_11: (optional) Whether to switch python3 to python3.11 before running
142+
make_venv: (optional) Whether to create and use a virtualenv before running
142143
"""
143144
# get git history for interest
144145
cmd_str = f"bash -c 'cd {submission_path} && git --no-pager log'"
@@ -173,33 +174,14 @@ async def reproduce(
173174
cmd_str = f"bash -c 'git config --global --add safe.directory {submission_path}'"
174175
await computer.send_shell_command(cmd_str)
175176

176-
repro_outcomes: list[ReproScriptRunOutcome] = []
177-
repro_outcome = await run_reproduce_script(computer, logger, submission_path, timeout)
178-
repro_outcomes.append(repro_outcome)
179-
180-
valid_threshold = True if timeout is None else retry_threshold < timeout
181-
retries_enabled = retry_threshold > 0 and valid_threshold
182-
script_ran_quickly = repro_outcome.repro_execution_time <= retry_threshold
183-
184-
# only ran shortly, something trivial might be broken: maybe trivial fixes help, so retry
185-
if retries_enabled and script_ran_quickly:
186-
logger.info("Reproduce.sh ran for <= 10 minutes, retrying with small fixes")
187-
retry_options = [
188-
{"use_py3_11": True, "make_venv": False},
189-
{"use_py3_11": False, "make_venv": True},
190-
{"use_py3_11": True, "make_venv": True},
191-
]
192-
for retry_opts in retry_options:
193-
repro_outcome = await run_reproduce_script(
194-
computer, logger, submission_path, timeout, **retry_opts
195-
)
196-
repro_outcomes.append(repro_outcome)
197-
if repro_outcome.repro_execution_time > retry_threshold:
198-
logger.info("Reproduce.sh ran for more than 10 minutes, breaking out of retry loop")
199-
break
200-
if repro_outcome.repro_execution_time <= retry_threshold:
201-
logger.info("Reproduce.sh still ran for <= 10 minutes, giving up")
202-
final_outcome = repro_outcomes[-1]
177+
repro_outcome = await run_reproduce_script(
178+
computer=computer,
179+
logger=logger,
180+
submission_path=submission_path,
181+
timeout=timeout,
182+
use_py3_11=use_py3_11,
183+
make_venv=make_venv,
184+
)
203185

204186
result = await computer.check_shell_command(f"ls -la {submission_path}")
205187
files_after_reproduce = result.output.decode("utf-8")
@@ -211,13 +193,13 @@ async def reproduce(
211193
is_valid_git_repo=is_valid_git_repo,
212194
git_log=git_log,
213195
repro_script_exists=repro_script_exists,
214-
repro_execution_time=final_outcome.repro_execution_time,
215-
repro_log=final_outcome.repro_log,
196+
repro_execution_time=repro_outcome.repro_execution_time,
197+
repro_log=repro_outcome.repro_log,
216198
files_before_reproduce=files_before_reproduce,
217199
files_after_reproduce=files_after_reproduce,
218200
git_status_after_reproduce=git_status,
219-
timedout=final_outcome.timedout,
220-
retried_results=repro_outcomes[:-1],
201+
timedout=repro_outcome.timedout,
202+
# will populate retried_results and executed_submission later
221203
)
222204

223205

@@ -229,8 +211,9 @@ async def reproduce_on_computer(
229211
submission_cluster_path: Path = Path("/submission"),
230212
output_cluster_path: Path = Path("/output"),
231213
timeout: float | None = None,
232-
retry_threshold: float = 0,
233-
) -> ReproductionMetadata | None:
214+
use_py3_11: bool = False,
215+
make_venv: bool = False,
216+
) -> ReproductionMetadata:
234217
"""
235218
Reproduce a single submission on a computer.
236219
@@ -245,7 +228,6 @@ async def reproduce_on_computer(
245228
)
246229
async with start_alcatraz_computer(cluster_config) as computer:
247230
time_start = time.time()
248-
repro_metadata: ReproductionMetadata | None = None
249231

250232
await computer.check_shell_command(
251233
f"mkdir -p {output_cluster_path} {submission_cluster_path}"
@@ -260,7 +242,8 @@ async def reproduce_on_computer(
260242
submission_path=submission_cluster_path,
261243
logger=logger,
262244
timeout=timeout,
263-
retry_threshold=retry_threshold,
245+
use_py3_11=use_py3_11,
246+
make_venv=make_venv,
264247
)
265248

266249
# Step 3: Save metadata
@@ -286,3 +269,85 @@ async def reproduce_on_computer(
286269
logger.info(f"Reproduction completed in {time_end - time_start:.2f} seconds.")
287270

288271
return repro_metadata
272+
273+
274+
async def reproduce_on_computer_with_salvaging(
    cluster_config: ClusterConfig,
    submission_path: str,
    logger: BoundLogger,
    run_dir: str,
    submission_cluster_path: Path = Path("/submission"),
    output_cluster_path: Path = Path("/output"),
    timeout: float | None = None,
    retry_threshold: float = 0,
) -> ReproductionMetadata:
    """
    Reproduce a single submission on a computer,
    salvaging reproduce attempts by retrying with slightly different configurations.

    Every attempt is a separate call to `reproduce_on_computer`. The first attempt uses the
    default configuration (no python3.11 switch, no virtualenv). Further attempts are only
    made when retries are enabled and the previous attempt ran for less than
    `retry_threshold` seconds.

    args:
        cluster_config: forwarded unchanged to `reproduce_on_computer`
        submission_path: forwarded unchanged to `reproduce_on_computer`
        logger: Logger object to log messages
        run_dir: forwarded unchanged to `reproduce_on_computer`
        submission_cluster_path: forwarded unchanged to `reproduce_on_computer`
        output_cluster_path: forwarded unchanged to `reproduce_on_computer`
        timeout: (optional) Timeout forwarded to each reproduce attempt
        retry_threshold: (optional) Retries are enabled only if this is greater than 0 and,
            when a timeout is given, smaller than the timeout. An attempt that ran for less
            than this many seconds is retried with the next configuration.

    returns:
        The metadata of the last attempt that was made, with `retried_results` populated
        from all earlier attempts (if there were any).
    """
    # a threshold at or above the timeout is treated as invalid and disables retries
    valid_threshold = timeout is None or retry_threshold < timeout
    retries_enabled = retry_threshold > 0 and valid_threshold

    # the default configuration always runs first; the alternatives only when retrying
    retry_options = [{"use_py3_11": False, "make_venv": False}]
    if retries_enabled:
        retry_options.extend(
            [
                {"use_py3_11": True, "make_venv": False},
                {"use_py3_11": False, "make_venv": True},
                {"use_py3_11": True, "make_venv": True},
            ]
        )

    repro_attempts: list[ReproductionMetadata] = []
    last_index = len(retry_options) - 1

    for index, opts in enumerate(retry_options):
        logger.info(
            f"Executing reproduce.sh with py3_11={opts['use_py3_11']}"
            f" and make_venv={opts['make_venv']}"
        )
        repro_attempt = await reproduce_on_computer(
            cluster_config=cluster_config,
            submission_path=submission_path,
            logger=logger,
            run_dir=run_dir,
            submission_cluster_path=submission_cluster_path,
            output_cluster_path=output_cluster_path,
            timeout=timeout,
            use_py3_11=opts["use_py3_11"],
            make_venv=opts["make_venv"],
        )
        repro_attempts.append(repro_attempt)

        if not _should_retry(retries_enabled, repro_attempt, retry_threshold):
            break  # this last attempt was it

        if index == last_index:
            # nothing left to try: do not claim that another retry is coming
            logger.info(
                f"Reproduction attempt ran for less than {retry_threshold} seconds,"
                " but no configurations are left to try, giving up.",
            )
        else:
            logger.info(
                f"Reproduction attempt ran for less than {retry_threshold} seconds,"
                " retrying with different configuration.",
            )

    # the most recent attempt is the result; earlier ones are recorded as retried results
    repro_metadata = repro_attempts[-1]
    repro_metadata = _populate_retried_results(repro_metadata, repro_attempts[:-1])

    return repro_metadata
333+
334+
335+
def _should_retry(
    retries_enabled: bool, repro_attempt: ReproductionMetadata, retry_threshold: float
) -> bool:
    """
    Decide whether reproduce.sh should be run again after `repro_attempt`.

    A retry is warranted only when retries are enabled and the attempt ran for strictly
    less than `retry_threshold` seconds. A missing (falsy) execution time counts as 0.
    """
    elapsed = repro_attempt.repro_execution_time or 0
    if not retries_enabled:
        # hand the falsy flag back unchanged, exactly as `a and b` would
        return retries_enabled
    return elapsed < retry_threshold
341+
342+
343+
def _populate_retried_results(
    repro_metadata: ReproductionMetadata, repro_attempts: list[ReproductionMetadata]
) -> ReproductionMetadata:
    """
    Record earlier attempts in `retried_results` of the given metadata.

    Each earlier attempt is condensed into a ReproScriptRunOutcome built from its
    execution time (a missing value counts as 0), timed-out flag and log. With no earlier
    attempts the metadata is returned untouched; otherwise a copy made with
    `dataclasses.replace` is returned.
    """
    if not repro_attempts:
        return repro_metadata

    earlier_outcomes = []
    for attempt in repro_attempts:
        seconds = float(attempt.repro_execution_time or 0)
        earlier_outcomes.append(
            ReproScriptRunOutcome(seconds, attempt.timedout, attempt.repro_log)
        )
    return replace(repro_metadata, retried_results=earlier_outcomes)

0 commit comments

Comments
 (0)