33import asyncio
44import json
55import time
6- from dataclasses import asdict , dataclass , field
6+ from dataclasses import asdict , dataclass , field , replace
77from pathlib import Path
88from typing import Any
99
@@ -129,16 +129,17 @@ async def reproduce(
129129 submission_path : Path ,
130130 logger : BoundLogger ,
131131 timeout : float | None = None ,
132- retry_threshold : float = 0 ,
132+ use_py3_11 : bool = False ,
133+ make_venv : bool = False ,
133134) -> ReproductionMetadata :
134135 """
135136 args:
136137 computer: ComputerInterface on which the reproduce.sh script will be run
137138 submission_path: Path to the submission directory
138139 logger: Logger object to log messages
139140 timeout: (optional) Timeout for the reproduce.sh script
140- retry_threshold : (optional) If greater than 0 and timeout, when the reproduce.sh runs for
141- less than this threshold of seconds, it is retried with series of arbitrary/generic fixes
141+ use_py3_11 : (optional) Whether to switch python3 to python3.11 before running
142+ make_venv: (optional) Whether to create and use a virtualenv before running
142143 """
143144 # get git history for interest
144145 cmd_str = f"bash -c 'cd { submission_path } && git --no-pager log'"
@@ -173,33 +174,14 @@ async def reproduce(
173174 cmd_str = f"bash -c 'git config --global --add safe.directory { submission_path } '"
174175 await computer .send_shell_command (cmd_str )
175176
176- repro_outcomes : list [ReproScriptRunOutcome ] = []
177- repro_outcome = await run_reproduce_script (computer , logger , submission_path , timeout )
178- repro_outcomes .append (repro_outcome )
179-
180- valid_threshold = True if timeout is None else retry_threshold < timeout
181- retries_enabled = retry_threshold > 0 and valid_threshold
182- script_ran_quickly = repro_outcome .repro_execution_time <= retry_threshold
183-
184- # only ran shortly, something trivial might be broken: maybe trivial fixes help, so retry
185- if retries_enabled and script_ran_quickly :
186- logger .info ("Reproduce.sh ran for <= 10 minutes, retrying with small fixes" )
187- retry_options = [
188- {"use_py3_11" : True , "make_venv" : False },
189- {"use_py3_11" : False , "make_venv" : True },
190- {"use_py3_11" : True , "make_venv" : True },
191- ]
192- for retry_opts in retry_options :
193- repro_outcome = await run_reproduce_script (
194- computer , logger , submission_path , timeout , ** retry_opts
195- )
196- repro_outcomes .append (repro_outcome )
197- if repro_outcome .repro_execution_time > retry_threshold :
198- logger .info ("Reproduce.sh ran for more than 10 minutes, breaking out of retry loop" )
199- break
200- if repro_outcome .repro_execution_time <= retry_threshold :
201- logger .info ("Reproduce.sh still ran for <= 10 minutes, giving up" )
202- final_outcome = repro_outcomes [- 1 ]
177+ repro_outcome = await run_reproduce_script (
178+ computer = computer ,
179+ logger = logger ,
180+ submission_path = submission_path ,
181+ timeout = timeout ,
182+ use_py3_11 = use_py3_11 ,
183+ make_venv = make_venv ,
184+ )
203185
204186 result = await computer .check_shell_command (f"ls -la { submission_path } " )
205187 files_after_reproduce = result .output .decode ("utf-8" )
@@ -211,13 +193,13 @@ async def reproduce(
211193 is_valid_git_repo = is_valid_git_repo ,
212194 git_log = git_log ,
213195 repro_script_exists = repro_script_exists ,
214- repro_execution_time = final_outcome .repro_execution_time ,
215- repro_log = final_outcome .repro_log ,
196+ repro_execution_time = repro_outcome .repro_execution_time ,
197+ repro_log = repro_outcome .repro_log ,
216198 files_before_reproduce = files_before_reproduce ,
217199 files_after_reproduce = files_after_reproduce ,
218200 git_status_after_reproduce = git_status ,
219- timedout = final_outcome .timedout ,
220- retried_results = repro_outcomes [: - 1 ],
201+ timedout = repro_outcome .timedout ,
202+ # will populate retried_results and executed_submission later
221203 )
222204
223205
@@ -229,8 +211,9 @@ async def reproduce_on_computer(
229211 submission_cluster_path : Path = Path ("/submission" ),
230212 output_cluster_path : Path = Path ("/output" ),
231213 timeout : float | None = None ,
232- retry_threshold : float = 0 ,
233- ) -> ReproductionMetadata | None :
214+ use_py3_11 : bool = False ,
215+ make_venv : bool = False ,
216+ ) -> ReproductionMetadata :
234217 """
235218 Reproduce a single submission on a computer.
236219
@@ -245,7 +228,6 @@ async def reproduce_on_computer(
245228 )
246229 async with start_alcatraz_computer (cluster_config ) as computer :
247230 time_start = time .time ()
248- repro_metadata : ReproductionMetadata | None = None
249231
250232 await computer .check_shell_command (
251233 f"mkdir -p { output_cluster_path } { submission_cluster_path } "
@@ -260,7 +242,8 @@ async def reproduce_on_computer(
260242 submission_path = submission_cluster_path ,
261243 logger = logger ,
262244 timeout = timeout ,
263- retry_threshold = retry_threshold ,
245+ use_py3_11 = use_py3_11 ,
246+ make_venv = make_venv ,
264247 )
265248
266249 # Step 3: Save metadata
@@ -286,3 +269,85 @@ async def reproduce_on_computer(
286269 logger .info (f"Reproduction completed in { time_end - time_start :.2f} seconds." )
287270
288271 return repro_metadata
272+
273+
async def reproduce_on_computer_with_salvaging(
    cluster_config: ClusterConfig,
    submission_path: str,
    logger: BoundLogger,
    run_dir: str,
    submission_cluster_path: Path = Path("/submission"),
    output_cluster_path: Path = Path("/output"),
    timeout: float | None = None,
    retry_threshold: float = 0,
) -> ReproductionMetadata:
    """
    Reproduce a single submission on a computer,
    salvaging reproduce attempts by retrying with slightly different configurations.

    The first attempt always runs with ``use_py3_11=False`` and ``make_venv=False``.
    When retries are enabled, up to three further attempts are made, in this order:
    python3.11 only, virtualenv only, then both. Every attempt is a separate call
    to ``reproduce_on_computer``. The loop stops at the first attempt for which
    ``_should_retry`` returns False, or once all configurations have been tried.

    args:
        cluster_config: forwarded unchanged to ``reproduce_on_computer``
        submission_path: forwarded unchanged to ``reproduce_on_computer``
        logger: Logger object to log messages
        run_dir: forwarded unchanged to ``reproduce_on_computer``
        submission_cluster_path: forwarded unchanged to ``reproduce_on_computer``
        output_cluster_path: forwarded unchanged to ``reproduce_on_computer``
        timeout: (optional) Timeout forwarded to every attempt
        retry_threshold: (optional) Retries are enabled only when this is greater
            than 0 and, if a timeout is given, smaller than that timeout. An attempt
            whose execution time is below this many seconds triggers the next
            configuration.

    returns:
        The metadata of the last attempt that was executed, passed through
        ``_populate_retried_results`` together with all earlier attempts.
    """
    # A threshold at or above the timeout would make every run look "too short",
    # so retries are only meaningful when the threshold is below the timeout.
    valid_threshold = timeout is None or retry_threshold < timeout
    retries_enabled = retry_threshold > 0 and valid_threshold

    # The plain configuration always runs first; the alternatives are only
    # queued when retries are enabled.
    retry_options = [{"use_py3_11": False, "make_venv": False}]
    if retries_enabled:
        retry_options.extend(
            [
                {"use_py3_11": True, "make_venv": False},
                {"use_py3_11": False, "make_venv": True},
                {"use_py3_11": True, "make_venv": True},
            ]
        )

    repro_attempts: list[ReproductionMetadata] = []
    last_index = len(retry_options) - 1

    for index, opts in enumerate(retry_options):
        logger.info(
            f"Executing reproduce.sh with py3_11={opts['use_py3_11']}"
            f" and make_venv={opts['make_venv']}"
        )
        repro_attempt = await reproduce_on_computer(
            cluster_config=cluster_config,
            submission_path=submission_path,
            logger=logger,
            run_dir=run_dir,
            submission_cluster_path=submission_cluster_path,
            output_cluster_path=output_cluster_path,
            timeout=timeout,
            use_py3_11=opts["use_py3_11"],
            make_venv=opts["make_venv"],
        )
        repro_attempts.append(repro_attempt)

        if not _should_retry(retries_enabled, repro_attempt, retry_threshold):
            break  # this last attempt was it

        if index < last_index:
            logger.info(
                f"Reproduction attempt ran for less than {retry_threshold} seconds,"
                " retrying with different configuration.",
            )
        else:
            # Every configuration has been tried; do not claim another retry.
            logger.info(
                f"Reproduction attempt ran for less than {retry_threshold} seconds,"
                " no configurations left to retry with, giving up.",
            )

    # The final attempt is the reported result; earlier ones become its history.
    return _populate_retried_results(repro_attempts[-1], repro_attempts[:-1])
333+
334+
def _should_retry(
    retries_enabled: bool, repro_attempt: ReproductionMetadata, retry_threshold: float
) -> bool:
    """
    Decide whether reproduce.sh should be run again with another configuration.

    A retry is warranted only when retries are enabled and the attempt's
    execution time is strictly below ``retry_threshold``. A missing (``None``)
    execution time is counted as 0.
    """
    elapsed = repro_attempt.repro_execution_time
    if not elapsed:
        # No recorded execution time: treat the attempt as having run for 0 seconds.
        elapsed = 0
    return retries_enabled and elapsed < retry_threshold
341+
342+
def _populate_retried_results(
    repro_metadata: ReproductionMetadata, repro_attempts: list[ReproductionMetadata]
) -> ReproductionMetadata:
    """
    Attach a summary of earlier attempts to ``repro_metadata.retried_results``.

    Each entry of ``repro_attempts`` is condensed into a ``ReproScriptRunOutcome``
    built from its execution time (``None`` is counted as 0), its ``timedout``
    flag and its ``repro_log``. With no earlier attempts, ``repro_metadata`` is
    returned as-is; otherwise a copy made with ``dataclasses.replace`` is returned.
    """
    if not repro_attempts:
        return repro_metadata

    earlier_outcomes = []
    for attempt in repro_attempts:
        elapsed = float(attempt.repro_execution_time or 0)
        # Positional arguments: assumes the outcome fields are declared in the
        # order (execution time, timedout, log) - TODO confirm against the dataclass.
        earlier_outcomes.append(
            ReproScriptRunOutcome(elapsed, attempt.timedout, attempt.repro_log)
        )
    return replace(repro_metadata, retried_results=earlier_outcomes)
0 commit comments