Skip to content

Commit 5399d80

Browse files
authored
Enh/error on timeout (#683)
* Raise an error when a process stops due to activity timeout This avoids situations where the benchmark continues to run, thinking that the process completed successfully. * Add tests around raising StaleProcessErrors * More explicit warning that activity timeout requires live output * Simplify the exception * Elaborate comment
1 parent 2c25ed0 commit 5399d80

File tree

4 files changed

+56
-5
lines changed

4 files changed

+56
-5
lines changed

amlb/utils/process.py

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,10 @@ def file_lock(path, timeout=-1):
4949
yield
5050

5151

52+
class StaleProcessError(subprocess.TimeoutExpired):
53+
pass
54+
55+
5256
def run_subprocess(
5357
*popenargs,
5458
input=None,
@@ -104,7 +108,20 @@ def communicate(process, input=None, timeout=None):
104108
except: # also handles kb interrupts
105109
process.kill()
106110
raise
111+
112+
# Communication is aborted if either
113+
# - the process has completed, or
114+
# - the process does not produce any output in the specified timeout
115+
# `communicate` can't distinguish those cases. If the process has a
116+
# return code then it stopped already, otherwise we stop it now.
107117
retcode = process.poll()
118+
if retcode is None:
119+
process.kill()
120+
process.wait()
121+
raise StaleProcessError(
122+
process.args, float("nan"), output=stdout, stderr=stderr
123+
)
124+
108125
if check and retcode:
109126
raise subprocess.CalledProcessError(
110127
retcode, process.args, output=stdout, stderr=stderr
@@ -186,6 +203,9 @@ def read_pipe(pipe, timeout):
186203
pipes = as_list(pipe)
187204
# wait until a pipe is ready for reading, non-Windows only.
188205
ready, *_ = select.select(pipes, [], [], timeout)
206+
# if a pipe is not ready it could be timeout or it could be end of process
207+
# so at this point we do not know. Only after the communicate function is over do we know.
208+
# i.e., if the process is still running it does not have a retcode.
189209
reads = [""] * len(pipes)
190210
# print update for each pipe that is ready for reading
191211
for i, p in enumerate(pipes):
@@ -315,8 +335,6 @@ def communicate(*args, **kwargs):
315335
log.log(params.output_level, e.stdout)
316336
if e.stderr:
317337
log.log(params.error_level, e.stderr)
318-
# error_tail = tail(e.stderr, 25) if e.stderr else 'Unknown Error'
319-
# raise subprocess.SubprocessError("Error when running command `{cmd}`: {error}".format(cmd=full_cmd, error=error_tail))
320338
raise e
321339

322340

resources/config.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ archive: ['logs'] # list of output folders that should be archived by defa
2727
setup: # configuration namespace for the framework setup phase.
2828
live_output: true # set to true to stream the output of setup commands, if false they are only printed when setup is complete.
2929
activity_timeout: 600 # when using live output, subprocess will be considered as hanging if nothing was printed during this activity time.
30+
# No effect is live_output is set to 'False'.
3031

3132
frameworks: # configuration namespace for the frameworks definitions.
3233
definition_file: # list of yaml files describing the frameworks base definitions.

runbenchmark.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@
1818
str2bool,
1919
str_sanitize,
2020
zip_path,
21+
StaleProcessError,
22+
Namespace,
2123
)
2224
from amlb import log, AutoMLError
2325
from amlb.defaults import default_dirs
@@ -358,9 +360,19 @@
358360
args.mode,
359361
)
360362

361-
bench.setup(amlb.SetupMode[args.setup])
362-
if args.setup != "only":
363-
res = bench.run(args.task, args.fold)
363+
try:
364+
bench.setup(amlb.SetupMode[args.setup])
365+
except StaleProcessError as e:
366+
setting = "setup.activity_timeout"
367+
timeout = Namespace.get(amlb_res.config, setting)
368+
log.error(
369+
f"Process '{e.cmd}' was aborted after producing no output for {timeout} seconds. "
370+
f"If the process is expected to take more time, please raise the '{setting}' limit."
371+
)
372+
exit_code = 1
373+
else:
374+
if args.setup != "only":
375+
res = bench.run(args.task, args.fold)
364376
except (ValueError, AutoMLError) as e:
365377
log.error("\nERROR:\n%s", e)
366378
if extras.get("verbose") is True:
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
import pytest
2+
3+
from amlb.utils import StaleProcessError, run_cmd
4+
5+
6+
@pytest.mark.parametrize("mode", ["line", "block", True])
7+
def test_subprocess_detects_stale_process(mode):
8+
one_ms = 0.001
9+
with pytest.raises(StaleProcessError):
10+
run_cmd(f"sleep {one_ms}", _activity_timeout_=one_ms / 2, _live_output_=mode)
11+
12+
13+
def test_subprocess_does_not_raises_if_process_exits_early():
14+
run_cmd("echo hi", _activity_timeout_=1, _live_output_=True)
15+
16+
17+
@pytest.mark.xfail(reason="Detection of stale processes currently requires output")
18+
def test_subprocess_does_not_raises_on_silent_process():
19+
one_ms = 0.001
20+
run_cmd(f"sleep {one_ms}", _activity_timeout_=one_ms / 2, _live_output_=True)

0 commit comments

Comments
 (0)