Skip to content

Commit 139c004

Browse files
committed
Detect crashed test runners and report and exit
Probably a better job could be done than this, but this should get us a clean failure instead of a long hang.
1 parent caff8be commit 139c004

File tree

2 files changed

+57
-5
lines changed

2 files changed

+57
-5
lines changed

edb/tools/test/mproc_fixes.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828

2929

3030
_orig_pool_worker_handler = None
31+
_orig_pool_join_exited_workers = None
3132

3233
logger = logging.getLogger(__name__)
3334

@@ -92,9 +93,17 @@ def multiprocessing_worker_handler(*args):
9293
worker_process.join(timeout=10)
9394

9495

96+
def join_exited_workers(pool):
97+
# Our use case shouldn't have workers exiting really, so we skip
98+
# doing the joins so that we can detect crashes ourselves in the
99+
# test runner.x
100+
pass
101+
102+
95103
def patch_multiprocessing(debug: bool):
96104
global _orig_pool_worker
97105
global _orig_pool_worker_handler
106+
global _orig_pool_join_exited_workers
98107

99108
if debug:
100109
multiprocessing.util.log_to_stderr(logging.DEBUG)
@@ -111,3 +120,9 @@ def patch_multiprocessing(debug: bool):
111120
# Allow workers some time to shut down gracefully.
112121
_orig_pool_worker_handler = multiprocessing.pool.Pool._handle_workers
113122
multiprocessing.pool.Pool._handle_workers = multiprocessing_worker_handler
123+
124+
_orig_pool_join_exited_workers = (
125+
multiprocessing.pool.Pool._join_exited_workers)
126+
multiprocessing.pool.Pool._join_exited_workers = join_exited_workers
127+
128+
multiprocessing.pool.Pool._crashed_workers = None

edb/tools/test/runner.py

Lines changed: 42 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -166,15 +166,17 @@ def _run(self, test, result):
166166
getattr(result, '_moduleSetUpFailed', False)):
167167
return
168168

169+
result.annotate_test(test, {
170+
'py-hash-secret': py_hash_secret,
171+
'py-random-seed': py_random_seed,
172+
'runner-pid': os.getpid(),
173+
})
174+
169175
start = time.monotonic()
170176
test.run(result)
171177
elapsed = time.monotonic() - start
172178

173179
result.record_test_stats(test, {'running-time': elapsed})
174-
result.annotate_test(test, {
175-
'py-hash-secret': py_hash_secret,
176-
'py-random-seed': py_random_seed,
177-
})
178180

179181
result._testRunEntered = False
180182
return result
@@ -374,6 +376,28 @@ def run(self, result):
374376
try:
375377
ar.get(timeout=0.1)
376378
except multiprocessing.TimeoutError:
379+
# multiprocessing doesn't handle processes
380+
# crashing very well, so we check ourselves
381+
# (having disabled its own child pruning in
382+
# mproc_fixes)
383+
#
384+
# TODO: Should we look into using
385+
# concurrent.futures.ProcessPoolExecutor
386+
# instead?
387+
for p in pool._pool:
388+
if p.exitcode:
389+
tmsg = ''
390+
if isinstance(result, ParallelTextTestResult):
391+
test = result.current_pids.get(p.pid)
392+
tmsg = f' while running {test}'
393+
print(
394+
f"ERROR: Test worker {p.pid} crashed with "
395+
f"exit code {p.exitcode}{tmsg}",
396+
file=sys.stderr,
397+
)
398+
sys.stderr.flush()
399+
os._exit(1)
400+
377401
if self.stop_requested:
378402
break
379403
else:
@@ -744,6 +768,7 @@ def __init__(self, *, stream, verbosity, warnings, tests,
744768
self.warnings = []
745769
self.notImplemented = []
746770
self.currently_running = {}
771+
self.current_pids = {}
747772
# An index of all seen warnings to keep track
748773
# of repeated warnings.
749774
self._warnings = {}
@@ -775,7 +800,14 @@ def report_still_running(self):
775800
for test, start in self.currently_running.items():
776801
running_for = now - start
777802
if running_for > 5.0:
778-
still_running[test] = running_for
803+
key = str(test)
804+
if (
805+
test in self.test_annotations
806+
and (pid := self.test_annotations[test].get('runner-pid'))
807+
):
808+
key = f'{key} (pid={pid})'
809+
810+
still_running[key] = running_for
779811
if still_running:
780812
self.ren.report_still_running(still_running)
781813

@@ -800,6 +832,11 @@ def startTest(self, test):
800832
self.currently_running[test] = time.monotonic()
801833
self.ren.report_start(
802834
test, currently_running=list(self.currently_running))
835+
if (
836+
test in self.test_annotations
837+
and (pid := self.test_annotations[test].get('runner-pid'))
838+
):
839+
self.current_pids[pid] = test
803840

804841
def addSuccess(self, test):
805842
super().addSuccess(test)

0 commit comments

Comments
 (0)