Skip to content

Commit 03f3fad

Browse files
committed
Use heuristic to identify "stable" startup of kernel
This is mainly due to the weakness of the previous method: `is_alive` only tests that the kernel process is alive, it does not indicate that the kernel has successfully completed startup. To solve this correctly, we would need to wait for a kernel info reply, but it is not necessarily appropriate to start a kernel client + channels in the restarter. Therefore, we use a "has been alive continuously for X time" as a heuristic for a stable start up.
1 parent 6889cea commit 03f3fad

File tree

4 files changed

+57
-34
lines changed

4 files changed

+57
-34
lines changed

jupyter_client/ioloop/restarter.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
# Copyright (c) Jupyter Development Team.
77
# Distributed under the terms of the Modified BSD License.
88
import inspect
9+
import time
910
import warnings
1011

1112
from traitlets import Instance
@@ -34,7 +35,10 @@ def start(self):
3435
"""Start the polling of the kernel."""
3536
if self._pcallback is None:
3637
if inspect.isawaitable(self.poll):
37-
cb = lambda: run_sync(self.poll)
38+
39+
def cb():
40+
run_sync(self.poll)
41+
3842
else:
3943
cb = self.poll
4044
self._pcallback = ioloop.PeriodicCallback(
@@ -55,13 +59,15 @@ async def poll(self):
5559
if self.debug:
5660
self.log.debug("Polling kernel...")
5761
is_alive = await self.kernel_manager.is_alive()
62+
now = time.time()
5863
if not is_alive:
64+
self._last_dead = now
5965
if self._restarting:
6066
self._restart_count += 1
6167
else:
6268
self._restart_count = 1
6369

64-
if self._restart_count >= self.restart_limit:
70+
if self._restart_count > self.restart_limit:
6571
self.log.warning("AsyncIOLoopKernelRestarter: restart failed")
6672
self._fire_callbacks("dead")
6773
self._restarting = False
@@ -79,8 +85,8 @@ async def poll(self):
7985
await self.kernel_manager.restart_kernel(now=True, newports=newports)
8086
self._restarting = True
8187
else:
82-
if self._initial_startup:
88+
if self._initial_startup and self._last_dead - now >= self.stable_start_time:
8389
self._initial_startup = False
84-
if self._restarting:
90+
if self._restarting and self._last_dead - now >= self.stable_start_time:
8591
self.log.debug("AsyncIOLoopKernelRestarter: restart apparently succeeded")
86-
self._restarting = False
92+
self._restarting = False

jupyter_client/restarter.py

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,10 @@
77
"""
88
# Copyright (c) Jupyter Development Team.
99
# Distributed under the terms of the Modified BSD License.
10+
import time
11+
1012
from traitlets import Bool # type: ignore
13+
from traitlets import default # type: ignore
1114
from traitlets import Dict
1215
from traitlets import Float
1316
from traitlets import Instance
@@ -31,6 +34,12 @@ class KernelRestarter(LoggingConfigurable):
3134

3235
time_to_dead = Float(3.0, config=True, help="""Kernel heartbeat interval in seconds.""")
3336

37+
stable_start_time = Float(
38+
10.0,
39+
config=True,
40+
help="""The time in seconds to consider the kernel to have completed a stable start up.""",
41+
)
42+
3443
restart_limit = Integer(
3544
5,
3645
config=True,
@@ -45,6 +54,11 @@ class KernelRestarter(LoggingConfigurable):
4554
_restarting = Bool(False)
4655
_restart_count = Integer(0)
4756
_initial_startup = Bool(True)
57+
_last_dead = Float()
58+
59+
@default("_last_dead")
60+
def _default_last_dead(self):
61+
return time.time()
4862

4963
callbacks = Dict()
5064

@@ -103,13 +117,15 @@ def poll(self):
103117
if self.kernel_manager.shutting_down:
104118
self.log.debug("Kernel shutdown in progress...")
105119
return
120+
now = time.time()
106121
if not self.kernel_manager.is_alive():
122+
self._last_dead = now
107123
if self._restarting:
108124
self._restart_count += 1
109125
else:
110126
self._restart_count = 1
111127

112-
if self._restart_count >= self.restart_limit:
128+
if self._restart_count > self.restart_limit:
113129
self.log.warning("KernelRestarter: restart failed")
114130
self._fire_callbacks("dead")
115131
self._restarting = False
@@ -127,8 +143,8 @@ def poll(self):
127143
self.kernel_manager.restart_kernel(now=True, newports=newports)
128144
self._restarting = True
129145
else:
130-
if self._initial_startup:
146+
if self._initial_startup and self._last_dead - now >= self.stable_start_time:
131147
self._initial_startup = False
132-
if self._restarting:
148+
if self._restarting and self._last_dead - now >= self.stable_start_time:
133149
self.log.debug("KernelRestarter: restart apparently succeeded")
134-
self._restarting = False
150+
self._restarting = False

jupyter_client/tests/problemkernel.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,7 @@
22
# Copyright (c) Jupyter Development Team.
33
# Distributed under the terms of the Modified BSD License.
44
import os
5-
import signal
65
import time
7-
from subprocess import PIPE
8-
from subprocess import Popen
96

107
from ipykernel.displayhook import ZMQDisplayHook
118
from ipykernel.kernelapp import IPKernelApp

jupyter_client/tests/test_restarter.py

Lines changed: 26 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,28 +1,22 @@
11
"""Tests for the KernelManager"""
22
# Copyright (c) Jupyter Development Team.
33
# Distributed under the terms of the Modified BSD License.
4-
54
import asyncio
6-
import concurrent.futures
75
import json
86
import os
9-
import signal
107
import sys
11-
import time
12-
from subprocess import PIPE
138

149
import pytest
1510
from jupyter_core import paths
1611
from traitlets.config.loader import Config
1712
from traitlets.log import get_logger
1813

19-
from jupyter_client import AsyncKernelManager
20-
from jupyter_client.ioloop import AsyncIOLoopKernelManager, IOLoopKernelManager
21-
from ..manager import start_new_async_kernel
22-
from ..manager import start_new_kernel
14+
from jupyter_client.ioloop import AsyncIOLoopKernelManager
15+
from jupyter_client.ioloop import IOLoopKernelManager
2316

2417
pjoin = os.path.join
2518

19+
2620
def _install_kernel(name="problemtest", extra_env=None):
2721
if extra_env is None:
2822
extra_env = dict()
@@ -46,29 +40,31 @@ def _install_kernel(name="problemtest", extra_env=None):
4640
)
4741
return name
4842

43+
4944
@pytest.fixture
5045
def install_kernel():
5146
return _install_kernel("problemtest")
5247

48+
5349
@pytest.fixture
5450
def install_fail_kernel():
55-
return _install_kernel("problemtest-fail", extra_env={
56-
"FAIL_ON_START": "1"
57-
})
51+
return _install_kernel("problemtest-fail", extra_env={"FAIL_ON_START": "1"})
52+
5853

5954
@pytest.fixture
6055
def install_slow_fail_kernel():
61-
return _install_kernel("problemtest-slow", extra_env={
62-
"STARTUP_DELAY": "5",
63-
"FAIL_ON_START": "1"
64-
})
56+
return _install_kernel(
57+
"problemtest-slow", extra_env={"STARTUP_DELAY": "5", "FAIL_ON_START": "1"}
58+
)
59+
6560

6661
@pytest.fixture(params=["tcp", "ipc"])
6762
def transport(request):
6863
if sys.platform == "win32" and request.param == "ipc": #
6964
pytest.skip("Transport 'ipc' not supported on Windows.")
7065
return request.param
7166

67+
7268
@pytest.fixture
7369
def config(transport):
7470
c = Config()
@@ -77,6 +73,7 @@ def config(transport):
7773
c.KernelManager.ip = "test"
7874
return c
7975

76+
8077
@pytest.fixture
8178
def debug_logging():
8279
get_logger().setLevel("DEBUG")
@@ -93,6 +90,7 @@ async def test_restart_check(config, install_kernel):
9390

9491
cbs = 0
9592
restarts = [asyncio.Future() for i in range(N_restarts)]
93+
9694
def cb():
9795
nonlocal cbs
9896
if cbs >= N_restarts:
@@ -103,7 +101,7 @@ def cb():
103101
try:
104102
km.start_kernel()
105103
km.add_restart_callback(cb, 'restart')
106-
except:
104+
except BaseException:
107105
if km.has_kernel:
108106
km.shutdown_kernel()
109107
raise
@@ -127,6 +125,7 @@ def cb():
127125
km.shutdown_kernel(now=True)
128126
assert km.context.closed
129127

128+
130129
@pytest.mark.asyncio
131130
async def test_restarter_gives_up(config, install_fail_kernel):
132131
"""Test that the restarter gives up after reaching the restart limit"""
@@ -138,6 +137,7 @@ async def test_restarter_gives_up(config, install_fail_kernel):
138137

139138
cbs = 0
140139
restarts = [asyncio.Future() for i in range(N_restarts)]
140+
141141
def cb():
142142
nonlocal cbs
143143
if cbs >= N_restarts:
@@ -146,14 +146,15 @@ def cb():
146146
cbs += 1
147147

148148
died = asyncio.Future()
149+
149150
def on_death():
150151
died.set_result(True)
151152

152153
try:
153154
km.start_kernel()
154155
km.add_restart_callback(cb, 'restart')
155156
km.add_restart_callback(on_death, 'dead')
156-
except:
157+
except BaseException:
157158
if km.has_kernel:
158159
km.shutdown_kernel()
159160
raise
@@ -182,6 +183,7 @@ async def test_async_restart_check(config, install_kernel):
182183

183184
cbs = 0
184185
restarts = [asyncio.Future() for i in range(N_restarts)]
186+
185187
def cb():
186188
nonlocal cbs
187189
if cbs >= N_restarts:
@@ -192,7 +194,7 @@ def cb():
192194
try:
193195
await km.start_kernel()
194196
km.add_restart_callback(cb, 'restart')
195-
except:
197+
except BaseException:
196198
if km.has_kernel:
197199
await km.shutdown_kernel()
198200
raise
@@ -216,18 +218,20 @@ def cb():
216218
await km.shutdown_kernel(now=True)
217219
assert km.context.closed
218220

221+
219222
@pytest.mark.asyncio
220223
async def test_async_restarter_gives_up(config, install_slow_fail_kernel):
221224
"""Test that the restarter gives up after reaching the restart limit"""
222225
# If this test failes, run it with --log-cli-level=DEBUG to inspect
223226
N_restarts = 2
224227
config.KernelRestarter.restart_limit = N_restarts
225228
config.KernelRestarter.debug = True
226-
config.KernelRestarter.stable_start_time = 30.
229+
config.KernelRestarter.stable_start_time = 30.0
227230
km = AsyncIOLoopKernelManager(kernel_name=install_slow_fail_kernel, config=config)
228231

229232
cbs = 0
230233
restarts = [asyncio.Future() for i in range(N_restarts)]
234+
231235
def cb():
232236
nonlocal cbs
233237
if cbs >= N_restarts:
@@ -236,14 +240,15 @@ def cb():
236240
cbs += 1
237241

238242
died = asyncio.Future()
243+
239244
def on_death():
240245
died.set_result(True)
241246

242247
try:
243248
await km.start_kernel()
244249
km.add_restart_callback(cb, 'restart')
245250
km.add_restart_callback(on_death, 'dead')
246-
except:
251+
except BaseException:
247252
if km.has_kernel:
248253
await km.shutdown_kernel()
249254
raise
@@ -258,4 +263,3 @@ def on_death():
258263

259264
await km.shutdown_kernel(now=True)
260265
assert km.context.closed
261-

0 commit comments

Comments
 (0)