Skip to content

Commit 670ee79

Browse files
authored
Merge pull request #717 from vidartf/fix-async-restarter
Improve restarter logic
2 parents 318e1c1 + 89fee5e commit 670ee79

File tree

5 files changed

+390
-10
lines changed

5 files changed

+390
-10
lines changed

jupyter_client/ioloop/restarter.py

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,15 @@
55
"""
66
# Copyright (c) Jupyter Development Team.
77
# Distributed under the terms of the Modified BSD License.
8+
import asyncio
9+
import time
810
import warnings
911

1012
from traitlets import Instance
1113
from zmq.eventloop import ioloop
1214

1315
from jupyter_client.restarter import KernelRestarter
16+
from jupyter_client.utils import run_sync
1417

1518

1619
class IOLoopKernelRestarter(KernelRestarter):
@@ -31,8 +34,12 @@ def _loop_default(self):
3134
def start(self):
3235
"""Start the polling of the kernel."""
3336
if self._pcallback is None:
37+
if asyncio.iscoroutinefunction(self.poll):
38+
cb = run_sync(self.poll)
39+
else:
40+
cb = self.poll
3441
self._pcallback = ioloop.PeriodicCallback(
35-
self.poll,
42+
cb,
3643
1000 * self.time_to_dead,
3744
)
3845
self._pcallback.start()
@@ -49,13 +56,15 @@ async def poll(self):
4956
if self.debug:
5057
self.log.debug("Polling kernel...")
5158
is_alive = await self.kernel_manager.is_alive()
59+
now = time.time()
5260
if not is_alive:
61+
self._last_dead = now
5362
if self._restarting:
5463
self._restart_count += 1
5564
else:
5665
self._restart_count = 1
5766

58-
if self._restart_count >= self.restart_limit:
67+
if self._restart_count > self.restart_limit:
5968
self.log.warning("AsyncIOLoopKernelRestarter: restart failed")
6069
self._fire_callbacks("dead")
6170
self._restarting = False
@@ -73,8 +82,20 @@ async def poll(self):
7382
await self.kernel_manager.restart_kernel(now=True, newports=newports)
7483
self._restarting = True
7584
else:
76-
if self._initial_startup:
85+
# Since `is_alive` only tests that the kernel process is alive, it does not
86+
# indicate that the kernel has successfully completed startup. To solve this
87+
# correctly, we would need to wait for a kernel info reply, but it is not
88+
# necessarily appropriate to start a kernel client + channels in the
89+
# restarter. Therefore, we use "has been alive continuously for X time" as a
90+
# heuristic for a stable start up.
91+
# See https://github.com/jupyter/jupyter_client/pull/717 for details.
92+
stable_start_time = self.stable_start_time
93+
if self.kernel_manager.provisioner:
94+
stable_start_time = self.kernel_manager.provisioner.get_stable_start_time(
95+
recommended=stable_start_time
96+
)
97+
if self._initial_startup and now - self._last_dead >= stable_start_time:
7798
self._initial_startup = False
78-
if self._restarting:
99+
if self._restarting and now - self._last_dead >= stable_start_time:
79100
self.log.debug("AsyncIOLoopKernelRestarter: restart apparently succeeded")
80-
self._restarting = False
101+
self._restarting = False

jupyter_client/provisioning/provisioner_base.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -206,7 +206,7 @@ async def load_provisioner_info(self, provisioner_info: Dict) -> None:
206206

207207
def get_shutdown_wait_time(self, recommended: float = 5.0) -> float:
208208
"""
209-
Returns the time allowed for a complete shutdown. This may vary by provisioner.
209+
Returns the time allowed for a complete shutdown. This may vary by provisioner.
210210
211211
This method is called from `KernelManager.finish_shutdown()` during the graceful
212212
phase of its kernel shutdown sequence.
@@ -215,6 +215,15 @@ def get_shutdown_wait_time(self, recommended: float = 5.0) -> float:
215215
"""
216216
return recommended
217217

218+
def get_stable_start_time(self, recommended: float = 10.0) -> float:
219+
"""
220+
Returns the expected upper bound for a kernel (re-)start to complete.
221+
This may vary by provisioner.
222+
223+
The recommended value will typically be what is configured in the kernel restarter.
224+
"""
225+
return recommended
226+
218227
def _finalize_env(self, env: Dict[str, str]) -> None:
219228
"""
220229
Ensures env is appropriate prior to launch.

jupyter_client/restarter.py

Lines changed: 32 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,10 @@
77
"""
88
# Copyright (c) Jupyter Development Team.
99
# Distributed under the terms of the Modified BSD License.
10+
import time
11+
1012
from traitlets import Bool # type: ignore
13+
from traitlets import default # type: ignore
1114
from traitlets import Dict
1215
from traitlets import Float
1316
from traitlets import Instance
@@ -31,6 +34,12 @@ class KernelRestarter(LoggingConfigurable):
3134

3235
time_to_dead = Float(3.0, config=True, help="""Kernel heartbeat interval in seconds.""")
3336

37+
stable_start_time = Float(
38+
10.0,
39+
config=True,
40+
help="""The time in seconds to consider the kernel to have completed a stable start up.""",
41+
)
42+
3443
restart_limit = Integer(
3544
5,
3645
config=True,
@@ -45,6 +54,11 @@ class KernelRestarter(LoggingConfigurable):
4554
_restarting = Bool(False)
4655
_restart_count = Integer(0)
4756
_initial_startup = Bool(True)
57+
_last_dead = Float()
58+
59+
@default("_last_dead")
60+
def _default_last_dead(self):
61+
return time.time()
4862

4963
callbacks = Dict()
5064

@@ -103,13 +117,15 @@ def poll(self):
103117
if self.kernel_manager.shutting_down:
104118
self.log.debug("Kernel shutdown in progress...")
105119
return
120+
now = time.time()
106121
if not self.kernel_manager.is_alive():
122+
self._last_dead = now
107123
if self._restarting:
108124
self._restart_count += 1
109125
else:
110126
self._restart_count = 1
111127

112-
if self._restart_count >= self.restart_limit:
128+
if self._restart_count > self.restart_limit:
113129
self.log.warning("KernelRestarter: restart failed")
114130
self._fire_callbacks("dead")
115131
self._restarting = False
@@ -127,8 +143,20 @@ def poll(self):
127143
self.kernel_manager.restart_kernel(now=True, newports=newports)
128144
self._restarting = True
129145
else:
130-
if self._initial_startup:
146+
# Since `is_alive` only tests that the kernel process is alive, it does not
147+
# indicate that the kernel has successfully completed startup. To solve this
148+
# correctly, we would need to wait for a kernel info reply, but it is not
149+
# necessarily appropriate to start a kernel client + channels in the
150+
# restarter. Therefore, we use "has been alive continuously for X time" as a
151+
# heuristic for a stable start up.
152+
# See https://github.com/jupyter/jupyter_client/pull/717 for details.
153+
stable_start_time = self.stable_start_time
154+
if self.kernel_manager.provisioner:
155+
stable_start_time = self.kernel_manager.provisioner.get_stable_start_time(
156+
recommended=stable_start_time
157+
)
158+
if self._initial_startup and now - self._last_dead >= stable_start_time:
131159
self._initial_startup = False
132-
if self._restarting:
160+
if self._restarting and now - self._last_dead >= stable_start_time:
133161
self.log.debug("KernelRestarter: restart apparently succeeded")
134-
self._restarting = False
162+
self._restarting = False

jupyter_client/tests/problemkernel.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
"""Test kernel for signalling subprocesses"""
2+
# Copyright (c) Jupyter Development Team.
3+
# Distributed under the terms of the Modified BSD License.
4+
import os
5+
import time
6+
7+
from ipykernel.displayhook import ZMQDisplayHook
8+
from ipykernel.kernelapp import IPKernelApp
9+
from ipykernel.kernelbase import Kernel
10+
11+
12+
class ProblemTestKernel(Kernel):
13+
"""Kernel for testing kernel problems"""
14+
15+
implementation = "problemtest"
16+
implementation_version = "0.0"
17+
banner = ""
18+
19+
20+
class ProblemTestApp(IPKernelApp):
21+
kernel_class = ProblemTestKernel
22+
23+
def init_io(self):
24+
# Overridden to disable stdout/stderr capture
25+
self.displayhook = ZMQDisplayHook(self.session, self.iopub_socket)
26+
27+
def init_sockets(self):
28+
if os.environ.get("FAIL_ON_START") == "1":
29+
# Simulates e.g. a port binding issue (Adress already in use)
30+
raise RuntimeError("Failed for testing purposes")
31+
return super().init_sockets()
32+
33+
34+
if __name__ == "__main__":
35+
# make startup artificially slow,
36+
# so that we exercise client logic for slow-starting kernels
37+
startup_delay = int(os.environ.get("STARTUP_DELAY", "2"))
38+
time.sleep(startup_delay)
39+
ProblemTestApp.launch_instance()

0 commit comments

Comments
 (0)