Skip to content

Commit 758dba6

Browse files
blink1073 and Steven Silvester authored
Use pending kernels (#593)
* wip use pending kernels yup wip wip handle getattr use ensure_future add more backwards compat clean up tests more cleanup Fix ping handling for shut down kernels fix handling of kernel startup lint
* fix handling of kernel activity
* clean up pre-commit
* make tests less brittle
* make tests less brittle

Co-authored-by: Steven Silvester <[email protected]>
1 parent c5c515b commit 758dba6

File tree

9 files changed

+260
-37
lines changed

9 files changed

+260
-37
lines changed

jupyter_server/base/zmqhandlers.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,10 @@ def send_ping(self):
177177
self.ping_callback.stop()
178178
return
179179

180+
if self.ws_connection.client_terminated:
181+
self.close()
182+
return
183+
180184
# check for timeout on pong. Make sure that we really have sent a recent ping in
181185
# case the machine with both server and client has been suspended since the last ping.
182186
now = ioloop.IOLoop.current().time()

jupyter_server/pytest_plugin.py

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -416,13 +416,16 @@ def client_fetch(*parts, headers=None, params=None, **kwargs):
416416
@pytest.fixture
417417
def jp_kernelspecs(jp_data_dir):
418418
"""Configures some sample kernelspecs in the Jupyter data directory."""
419-
spec_names = ["sample", "sample 2"]
419+
spec_names = ["sample", "sample 2", "bad"]
420420
for name in spec_names:
421421
sample_kernel_dir = jp_data_dir.joinpath("kernels", name)
422422
sample_kernel_dir.mkdir(parents=True)
423423
# Create kernel json file
424424
sample_kernel_file = sample_kernel_dir.joinpath("kernel.json")
425-
sample_kernel_file.write_text(json.dumps(sample_kernel_json))
425+
kernel_json = sample_kernel_json.copy()
426+
if name == "bad":
427+
kernel_json["argv"] = ["non_existent_path"]
428+
sample_kernel_file.write_text(json.dumps(kernel_json))
426429
# Create resources text
427430
sample_kernel_resources = sample_kernel_dir.joinpath("resource.txt")
428431
sample_kernel_resources.write_text(some_resource)
@@ -474,12 +477,24 @@ async def _():
474477
terminal_cleanup = jp_serverapp.web_app.settings["terminal_manager"].terminate_all
475478
kernel_cleanup = jp_serverapp.kernel_manager.shutdown_all
476479
if asyncio.iscoroutinefunction(terminal_cleanup):
477-
await terminal_cleanup()
480+
try:
481+
await terminal_cleanup()
482+
except Exception as e:
483+
print(e)
478484
else:
479-
terminal_cleanup()
485+
try:
486+
await terminal_cleanup()
487+
except Exception as e:
488+
print(e)
480489
if asyncio.iscoroutinefunction(kernel_cleanup):
481-
await kernel_cleanup()
490+
try:
491+
await kernel_cleanup()
492+
except Exception as e:
493+
print(e)
482494
else:
483-
kernel_cleanup()
495+
try:
496+
kernel_cleanup()
497+
except Exception as e:
498+
print(e)
484499

485500
return _

jupyter_server/services/kernels/handlers.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
# Distributed under the terms of the Modified BSD License.
77
import json
88
from textwrap import dedent
9+
from traceback import format_tb
910

1011
from ipython_genutils.py3compat import cast_unicode
1112
from jupyter_client import protocol_version as client_protocol_version
@@ -78,7 +79,10 @@ async def post(self, kernel_id, action):
7879
try:
7980
await km.restart_kernel(kernel_id)
8081
except Exception as e:
81-
self.log.error("Exception restarting kernel", exc_info=True)
82+
message = "Exception restarting kernel"
83+
self.log.error(message, exc_info=True)
84+
traceback = format_tb(e.__traceback__)
85+
self.write(json.dumps(dict(message=message, traceback=traceback)))
8286
self.set_status(500)
8387
else:
8488
model = await ensure_async(km.kernel_model(kernel_id))
@@ -325,6 +329,15 @@ async def pre_get(self):
325329
# We don't want to wait forever, because browsers don't take it well when
326330
# servers never respond to websocket connection requests.
327331
kernel = self.kernel_manager.get_kernel(self.kernel_id)
332+
333+
if hasattr(kernel, "ready"):
334+
try:
335+
await kernel.ready
336+
except Exception as e:
337+
kernel.execution_state = "dead"
338+
kernel.reason = str(e)
339+
raise web.HTTPError(500, str(e)) from e
340+
328341
self.session.key = kernel.session.key
329342
future = self.request_kernel_info()
330343

@@ -445,6 +458,7 @@ def on_message(self, msg):
445458
def _on_zmq_reply(self, stream, msg_list):
446459
idents, fed_msg_list = self.session.feed_identities(msg_list)
447460
msg = self.session.deserialize(fed_msg_list)
461+
448462
parent = msg["parent_header"]
449463

450464
def write_stderr(error_message):

jupyter_server/services/kernels/kernelmanager.py

Lines changed: 40 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
"""
66
# Copyright (c) Jupyter Development Team.
77
# Distributed under the terms of the Modified BSD License.
8+
import asyncio
89
import os
910
from collections import defaultdict
1011
from datetime import datetime
@@ -209,16 +210,14 @@ async def start_kernel(self, kernel_id=None, path=None, **kwargs):
209210
kwargs["kernel_id"] = kernel_id
210211
kernel_id = await ensure_async(self.pinned_superclass.start_kernel(self, **kwargs))
211212
self._kernel_connections[kernel_id] = 0
212-
self._kernel_ports[kernel_id] = self._kernels[kernel_id].ports
213-
self.start_watching_activity(kernel_id)
213+
asyncio.ensure_future(self._finish_kernel_start(kernel_id))
214+
# add busy/activity markers:
215+
kernel = self.get_kernel(kernel_id)
216+
kernel.execution_state = "starting"
217+
kernel.reason = ""
218+
kernel.last_activity = utcnow()
214219
self.log.info("Kernel started: %s" % kernel_id)
215220
self.log.debug("Kernel args: %r" % kwargs)
216-
# register callback for failed auto-restart
217-
self.add_restart_callback(
218-
kernel_id,
219-
lambda: self._handle_kernel_died(kernel_id),
220-
"dead",
221-
)
222221

223222
# Increase the metric of number of kernels running
224223
# for the relevant kernel type by 1
@@ -233,6 +232,24 @@ async def start_kernel(self, kernel_id=None, path=None, **kwargs):
233232

234233
return kernel_id
235234

235+
async def _finish_kernel_start(self, kernel_id):
236+
km = self.get_kernel(kernel_id)
237+
if hasattr(km, "ready"):
238+
try:
239+
await km.ready
240+
except Exception:
241+
self.log.exception(km.ready.exception())
242+
return
243+
244+
self._kernel_ports[kernel_id] = km.ports
245+
self.start_watching_activity(kernel_id)
246+
# register callback for failed auto-restart
247+
self.add_restart_callback(
248+
kernel_id,
249+
lambda: self._handle_kernel_died(kernel_id),
250+
"dead",
251+
)
252+
236253
def ports_changed(self, kernel_id):
237254
"""Used by ZMQChannelsHandler to determine how to coordinate nudge and replays.
238255
@@ -448,6 +465,8 @@ def kernel_model(self, kernel_id):
448465
"execution_state": kernel.execution_state,
449466
"connections": self._kernel_connections.get(kernel_id, 0),
450467
}
468+
if getattr(kernel, "reason", None):
469+
model["reason"] = kernel.reason
451470
return model
452471

453472
def list_kernels(self):
@@ -479,6 +498,7 @@ def start_watching_activity(self, kernel_id):
479498
kernel = self._kernels[kernel_id]
480499
# add busy/activity markers:
481500
kernel.execution_state = "starting"
501+
kernel.reason = ""
482502
kernel.last_activity = utcnow()
483503
kernel._activity_stream = kernel.connect_iopub()
484504
session = Session(
@@ -507,7 +527,7 @@ def record_activity(msg_list):
507527
def stop_watching_activity(self, kernel_id):
508528
"""Stop watching IOPub messages on a kernel for activity."""
509529
kernel = self._kernels[kernel_id]
510-
if kernel._activity_stream:
530+
if getattr(kernel, "_activity_stream", None):
511531
kernel._activity_stream.close()
512532
kernel._activity_stream = None
513533

@@ -561,6 +581,17 @@ async def cull_kernels(self):
561581

562582
async def cull_kernel_if_idle(self, kernel_id):
563583
kernel = self._kernels[kernel_id]
584+
585+
if getattr(kernel, "execution_state") == "dead":
586+
self.log.warning(
587+
"Culling '%s' dead kernel '%s' (%s).",
588+
kernel.execution_state,
589+
kernel.kernel_name,
590+
kernel_id,
591+
)
592+
await ensure_async(self.shutdown_kernel(kernel_id))
593+
return
594+
564595
if hasattr(
565596
kernel, "last_activity"
566597
): # last_activity is monkey-patched, so ensure that has occurred

jupyter_server/services/sessions/handlers.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
"""
55
# Copyright (c) Jupyter Development Team.
66
# Distributed under the terms of the Modified BSD License.
7+
import asyncio
78
import json
89

910
try:
@@ -78,6 +79,8 @@ async def post(self):
7879
self.set_status(501)
7980
self.finish(json.dumps(dict(message=msg, short_message=status_msg)))
8081
return
82+
except Exception as e:
83+
raise web.HTTPError(500, str(e)) from e
8184

8285
location = url_path_join(self.base_url, "api", "sessions", model["id"])
8386
self.set_header("Location", location)
@@ -144,7 +147,10 @@ async def patch(self, session_id):
144147
if model["kernel"]["id"] != before["kernel"]["id"]:
145148
# kernel_id changed because we got a new kernel
146149
# shutdown the old one
147-
await ensure_async(km.shutdown_kernel(before["kernel"]["id"]))
150+
fut = asyncio.ensure_future(ensure_async(km.shutdown_kernel(before["kernel"]["id"])))
151+
# If we are not using pending kernels, wait for the kernel to shut down
152+
if not getattr(km, "use_pending_kernels", None):
153+
await fut
148154
self.finish(json.dumps(model, default=json_default))
149155

150156
@web.authenticated

jupyter_server/tests/services/kernels/test_api.py

Lines changed: 48 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,29 @@
44
import pytest
55
import tornado
66
from jupyter_client.kernelspec import NATIVE_KERNEL_NAME
7+
from tornado.httpclient import HTTPClientError
78

89
from ...utils import expected_http_error
10+
from jupyter_server.services.kernels.kernelmanager import AsyncMappingKernelManager
911
from jupyter_server.utils import url_path_join
1012

1113

12-
@pytest.fixture(params=["MappingKernelManager", "AsyncMappingKernelManager"])
14+
class TestMappingKernelManager(AsyncMappingKernelManager):
15+
"""A no-op subclass to use in a fixture"""
16+
17+
18+
@pytest.fixture(
19+
params=["MappingKernelManager", "AsyncMappingKernelManager", "TestMappingKernelManager"]
20+
)
1321
def jp_argv(request):
22+
if request.param == "TestMappingKernelManager":
23+
extra = []
24+
if hasattr(AsyncMappingKernelManager, "use_pending_kernels"):
25+
extra = ["--AsyncMappingKernelManager.use_pending_kernels=True"]
26+
return [
27+
"--ServerApp.kernel_manager_class=jupyter_server.tests.services.kernels.test_api."
28+
+ request.param
29+
] + extra
1430
return [
1531
"--ServerApp.kernel_manager_class=jupyter_server.services.kernels.kernelmanager."
1632
+ request.param
@@ -38,7 +54,7 @@ async def test_default_kernels(jp_fetch, jp_base_url, jp_cleanup_subprocesses):
3854
await jp_cleanup_subprocesses()
3955

4056

41-
async def test_main_kernel_handler(jp_fetch, jp_base_url, jp_cleanup_subprocesses):
57+
async def test_main_kernel_handler(jp_fetch, jp_base_url, jp_cleanup_subprocesses, jp_serverapp):
4258
# Start the first kernel
4359
r = await jp_fetch(
4460
"api", "kernels", method="POST", body=json.dumps({"name": NATIVE_KERNEL_NAME})
@@ -83,6 +99,10 @@ async def test_main_kernel_handler(jp_fetch, jp_base_url, jp_cleanup_subprocesse
8399
assert r.code == 204
84100

85101
# Restart a kernel
102+
kernel = jp_serverapp.kernel_manager.get_kernel(kernel2["id"])
103+
if hasattr(kernel, "ready"):
104+
await kernel.ready
105+
86106
r = await jp_fetch(
87107
"api", "kernels", kernel2["id"], "restart", method="POST", allow_nonstandard_methods=True
88108
)
@@ -143,6 +163,32 @@ async def test_kernel_handler(jp_fetch, jp_cleanup_subprocesses):
143163
await jp_cleanup_subprocesses()
144164

145165

166+
async def test_kernel_handler_startup_error(
167+
jp_fetch, jp_cleanup_subprocesses, jp_serverapp, jp_kernelspecs
168+
):
169+
if getattr(jp_serverapp.kernel_manager, "use_pending_kernels", False):
170+
return
171+
172+
# Create a kernel
173+
with pytest.raises(HTTPClientError):
174+
await jp_fetch("api", "kernels", method="POST", body=json.dumps({"name": "bad"}))
175+
176+
177+
async def test_kernel_handler_startup_error_pending(
178+
jp_fetch, jp_ws_fetch, jp_cleanup_subprocesses, jp_serverapp, jp_kernelspecs
179+
):
180+
if not getattr(jp_serverapp.kernel_manager, "use_pending_kernels", False):
181+
return
182+
183+
jp_serverapp.kernel_manager.use_pending_kernels = True
184+
# Create a kernel
185+
r = await jp_fetch("api", "kernels", method="POST", body=json.dumps({"name": "bad"}))
186+
kid = json.loads(r.body.decode())["id"]
187+
188+
with pytest.raises(HTTPClientError):
189+
await jp_ws_fetch("api", "kernels", kid, "channels")
190+
191+
146192
async def test_connection(
147193
jp_fetch, jp_ws_fetch, jp_http_port, jp_auth_header, jp_cleanup_subprocesses
148194
):

jupyter_server/tests/services/kernels/test_cull.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ def jp_server_config():
3434
)
3535

3636

37-
async def test_culling(jp_fetch, jp_ws_fetch, jp_cleanup_subprocesses):
37+
async def test_cull_idle(jp_fetch, jp_ws_fetch, jp_cleanup_subprocesses):
3838
r = await jp_fetch("api", "kernels", method="POST", allow_nonstandard_methods=True)
3939
kernel = json.loads(r.body.decode())
4040
kid = kernel["id"]
@@ -53,6 +53,30 @@ async def test_culling(jp_fetch, jp_ws_fetch, jp_cleanup_subprocesses):
5353
await jp_cleanup_subprocesses()
5454

5555

56+
async def test_cull_dead(
57+
jp_fetch, jp_ws_fetch, jp_serverapp, jp_cleanup_subprocesses, jp_kernelspecs
58+
):
59+
if not hasattr(jp_serverapp.kernel_manager, "use_pending_kernels"):
60+
return
61+
62+
jp_serverapp.kernel_manager.use_pending_kernels = True
63+
jp_serverapp.kernel_manager.default_kernel_name = "bad"
64+
r = await jp_fetch("api", "kernels", method="POST", allow_nonstandard_methods=True)
65+
kernel = json.loads(r.body.decode())
66+
kid = kernel["id"]
67+
68+
# Try to open a websocket connection; this is expected to fail for the "bad" kernel.
69+
with pytest.raises(HTTPClientError):
70+
await jp_ws_fetch("api", "kernels", kid, "channels")
71+
72+
r = await jp_fetch("api", "kernels", kid, method="GET")
73+
model = json.loads(r.body.decode())
74+
assert model["connections"] == 0
75+
culled = await get_cull_status(kid, jp_fetch)  # dead kernel with no connections, should be culled
76+
assert culled
77+
await jp_cleanup_subprocesses()
78+
79+
5680
async def get_cull_status(kid, jp_fetch):
5781
frequency = 0.5
5882
culled = False

jupyter_server/tests/services/kernelspecs/test_api.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010

1111
async def test_list_kernelspecs_bad(jp_fetch, jp_kernelspecs, jp_data_dir):
12-
bad_kernel_dir = jp_data_dir.joinpath(jp_data_dir, "kernels", "bad")
12+
bad_kernel_dir = jp_data_dir.joinpath(jp_data_dir, "kernels", "bad2")
1313
bad_kernel_dir.mkdir(parents=True)
1414
bad_kernel_json = bad_kernel_dir.joinpath("kernel.json")
1515
bad_kernel_json.write_text("garbage")

0 commit comments

Comments
 (0)