Skip to content

Commit 87b2158

Browse files
authored
Gateway manager retry kernel updates (#1256)
* Double check if a Gateway kernel was culled. The GatewayMappingKernelManager keeps an internal record of all the remote kernels, and periodically syncs it with the Gateway server. When it finds that a kernel it previously knew about is no longer in the Gateway server's list of kernels, it has to decide how to reconcile that. Previously, it assumed that the kernel was probably culled by the Gateway server, and thus removed it from its internal records. However, it is conceivable that the list from the upstream Gateway server might have been incomplete due to any combination of bugs, race conditions, transient network connectivity issues, etc. If one of those scenarios occurred, then the previous logic would have declared the kernel as lost. This change makes the GatewayMappingKernelManager more resilient to such issues by double checking whether or not the kernel was actually removed in the upstream Gateway server. It does this by attempting to update the GatewayKernelManager instance's model before deciding that the kernel has been culled. * GatewayClient test for missing kernel list entries. This change extends the test_gateway.py suite to simulate kernels being transiently missing from kernel list responses. The new test fails without the update to the GatewayMappingKernelManager to double check if kernels have been culled, and passes with it. * Fix a lint error from a missing type annotation.
1 parent cd8010e commit 87b2158

File tree

2 files changed

+40
-6
lines changed

2 files changed

+40
-6
lines changed

jupyter_server/gateway/managers.py

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -123,11 +123,34 @@ async def list_kernels(self, **kwargs):
123123
culled_ids = []
124124
for kid, _ in our_kernels.items():
125125
if kid not in kernel_models:
126+
# The upstream kernel was not reported in the list of kernels.
126127
self.log.warning(
127-
f"Kernel {kid} no longer active - probably culled on Gateway server."
128+
f"Kernel {kid} not present in the list of kernels - possibly culled on Gateway server."
128129
)
129-
self._kernels.pop(kid, None)
130-
culled_ids.append(kid) # TODO: Figure out what do with these.
130+
try:
131+
# Try to directly refresh the model for this specific kernel in case
132+
# the upstream list of kernels was erroneously incomplete.
133+
#
134+
# That might happen in the case of a proxy that manages multiple
135+
# backends where there could be transient connectivity issues with
136+
# a single backend.
137+
#
138+
# Alternatively, it could happen if there is simply a bug in the
139+
# upstream gateway server.
140+
#
141+
# Either way, including this check improves our reliability in the
142+
# face of such scenarios.
143+
model = await self._kernels[kid].refresh_model()
144+
except web.HTTPError:
145+
model = None
146+
if model:
147+
kernel_models[kid] = model
148+
else:
149+
self.log.warning(
150+
f"Kernel {kid} no longer active - probably culled on Gateway server."
151+
)
152+
self._kernels.pop(kid, None)
153+
culled_ids.append(kid) # TODO: Figure out what do with these.
131154
return list(kernel_models.values())
132155

133156
async def shutdown_kernel(self, kernel_id, now=False, restart=False):

tests/test_gateway.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from http.cookies import SimpleCookie
1010
from io import BytesIO
1111
from queue import Empty
12-
from typing import Any, Union
12+
from typing import Any, Dict, Union
1313
from unittest.mock import MagicMock, patch
1414

1515
import pytest
@@ -62,6 +62,12 @@ def generate_kernelspec(name):
6262
# maintain a dictionary of expected running kernels. Key = kernel_id, Value = model.
6363
running_kernels = {}
6464

65+
# Dictionary of kernels to transiently omit from list results.
66+
#
67+
# This is used to simulate inconsistency in list results from the Gateway server
68+
# due to issues like race conditions, bugs, etc.
69+
omitted_kernels: Dict[str, bool] = {}
70+
6571

6672
def generate_model(name):
6773
"""Generate a mocked kernel model. Caller is responsible for adding model to running_kernels dictionary."""
@@ -131,8 +137,11 @@ async def mock_gateway_request(url, **kwargs): # noqa
131137
if endpoint.endswith("/api/kernels") and method == "GET":
132138
kernels = []
133139
for kernel_id in running_kernels:
134-
model = running_kernels.get(kernel_id)
135-
kernels.append(model)
140+
if kernel_id in omitted_kernels:
141+
omitted_kernels.pop(kernel_id)
142+
else:
143+
model = running_kernels.get(kernel_id)
144+
kernels.append(model)
136145
response_buf = BytesIO(json.dumps(kernels).encode("utf-8"))
137146
response = await ensure_async(HTTPResponse(request, 200, buffer=response_buf))
138147
return response
@@ -453,6 +462,7 @@ async def test_gateway_session_lifecycle(init_gateway, jp_root_dir, jp_fetch, cu
453462

454463
assert await is_session_active(jp_fetch, session_id) is True
455464

465+
omitted_kernels[kernel_id] = True
456466
if cull_kernel:
457467
running_kernels.pop(kernel_id)
458468

@@ -501,6 +511,7 @@ async def test_gateway_kernel_lifecycle(
501511
# ensure kernel still considered running
502512
assert await is_kernel_running(jp_fetch, kernel_id) is True
503513

514+
omitted_kernels[kernel_id] = True
504515
if cull_kernel:
505516
running_kernels.pop(kernel_id)
506517

0 commit comments

Comments
 (0)