Gateway manager retry kernel updates (#1256)

ojarjur · web-flow · commit 87b215809701 · 2023-04-07T08:05:15.000-07:00
* Double check if a Gateway kernel was culled.

The GatewayMappingKernelManager keeps an internal record of all the remote kernels, and periodically syncs it with the Gateway server.

When it finds that a kernel it previously knew about is no longer in the Gateway server's list of kernels, it has to decide how to reconcile that.

Previously, it was assuming that the kernel was probably culled by the Gateway server, and thus removed it from its internal records.

However, it is conceivable that the list from the upstream Gateway server might have been incomplete due to any combination of bugs, race conditions, or transient network connectivity issues, etc.

If one of those such scenarios occurred, then the previous logic would have declared the kernel as lost.

This change makes the GatewayMappingKernelManager more resilient to such issues by double checking whether or not the kernel was actually removed in the upstream Gateway server.

It does this by attempting to update the GatewayKernelManager instance's model before deciding that the kernel has been culled.

* GatewayClient test for missing kernel list entries

This change extends the test_gateway.py suite to simulate kernels
being transiently missing from kernel list responses.

The new test fails without the update to the GatewayMappingKernelManager
to double check if kernels have been culled, and passes with it.

* Fix a lint error from a missing type annotation.
diff --git a/jupyter_server/gateway/managers.py b/jupyter_server/gateway/managers.py
@@ -123,11 +123,34 @@ async def list_kernels(self, **kwargs):
         culled_ids = []
         for kid, _ in our_kernels.items():
             if kid not in kernel_models:
+                # The upstream kernel was not reported in the list of kernels.
                 self.log.warning(
-                    f"Kernel {kid} no longer active - probably culled on Gateway server."
+                    f"Kernel {kid} not present in the list of kernels - possibly culled on Gateway server."
                 )
-                self._kernels.pop(kid, None)
-                culled_ids.append(kid)  # TODO: Figure out what do with these.
+                try:
+                    # Try to directly refresh the model for this specific kernel in case
+                    # the upstream list of kernels was erroneously incomplete.
+                    #
+                    # That might happen if the case of a proxy that manages multiple
+                    # backends where there could be transient connectivity issues with
+                    # a single backend.
+                    #
+                    # Alternatively, it could happen if there is simply a bug in the
+                    # upstream gateway server.
+                    #
+                    # Either way, including this check improves our reliability in the
+                    # face of such scenarios.
+                    model = await self._kernels[kid].refresh_model()
+                except web.HTTPError:
+                    model = None
+                if model:
+                    kernel_models[kid] = model
+                else:
+                    self.log.warning(
+                        f"Kernel {kid} no longer active - probably culled on Gateway server."
+                    )
+                    self._kernels.pop(kid, None)
+                    culled_ids.append(kid)  # TODO: Figure out what do with these.
         return list(kernel_models.values())
 
     async def shutdown_kernel(self, kernel_id, now=False, restart=False):
diff --git a/tests/test_gateway.py b/tests/test_gateway.py
@@ -9,7 +9,7 @@
 from http.cookies import SimpleCookie
 from io import BytesIO
 from queue import Empty
-from typing import Any, Union
+from typing import Any, Dict, Union
 from unittest.mock import MagicMock, patch
 
 import pytest
@@ -62,6 +62,12 @@ def generate_kernelspec(name):
 # maintain a dictionary of expected running kernels.  Key = kernel_id, Value = model.
 running_kernels = {}
 
+# Dictionary of kernels to transiently omit from list results.
+#
+# This is used to simulate inconsistency in list results from the Gateway server
+# due to issues like race conditions, bugs, etc.
+omitted_kernels: Dict[str, bool] = {}
+
 
 def generate_model(name):
     """Generate a mocked kernel model.  Caller is responsible for adding model to running_kernels dictionary."""
@@ -131,8 +137,11 @@ async def mock_gateway_request(url, **kwargs):  # noqa
     if endpoint.endswith("/api/kernels") and method == "GET":
         kernels = []
         for kernel_id in running_kernels:
-            model = running_kernels.get(kernel_id)
-            kernels.append(model)
+            if kernel_id in omitted_kernels:
+                omitted_kernels.pop(kernel_id)
+            else:
+                model = running_kernels.get(kernel_id)
+                kernels.append(model)
         response_buf = BytesIO(json.dumps(kernels).encode("utf-8"))
         response = await ensure_async(HTTPResponse(request, 200, buffer=response_buf))
         return response
@@ -453,6 +462,7 @@ async def test_gateway_session_lifecycle(init_gateway, jp_root_dir, jp_fetch, cu
 
     assert await is_session_active(jp_fetch, session_id) is True
 
+    omitted_kernels[kernel_id] = True
     if cull_kernel:
         running_kernels.pop(kernel_id)
 
@@ -501,6 +511,7 @@ async def test_gateway_kernel_lifecycle(
     # ensure kernel still considered running
     assert await is_kernel_running(jp_fetch, kernel_id) is True
 
+    omitted_kernels[kernel_id] = True
     if cull_kernel:
         running_kernels.pop(kernel_id)