Skip to content

Commit 6345008

Browse files
authored
Merge pull request #5875 from kevin-bates/kernel-list-race-condition
Fix race condition with async kernel management
2 parents 24bf3a5 + ed0c640 commit 6345008

File tree

1 file changed

+17
-6
lines changed

1 file changed

+17
-6
lines changed

notebook/services/kernels/kernelmanager.py

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -294,15 +294,19 @@ def shutdown_kernel(self, kernel_id, now=False, restart=False):
294294
kernel._activity_stream.close()
295295
kernel._activity_stream = None
296296
self.stop_buffering(kernel_id)
297-
self._kernel_connections.pop(kernel_id, None)
298297

299298
# Decrease the metric of number of kernels
300299
# running for the relevant kernel type by 1
301300
KERNEL_CURRENTLY_RUNNING_TOTAL.labels(
302301
type=self._kernels[kernel_id].kernel_name
303302
).dec()
304303

305-
return self.pinned_superclass.shutdown_kernel(self, kernel_id, now=now, restart=restart)
304+
self.pinned_superclass.shutdown_kernel(self, kernel_id, now=now, restart=restart)
305+
# Unlike its async sibling method in AsyncMappingKernelManager, removing the kernel_id
306+
# from the connections dictionary isn't as problematic before the shutdown since the
307+
# method is synchronous. However, we'll keep the relative call orders the same from
308+
# a maintenance perspective.
309+
self._kernel_connections.pop(kernel_id, None)
306310

307311
async def restart_kernel(self, kernel_id, now=False):
308312
"""Restart a kernel by kernel_id"""
@@ -376,8 +380,11 @@ def list_kernels(self):
376380
kernels = []
377381
kernel_ids = self.pinned_superclass.list_kernel_ids(self)
378382
for kernel_id in kernel_ids:
379-
model = self.kernel_model(kernel_id)
380-
kernels.append(model)
383+
try:
384+
model = self.kernel_model(kernel_id)
385+
kernels.append(model)
386+
except (web.HTTPError, KeyError):
387+
pass # Probably due to a (now) non-existent kernel, continue building the list
381388
return kernels
382389

383390
# override _check_kernel_id to raise 404 instead of KeyError
@@ -498,12 +505,16 @@ async def shutdown_kernel(self, kernel_id, now=False, restart=False):
498505
kernel._activity_stream.close()
499506
kernel._activity_stream = None
500507
self.stop_buffering(kernel_id)
501-
self._kernel_connections.pop(kernel_id, None)
502508

503509
# Decrease the metric of number of kernels
504510
# running for the relevant kernel type by 1
505511
KERNEL_CURRENTLY_RUNNING_TOTAL.labels(
506512
type=self._kernels[kernel_id].kernel_name
507513
).dec()
508514

509-
return await self.pinned_superclass.shutdown_kernel(self, kernel_id, now=now, restart=restart)
515+
await self.pinned_superclass.shutdown_kernel(self, kernel_id, now=now, restart=restart)
516+
# Remove kernel_id from the connections dictionary only after kernel has been shutdown,
517+
# otherwise a race condition can occur since the shutdown may take a while - allowing
518+
# list/fetch kernel operations to access _kernel_connections for a non-existent key
519+
# (kernel_id) while "awaiting" the result of the shutdown.
520+
self._kernel_connections.pop(kernel_id, None)

0 commit comments

Comments
 (0)