Skip to content

Commit adbdb77

Browse files
committed
BUG: Kill subprocesses on shutdown.
Fixes #jupyter/jupyter_client#104. This should make sure we properly cull all subprocesses at shutdown. It does change one of the private methods from sync to async in order not to use time.sleep or threads, so this may affect subclasses, though I doubt it. It's also not completely clear to me whether this works on Windows, as SIGINT, I believe, is not a thing there. Regardless, as this affects things like dask and others that are mostly on Unix, it should be an improvement. It does the following, stopping as soon as it does not find any more children of the current process. - Send SIGINT to everything - Immediately send SIGTERM in a loop with an exponential backoff from 0.01 to 1 second, roughly multiplying the delay until the next send by 3 each time. - Switch to sending SIGKILL with the same backoff. There is no delay after SIGINT, as this is just a courtesy. The backoff delays are not configurable. I can imagine that on slow systems it may make sens
1 parent 221dca6 commit adbdb77

File tree

1 file changed

+71
-18
lines changed

1 file changed

+71
-18
lines changed

ipykernel/kernelbase.py

Lines changed: 71 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -5,44 +5,44 @@
55

66
import asyncio
77
import concurrent.futures
8-
from datetime import datetime
9-
from functools import partial
8+
import inspect
109
import itertools
1110
import logging
12-
import inspect
1311
import os
14-
from signal import signal, default_int_handler, SIGINT
15-
import sys
1612
import socket
13+
import sys
1714
import time
1815
import uuid
1916
import warnings
17+
from datetime import datetime
18+
from functools import partial
19+
from signal import (SIGINT, SIGKILL, SIGTERM, Signals, default_int_handler,
20+
signal)
21+
2022
try:
2123
import psutil
2224
except ImportError:
2325
psutil = None
2426

27+
2528
try:
2629
# jupyter_client >= 5, use tz-aware now
2730
from jupyter_client.session import utcnow as now
2831
except ImportError:
2932
# jupyter_client < 5, use local now()
3033
now = datetime.now
3134

35+
import zmq
36+
from IPython.core.error import StdinNotImplementedError
37+
from jupyter_client.session import Session
3238
from tornado import ioloop
3339
from tornado.queues import Queue, QueueEmpty
34-
import zmq
40+
from traitlets import (Any, Bool, Dict, Float, Instance, Integer, List, Set,
41+
Unicode, default, observe)
42+
from traitlets.config.configurable import SingletonConfigurable
3543
from zmq.eventloop.zmqstream import ZMQStream
3644

37-
from traitlets.config.configurable import SingletonConfigurable
38-
from IPython.core.error import StdinNotImplementedError
3945
from ipykernel.jsonutil import json_clean
40-
from traitlets import (
41-
Any, Instance, Float, Dict, List, Set, Integer, Unicode, Bool,
42-
observe, default
43-
)
44-
45-
from jupyter_client.session import Session
4646

4747
from ._version import kernel_protocol_version
4848

@@ -796,13 +796,13 @@ async def comm_info_request(self, stream, ident, parent):
796796
reply_content, parent, ident)
797797
self.log.debug("%s", msg)
798798

799-
async def interrupt_request(self, stream, ident, parent):
799+
def _send_interupt_children(self):
800+
800801
pid = os.getpid()
801802
pgid = os.getpgid(pid)
802803

803804
if os.name == "nt":
804805
self.log.error("Interrupt message not supported on Windows")
805-
806806
else:
807807
# Prefer process-group over process
808808
if pgid and hasattr(os, "killpg"):
@@ -816,6 +816,8 @@ async def interrupt_request(self, stream, ident, parent):
816816
except OSError:
817817
pass
818818

819+
async def interrupt_request(self, stream, ident, parent):
    """Handle an interrupt request and acknowledge it.

    Delegates the actual interruption to ``_send_interupt_children`` and
    then sends an ``interrupt_reply`` on *stream* whose content is the
    content of the *parent* request.
    """
    self._send_interupt_children()
    reply_content = parent['content']
    self.session.send(
        stream,
        'interrupt_reply',
        reply_content,
        parent,
        ident=ident,
    )
@@ -830,7 +832,7 @@ async def shutdown_request(self, stream, ident, parent):
830832
content, parent
831833
)
832834

833-
self._at_shutdown()
835+
await self._at_shutdown()
834836

835837
self.log.debug('Stopping control ioloop')
836838
control_io_loop = self.control_stream.io_loop
@@ -1131,9 +1133,60 @@ def _input_request(self, prompt, ident, parent, password=False):
11311133
raise EOFError
11321134
return value
11331135

1134-
def _at_shutdown(self):
1136+
async def _progressively_terminate_all_children(self):
    """Interrupt, then terminate, then kill the kernel's subprocesses.

    Escalation order:

    1. ``_send_interupt_children()`` is called once, as a courtesy.
    2. ``SIGTERM`` is sent to the kernel's process group, retrying with
       a back-off of 0.01, 0.03, 0.1, 0.3 and 1 second.
    3. ``SIGKILL`` is sent with the same back-off.

    When ``psutil`` is available the routine stops as soon as no child
    process is left. Without it there is no way to check, so the three
    signals are sent blindly, 0.05 seconds apart.

    Errors are logged rather than raised so that shutdown can proceed.
    """
    # NOTE(review): the signals below target the process group of the
    # current process, which includes the kernel itself -- confirm this
    # is intended.
    pgid = os.getpgid(os.getpid())
    if not pgid:
        self.log.warning(f"No Pgid ({pgid=}), not trying to stop subprocesses.")
        return

    if psutil is None:
        # Blindly send SIGTERM/SIGKILL in quick succession: without
        # psutil we cannot tell whether any children are still alive.
        self.log.warning(
            "Please install psutil for a cleaner subprocess shutdown."
        )
        self._send_interupt_children()
        try:
            await asyncio.sleep(0.05)
            self.log.debug(f"Sending SIGTERM to {pgid=}")
            os.killpg(pgid, SIGTERM)
            await asyncio.sleep(0.05)
            self.log.debug(f"Sending SIGKILL to {pgid=}")
            os.killpg(pgid, SIGKILL)
        except Exception:
            # Best effort only: a failure here must not abort shutdown.
            self.log.exception("Exception during subprocesses termination")
        return

    sleeps = (0.01, 0.03, 0.1, 0.3, 1)
    children = psutil.Process().children(recursive=True)
    if not children:
        self.log.debug("Kernel has no children.")
        return

    self.log.debug(f"Trying to interrupt then kill subprocesses : {children=}")
    self._send_interupt_children()

    # Loop-invariant: whether this platform can signal a process group.
    can_killpg = hasattr(os, "killpg")

    for signum in (SIGTERM, SIGKILL):
        self.log.debug(
            f"Will try to send {signum} ({Signals(signum)}) to subprocesses :{children}"
        )
        for delay in sleeps:
            # Re-query on every round so we stop as soon as all are gone.
            children = psutil.Process().children(recursive=True)
            if not children:
                self.log.debug("No more children, continuing shutdown routine.")
                return
            if can_killpg:
                try:
                    os.killpg(pgid, signum)
                except OSError:
                    self.log.warning("OSError running killpg, not killing children")
                    return
            self.log.debug(
                f"Will sleep {delay}s before checking for children and retrying."
            )
            await asyncio.sleep(delay)
1185+
1186+
async def _at_shutdown(self):
11351187
"""Actions taken at shutdown by the kernel, called by python's atexit.
11361188
"""
1189+
await self._progressively_terminate_all_children()
11371190
if self._shutdown_message is not None:
11381191
self.session.send(self.iopub_socket, self._shutdown_message, ident=self._topic('shutdown'))
11391192
self.log.debug("%s", self._shutdown_message)

0 commit comments

Comments
 (0)