Skip to content

Commit 65851a3

Browse files
authored
Harden stress test kill on exit (#267)
* Harden stress test kill on exit * only import psutil in the kill child process function * Apply formatting * Address Review comments, remove unused import
1 parent da023ab commit 65851a3

4 files changed

Lines changed: 119 additions & 12 deletions

File tree

s_tui/helper_functions.py

Lines changed: 47 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
import os
2121
import logging
2222
import platform
23+
import signal
2324
import subprocess
2425
import re
2526
import csv
@@ -74,17 +75,53 @@ def get_processor_name():
7475
return platform.processor()
7576

7677

77-
def kill_child_processes(parent_proc):
78-
"""Kills a process and all its children"""
79-
logging.debug("Killing stress process")
78+
def kill_child_processes(parent_proc, timeout=3):
79+
"""Kills a process and all its children gracefully.
80+
81+
Attempts SIGTERM via process group first, falls back to per-process
82+
terminate, then SIGKILL after timeout.
83+
"""
84+
import psutil
85+
86+
if parent_proc is None:
87+
logging.debug("No stress process to kill")
88+
return
89+
90+
logging.debug("Killing stress process %s", parent_proc)
91+
92+
# Try process-group SIGTERM first (covers the whole tree)
8093
try:
81-
for proc in parent_proc.children(recursive=True):
82-
logging.debug("Killing %s", proc)
83-
proc.kill()
84-
parent_proc.kill()
85-
except AttributeError:
86-
logging.debug("No such process")
87-
logging.debug("Could not kill process")
94+
pgid = os.getpgid(parent_proc.pid)
95+
os.killpg(pgid, signal.SIGTERM)
96+
logging.debug("Sent SIGTERM to process group %s", pgid)
97+
except (OSError, ProcessLookupError, AttributeError):
98+
# Process group kill failed — fall back to per-process terminate
99+
logging.debug("Process group kill failed, falling back to per-process")
100+
try:
101+
for proc in parent_proc.children(recursive=True):
102+
try:
103+
proc.terminate()
104+
except (psutil.NoSuchProcess, ProcessLookupError):
105+
pass
106+
parent_proc.terminate()
107+
except (psutil.NoSuchProcess, AttributeError):
108+
logging.debug("Process already gone during terminate")
109+
return
110+
111+
# Wait for graceful exit, then SIGKILL stragglers
112+
try:
113+
gone, alive = psutil.wait_procs(
114+
[parent_proc] + parent_proc.children(recursive=True),
115+
timeout=timeout,
116+
)
117+
for proc in alive:
118+
logging.debug("Sending SIGKILL to straggler %s", proc)
119+
try:
120+
proc.kill()
121+
except (psutil.NoSuchProcess, ProcessLookupError):
122+
pass
123+
except (psutil.NoSuchProcess, AttributeError):
124+
logging.debug("Process already gone during wait")
88125

89126

90127
def output_to_csv(sources, csv_writeable_file):

s_tui/s_tui.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from __future__ import absolute_import
2424

2525
import argparse
26+
import atexit
2627
import signal
2728
import logging
2829
import os
@@ -122,6 +123,7 @@ def unhandled_input(self, uinput):
122123
graph_controller.view.on_menu_close()
123124

124125
signal.signal(signal.SIGINT, signal_handler)
126+
signal.signal(signal.SIGTERM, signal_handler)
125127

126128

127129
class StressController:
@@ -175,7 +177,10 @@ def start_stress(self, stress_cmd):
175177
with open(os.devnull, "w") as dev_null:
176178
try:
177179
stress_proc = subprocess.Popen(
178-
stress_cmd, stdout=dev_null, stderr=dev_null
180+
stress_cmd,
181+
stdout=dev_null,
182+
stderr=dev_null,
183+
start_new_session=True,
179184
)
180185
self.set_stress_process(psutil.Process(stress_proc.pid))
181186
except OSError:
@@ -957,6 +962,7 @@ def main():
957962

958963
global graph_controller
959964
graph_controller = GraphController(args)
965+
atexit.register(graph_controller.stress_controller.kill_stress_process)
960966
graph_controller.main()
961967

962968

tests/test_helper_functions.py

Lines changed: 54 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,14 @@
11
"""Tests for s_tui.helper_functions module."""
22

33
import os
4+
import signal
45
import sys
56
import json
67
import tempfile
78

9+
import psutil
810
import pytest
11+
from unittest.mock import MagicMock, patch
912

1013
from s_tui.helper_functions import (
1114
__version__,
@@ -201,9 +204,59 @@ def test_make_user_config_dir(self, tmp_path, monkeypatch):
201204

202205
class TestKillChildProcesses:
203206
def test_none_parent(self):
204-
"""Should handle None gracefully (AttributeError caught)."""
207+
"""Should handle None gracefully."""
205208
kill_child_processes(None) # should not raise
206209

210+
def test_sigterm_via_process_group(self):
211+
"""Should send SIGTERM to the process group first."""
212+
parent = MagicMock()
213+
parent.pid = 12345
214+
parent.children.return_value = []
215+
216+
with patch("os.getpgid", return_value=12345) as mock_getpgid, patch(
217+
"os.killpg"
218+
) as mock_killpg, patch("psutil.wait_procs", return_value=([], [])):
219+
kill_child_processes(parent, timeout=1)
220+
mock_getpgid.assert_called_once_with(12345)
221+
mock_killpg.assert_called_once_with(12345, signal.SIGTERM)
222+
223+
def test_fallback_to_per_process_terminate(self):
224+
"""When process group kill fails, falls back to per-process terminate."""
225+
child = MagicMock()
226+
parent = MagicMock()
227+
parent.pid = 100
228+
parent.children.return_value = [child]
229+
230+
with patch("os.getpgid", side_effect=OSError("no pgid")), patch(
231+
"psutil.wait_procs", return_value=([], [])
232+
):
233+
kill_child_processes(parent, timeout=1)
234+
child.terminate.assert_called_once()
235+
parent.terminate.assert_called_once()
236+
237+
def test_sigkill_stragglers_after_timeout(self):
238+
"""Processes still alive after timeout get SIGKILL."""
239+
straggler = MagicMock()
240+
parent = MagicMock()
241+
parent.pid = 200
242+
parent.children.return_value = []
243+
244+
with patch("os.getpgid", return_value=200), patch("os.killpg"), patch(
245+
"psutil.wait_procs", return_value=([], [straggler])
246+
):
247+
kill_child_processes(parent, timeout=1)
248+
straggler.kill.assert_called_once()
249+
250+
def test_already_dead_process(self):
251+
"""Should not raise when process is already dead."""
252+
parent = MagicMock()
253+
parent.pid = 300
254+
255+
with patch("os.getpgid", side_effect=ProcessLookupError()), patch.object(
256+
parent, "children", side_effect=psutil.NoSuchProcess(300)
257+
):
258+
kill_child_processes(parent, timeout=1) # should not raise
259+
207260

208261
# ---------------------------------------------------------------------------
209262
# output_to_terminal / output_to_json

tests/test_stress_controller.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,17 @@ def test_start_stress(self, mocker):
9494
sc.start_stress(["stress", "-c", "4"])
9595
assert sc.get_stress_process() is mock_psutil_proc
9696

97+
def test_start_stress_uses_new_session(self, mocker):
98+
"""start_stress passes start_new_session=True for process group isolation."""
99+
mock_popen = mocker.patch("subprocess.Popen")
100+
mock_popen.return_value.pid = 99999
101+
mocker.patch("psutil.Process", return_value=MagicMock())
102+
103+
sc = StressController(True, False)
104+
sc.start_stress(["stress", "-c", "4"])
105+
_, kwargs = mock_popen.call_args
106+
assert kwargs.get("start_new_session") is True
107+
97108
def test_start_stress_oserror(self, mocker):
98109
"""start_stress handles OSError gracefully."""
99110
mocker.patch("subprocess.Popen", side_effect=OSError("not found"))

0 commit comments

Comments
 (0)