Skip to content

Commit 56db683

Browse files
committed
feat(cluster_management): kill all leftover processes
It can happen that leftover processes from previous cluster instance prevent new cluster instance from starting. This change makes sure to kill all the leftover processes. - Introduced `netstat_tools` module to handle netstat operations. - Enhanced process killing logic to handle leftover processes more robustly. - Moved `_kill_supervisor` and `_get_netstat_out` functions to the new module. - Updated `ClusterGetter` to use the new `netstat_tools` functions.
1 parent ef59fca commit 56db683

File tree

2 files changed

+87
-34
lines changed

2 files changed

+87
-34
lines changed

cardano_node_tests/cluster_management/cluster_getter.py

Lines changed: 3 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from _pytest.config import Config
1515

1616
from cardano_node_tests.cluster_management import common
17+
from cardano_node_tests.cluster_management import netstat_tools
1718
from cardano_node_tests.cluster_management import resources
1819
from cardano_node_tests.cluster_management import resources_management
1920
from cardano_node_tests.utils import artifacts
@@ -36,35 +37,6 @@ def _xdist_sleep(
3637
"""No need to sleep if tests are running on a single worker."""
3738

3839

39-
def _kill_supervisor(instance_num: int) -> None:
40-
"""Attempt to kill the `supervisord` process."""
41-
try:
42-
netstat = helpers.run_command("netstat -plnt").decode().splitlines()
43-
except Exception:
44-
return
45-
46-
port_num = (
47-
cluster_nodes.get_cluster_type().cluster_scripts.get_instance_ports(instance_num).supervisor
48-
)
49-
port_str = f":{port_num}"
50-
51-
for line in netstat:
52-
if port_str not in line:
53-
continue
54-
line_p = line.replace(" ", " ").strip()
55-
pid = line_p.split()[-1].split("/")[0]
56-
os.kill(int(pid), 15)
57-
return
58-
59-
60-
def _get_netstat_out() -> str:
61-
"""Get output of the `netstat` command."""
62-
try:
63-
return helpers.run_command("netstat -plnt").decode()
64-
except Exception:
65-
return ""
66-
67-
6840
@dataclasses.dataclass
6941
class _ClusterGetStatus:
7042
"""Intermediate status while trying to `get` suitable cluster instance."""
@@ -198,10 +170,7 @@ def _respin(self, start_cmd: str = "", stop_cmd: str = "") -> bool: # noqa: C90
198170
except Exception as err:
199171
self.log(f"c{self.cluster_instance_num}: failed to stop cluster:\n{err}")
200172

201-
_kill_supervisor(self.cluster_instance_num)
202-
203-
# Give the cluster time to stop
204-
time.sleep(5)
173+
netstat_tools.kill_old_cluster(instance_num=self.cluster_instance_num)
205174

206175
# Save artifacts only when produced during this test run
207176
if cluster_running_file.exists() or i > 0:
@@ -229,7 +198,7 @@ def _respin(self, start_cmd: str = "", stop_cmd: str = "") -> bool: # noqa: C90
229198
if _cluster_started:
230199
break
231200
else:
232-
netstat_out = _get_netstat_out()
201+
netstat_out = netstat_tools.get_netstat_out()
233202
self.log(
234203
f"c{self.cluster_instance_num}: failed to start cluster:\n{excp}"
235204
f"\nnetstat:\n{netstat_out}"
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
"""Functions based on `netstat`."""
2+
3+
import logging
4+
import os
5+
import re
6+
import time
7+
8+
from cardano_node_tests.utils import cluster_nodes
9+
from cardano_node_tests.utils import helpers
10+
11+
LOGGER = logging.getLogger(__name__)
12+
13+
14+
def get_netstat_out() -> str:
15+
"""Get output of the `netstat` command."""
16+
try:
17+
return helpers.run_command(
18+
"netstat -pant | grep -E 'LISTEN|TIME_WAIT|CLOSE_WAIT|FIN_WAIT'"
19+
).decode()
20+
except Exception as excp:
21+
LOGGER.error(f"Failed to fetch netstat output: {excp}") # noqa: TRY400
22+
return ""
23+
24+
25+
def kill_old_cluster(instance_num: int) -> None: # noqa: C901
26+
"""Attempt to kill all processes left over from a previous cluster instance."""
27+
28+
def _get_netstat_split() -> list[str]:
29+
return get_netstat_out().splitlines()
30+
31+
def _get_pid(line: str) -> int | None:
32+
try:
33+
pid_str = line.replace(" ", " ").strip().split()[-1].split("/")[0]
34+
return int(pid_str)
35+
except (IndexError, ValueError):
36+
return None
37+
38+
def _try_kill(pid: int) -> None:
39+
try:
40+
os.kill(pid, 15)
41+
except Exception as excp:
42+
LOGGER.error(f"Failed to kill leftover process PID {pid}: {excp}") # noqa: TRY400
43+
return
44+
45+
port_nums = cluster_nodes.get_cluster_type().cluster_scripts.get_instance_ports(instance_num)
46+
port_strs = [
47+
f":{p}"
48+
for p in (
49+
port_nums.supervisor,
50+
port_nums.webserver,
51+
port_nums.submit_api,
52+
*port_nums.node_ports,
53+
)
54+
]
55+
56+
# Attempt to kill the `supervisord` process first. If successful, this will also kill all the
57+
# processes started by supervisor.
58+
port_supervisor_str = port_strs[0]
59+
for line in _get_netstat_split():
60+
if port_supervisor_str not in line:
61+
continue
62+
pid = _get_pid(line)
63+
if pid:
64+
LOGGER.info(f"Killing supervisor process: PID {pid}")
65+
_try_kill(pid)
66+
time.sleep(5)
67+
break
68+
69+
# Kill all the leftover processes, if possible, and wait for them to finish
70+
ports_re = re.compile(r"|".join(re.escape(p) for p in port_strs))
71+
for _ in range(5):
72+
found = False
73+
for line in _get_netstat_split():
74+
if not ports_re.search(line):
75+
continue
76+
found = True
77+
pid = _get_pid(line)
78+
if pid:
79+
LOGGER.info(f"Killing leftover process: PID {pid}")
80+
_try_kill(pid)
81+
time.sleep(5)
82+
break
83+
if not found:
84+
break

0 commit comments

Comments
 (0)