Skip to content

Commit dcfc4c1

Browse files
committed
test: clean-up potentially dead microVMs
We have some tests that stress the microVM in a way that _might_ cause it to die. We do check whether that's the case, however this is inherently racy, i.e. the microVM might die on us after the check but before the fixture logic tries to recycle it. Such a test is test_balloon.py::test_deflate_oom. In order to avoid such situations, teach MicroVM.kill() that there might be cases where the microVM process has already died and let it finish its cleanup gracefully. Signed-off-by: Babis Chalios <[email protected]>
1 parent c28dbdb commit dcfc4c1

File tree

3 files changed

+33
-15
lines changed

3 files changed

+33
-15
lines changed

tests/framework/microvm.py

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -301,7 +301,7 @@ def mark_killed(self):
301301

302302
self._killed = True
303303

304-
def kill(self):
304+
def kill(self, might_be_dead=False):
305305
"""All clean up associated with this microVM should go here."""
306306
# pylint: disable=subprocess-run-check
307307
# if it was already killed, return
@@ -314,7 +314,7 @@ def kill(self):
314314

315315
# Kill all background SSH connections
316316
for connection in self._connections:
317-
connection.close()
317+
connection.close(strict=not might_be_dead)
318318

319319
# We start with vhost-user backends,
320320
# because if we stop Firecracker first, the backend will want
@@ -325,24 +325,27 @@ def kill(self):
325325

326326
assert (
327327
"Shutting down VM after intercepting signal" not in self.log_data
328+
or might_be_dead
328329
), self.log_data
329330

331+
# pylint: disable=bare-except
330332
try:
331333
if self.firecracker_pid:
332334
os.kill(self.firecracker_pid, signal.SIGKILL)
333335

334336
if self.screen_pid:
335337
os.kill(self.screen_pid, signal.SIGKILL)
336338
except:
337-
msg = (
338-
"Failed to kill Firecracker Process. Did it already die (or did the UFFD handler process die and take it down)?"
339-
if self.uffd_handler
340-
else "Failed to kill Firecracker Process. Did it already die?"
341-
)
339+
if not might_be_dead:
340+
msg = (
341+
"Failed to kill Firecracker Process. Did it already die (or did the UFFD handler process die and take it down)?"
342+
if self.uffd_handler
343+
else "Failed to kill Firecracker Process. Did it already die?"
344+
)
342345

343-
self._dump_debug_information(msg)
346+
self._dump_debug_information(msg)
344347

345-
raise
348+
raise
346349

347350
# if microvm was spawned then check if it gets killed
348351
if self._spawned:

tests/host_tools/network.py

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -145,21 +145,33 @@ def _init_connection(self):
145145
self.close()
146146
raise
147147

148-
def _check_liveness(self) -> int:
149-
"""Checks whether the ControlPersist connection is still alive"""
148+
def _check_liveness(self, strict=True) -> int | None:
149+
"""Checks whether the ControlPersist connection is still alive
150+
151+
It will return the pid of the ControlMaster if it is still running,
152+
otherwise None
153+
"""
150154
check_cmd = ["ssh", "-O", "check", *self.options, self.user_host]
151155

152-
_, _, stderr = self._exec(check_cmd, check=True)
156+
try:
157+
_, _, stderr = self._exec(check_cmd, check=True)
158+
except ChildProcessError:
159+
if strict:
160+
raise
161+
162+
return None
153163

154164
pid_match = re.match(r"Master running \(pid=(\d+)\)", stderr)
155165

156166
assert pid_match, f"SSH ControlMaster connection not alive anymore: {stderr}"
157167

158168
return int(pid_match.group(1))
159169

160-
def close(self):
170+
def close(self, strict=True):
161171
"""Closes the ControlPersist connection"""
162-
master_pid = self._check_liveness()
172+
master_pid = self._check_liveness(strict)
173+
if master_pid is None:
174+
return
163175

164176
stop_cmd = ["ssh", "-O", "stop", *self.options, self.user_host]
165177

@@ -182,7 +194,7 @@ def run(self, cmd_string, timeout=100, *, check=False, debug=False):
182194
183195
If `debug` is set, pass `-vvv` to `ssh`. Note that this will clobber stderr.
184196
"""
185-
self._check_liveness()
197+
self._check_liveness(True)
186198

187199
command = ["ssh", *self.options, self.user_host, cmd_string]
188200

tests/integration_tests/functional/test_balloon.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,9 @@ def test_deflate_on_oom(uvm_plain_any, deflate_on_oom):
228228
assert balloon_size_after < balloon_size_before, "Balloon did not deflate"
229229
else:
230230
assert balloon_size_after >= balloon_size_before, "Balloon deflated"
231+
# Kill it here, letting the infrastructure know that the process might
232+
# be dead already.
233+
test_microvm.kill(might_be_dead=True)
231234

232235

233236
# pylint: disable=C0103

0 commit comments

Comments
 (0)