diff --git a/tests/framework/microvm.py b/tests/framework/microvm.py index a46f742d0a2..3f1aceccd20 100644 --- a/tests/framework/microvm.py +++ b/tests/framework/microvm.py @@ -341,6 +341,9 @@ def kill(self): stderr == "" and "firecracker" not in stdout ), f"Firecracker reported its pid {self.firecracker_pid}, which was killed, but there still exist processes using the supposedly dead Firecracker's jailer_id: {stdout}" + if self.uffd_handler and self.uffd_handler.is_running(): + self.uffd_handler.kill() + # Mark the microVM as not spawned, so we avoid trying to kill twice. self._spawned = False self._killed = True @@ -975,19 +978,22 @@ def snapshot_full(self, *, mem_path: str = "mem", vmstate_path="vmstate"): def restore_from_snapshot( self, - snapshot: Snapshot = None, + snapshot: Snapshot, resume: bool = False, rename_interfaces: dict = None, + *, + uffd_handler_name: str = None, ): """Restore a snapshot""" - if self.uffd_handler is None: - assert ( - snapshot is not None - ), "snapshot file must be provided if no uffd handler is attached!" - jailed_snapshot = snapshot.copy_to_chroot(Path(self.chroot())) - else: - jailed_snapshot = self.uffd_handler.snapshot + jailed_snapshot = snapshot.copy_to_chroot(Path(self.chroot())) + + if uffd_handler_name: + self.uffd_handler = spawn_pf_handler( + self, + uffd_handler(uffd_handler_name, binary_dir=self.fc_binary_path.parent), + jailed_snapshot, + ) jailed_mem = Path("/") / jailed_snapshot.mem.name jailed_vmstate = Path("/") / jailed_snapshot.vmstate.name @@ -1180,14 +1186,9 @@ def build_n_from_snapshot( microvm = self.build() microvm.spawn() - if uffd_handler_name is not None: - spawn_pf_handler( - microvm, - uffd_handler(uffd_handler_name, binary_dir=self.binary_path), - current_snapshot, - ) - - snapshot_copy = microvm.restore_from_snapshot(current_snapshot, resume=True) + snapshot_copy = microvm.restore_from_snapshot( + current_snapshot, resume=True, uffd_handler_name=uffd_handler_name + ) yield microvm diff --git a/tests/framework/utils_uffd.py b/tests/framework/utils_uffd.py index b25ca6cd928..01873e5c289 100644 --- a/tests/framework/utils_uffd.py +++ b/tests/framework/utils_uffd.py @@ -77,16 +77,27 @@ def log_data(self): return "" return self.log_file.read_text(encoding="utf-8") + def kill(self): + """Kills the uffd handler process""" + assert self.is_running() + + self.proc.kill() + + def mark_killed(self): + """Marks the uffd handler as already dead""" + assert not self.is_running() + + self._proc = None + def __del__(self): """Tear down the UFFD handler process.""" - if self.proc is not None: - self.proc.kill() + if self.is_running(): + self.kill() -def spawn_pf_handler(vm, handler_path, snapshot): +def spawn_pf_handler(vm, handler_path, jailed_snapshot): """Spawn page fault handler process.""" # Copy snapshot memory file into chroot of microVM. - jailed_snapshot = snapshot.copy_to_chroot(Path(vm.chroot())) # Copy the valid page fault binary into chroot of microVM. jailed_handler = vm.create_jailed_resource(handler_path) handler_name = os.path.basename(jailed_handler) @@ -95,7 +106,6 @@ def spawn_pf_handler(vm, handler_path, snapshot): handler_name, SOCKET_PATH, jailed_snapshot, vm.chroot(), "uffd.log" ) uffd_handler.spawn(vm.jailer.uid, vm.jailer.gid) - vm.uffd_handler = uffd_handler return uffd_handler diff --git a/tests/host_tools/fcmetrics.py b/tests/host_tools/fcmetrics.py index 7c984dca3dd..6e88612d458 100644 --- a/tests/host_tools/fcmetrics.py +++ b/tests/host_tools/fcmetrics.py @@ -9,6 +9,7 @@ import datetime import json +import logging import math import platform import time @@ -544,8 +545,14 @@ def stop(self): # this should also avoid any race condition leading to # uploading the same metrics twice self.join() - self.vm.api.actions.put(action_type="FlushMetrics") - self._flush_metrics() + try: + self.vm.api.actions.put(action_type="FlushMetrics") + except: # pylint: disable=bare-except + # if this doesn't work, ignore the failure. This function is called during teardown, + # and if it fails there, then the resulting exception hides the actual test failure. + logging.error("Failed to flush Firecracker metrics!") + finally: + self._flush_metrics() def run(self): self.running = True diff --git a/tests/integration_tests/functional/test_uffd.py b/tests/integration_tests/functional/test_uffd.py index e003807d282..a67a24a4f6b 100644 --- a/tests/integration_tests/functional/test_uffd.py +++ b/tests/integration_tests/functional/test_uffd.py @@ -9,7 +9,6 @@ import requests from framework.utils import Timeout, check_output -from framework.utils_uffd import spawn_pf_handler, uffd_handler @pytest.fixture(scope="function", name="snapshot") @@ -90,11 +89,7 @@ def test_valid_handler(uvm_plain, snapshot): vm = uvm_plain vm.memory_monitor = None vm.spawn() - - # Spawn page fault handler process. - spawn_pf_handler(vm, uffd_handler("on_demand"), snapshot) - - vm.restore_from_snapshot(resume=True) + vm.restore_from_snapshot(snapshot, resume=True, uffd_handler_name="on_demand") # Inflate balloon. vm.api.balloon.patch(amount_mib=200) @@ -124,14 +119,13 @@ def test_malicious_handler(uvm_plain, snapshot): vm.memory_monitor = None vm.spawn() - # Spawn page fault handler process. - spawn_pf_handler(vm, uffd_handler("malicious"), snapshot) - # We expect Firecracker to freeze while resuming from a snapshot # due to the malicious handler's unavailability. try: with Timeout(seconds=30): - vm.restore_from_snapshot(resume=True) + vm.restore_from_snapshot( + snapshot, resume=True, uffd_handler_name="malicious" + ) assert False, "Firecracker should freeze" except (TimeoutError, requests.exceptions.ReadTimeout): - pass + vm.uffd_handler.mark_killed() diff --git a/tests/integration_tests/performance/test_huge_pages.py b/tests/integration_tests/performance/test_huge_pages.py index 6015cf6032b..ae1cb595311 100644 --- a/tests/integration_tests/performance/test_huge_pages.py +++ b/tests/integration_tests/performance/test_huge_pages.py @@ -10,7 +10,6 @@ from framework.microvm import HugePagesConfig from framework.properties import global_props from framework.utils_ftrace import ftrace_events -from framework.utils_uffd import spawn_pf_handler, uffd_handler def check_hugetlbfs_in_use(pid: int, allocation_name: str): @@ -91,11 +90,7 @@ def test_hugetlbfs_snapshot(microvm_factory, guest_kernel_linux_5_10, rootfs): ### Restore Snapshot ### vm = microvm_factory.build() vm.spawn() - - # Spawn page fault handler process. - spawn_pf_handler(vm, uffd_handler("on_demand"), snapshot) - - vm.restore_from_snapshot(resume=True) + vm.restore_from_snapshot(snapshot, resume=True, uffd_handler_name="on_demand") check_hugetlbfs_in_use(vm.firecracker_pid, "/anon_hugepage") @@ -133,11 +128,9 @@ def test_hugetlbfs_diff_snapshot(microvm_factory, uvm_plain): vm = microvm_factory.build() vm.spawn() - - # Spawn page fault handler process. - spawn_pf_handler(vm, uffd_handler("on_demand"), snapshot_merged) - - vm.restore_from_snapshot(resume=True) + vm.restore_from_snapshot( + snapshot_merged, resume=True, uffd_handler_name="on_demand" + ) # Verify if the restored microvm works. @@ -192,11 +185,8 @@ def test_ept_violation_count( vm.jailer.extra_args.update({"no-seccomp": None}) vm.spawn() - # Spawn page fault handler process. - spawn_pf_handler(vm, uffd_handler("fault_all"), snapshot) - with ftrace_events("kvm:*"): - vm.restore_from_snapshot(resume=True) + vm.restore_from_snapshot(snapshot, resume=True, uffd_handler_name="fault_all") # Verify if guest can run commands, and also wake up the fast page fault helper to trigger page faults. vm.ssh.check_output(f"kill -s {signal.SIGUSR1} {pid}")