test(virtio-mem): add functional integration tests for device

Manciukic · Manciukic · commit 5c32222118e2 · 2025-10-03T18:18:24.000+01:00
Add integration tests for the new device:
 - check that the device is detected
 - check that hotplugging and unplugging works
 - check that memory can be used after hotplugging
 - check that memory is freed on hotunplug
 - check different config combinations
 - check different uvm types
 - check that contents are preserved across snapshot-restore

Signed-off-by: Riccardo Mancini <mancio@amazon.com>
diff --git a/tests/framework/microvm.py b/tests/framework/microvm.py
@@ -1186,6 +1186,22 @@ def wait_for_ssh_up(self):
         # run commands. The actual connection retry loop happens in SSHConnection._init_connection
         _ = self.ssh_iface(0)
 
+    def hotplug_memory(
+        self, requested_size_mib: int, timeout: int = 60, poll: float = 0.1
+    ):
+        """Send a hot(un)plug request and poll every `poll` seconds until done; raise TimeoutError after `timeout` seconds"""
+        self.api.memory_hotplug.patch(requested_size_mib=requested_size_mib)
+        # Use the monotonic clock so wall-clock adjustments cannot skew the timeout
+        deadline = time.monotonic() + timeout
+        while time.monotonic() < deadline:
+            if (
+                self.api.memory_hotplug.get().json()["plugged_size_mib"]
+                == requested_size_mib
+            ):
+                return
+            time.sleep(poll)
+        raise TimeoutError(f"Hotplug did not complete within {timeout} seconds")
+
 
 class MicroVMFactory:
     """MicroVM factory"""
@@ -1300,6 +1316,18 @@ def build_n_from_snapshot(
             last_snapshot.delete()
         current_snapshot.delete()
 
+    def clone_uvm(self, uvm, uffd_handler_name=None):
+        """
+        Take a full snapshot of `uvm` and return a new VM restored and resumed from it.
+        """
+        snapshot = uvm.snapshot_full()
+        restored_vm = self.build()
+        restored_vm.spawn()
+        restored_vm.restore_from_snapshot(
+            snapshot, resume=True, uffd_handler_name=uffd_handler_name
+        )
+        return restored_vm
+
     def kill(self):
         """Clean up all built VMs"""
         for vm in self.vms:
diff --git a/tests/integration_tests/functional/test_memory_hp.py b/tests/integration_tests/functional/test_memory_hp.py
@@ -3,21 +3,120 @@
 
 """Tests for verifying the virtio-mem is working correctly"""
 
+import pytest
+from packaging import version
+from tenacity import Retrying, retry_if_exception_type, stop_after_delay, wait_fixed
+
+from framework.guest_stats import MeminfoGuest
+from framework.microvm import HugePagesConfig
+from framework.utils import get_kernel_version, get_resident_memory
+
+MEMHP_BOOTARGS = "console=ttyS0 reboot=k panic=1 memhp_default_state=online_movable"
+DEFAULT_CONFIG = {"total_size_mib": 1024, "slot_size_mib": 128, "block_size_mib": 2}
+
+
+def uvm_booted_memhp(
+    uvm, rootfs, _microvm_factory, vhost_user, memhp_config, huge_pages, _uffd_handler
+):
+    """Boots a VM with the given memory hotplugging config"""
 
-def test_virtio_mem_detected(uvm_plain_6_1):
-    """
-    Check that the guest kernel has enabled PV steal time.
-    """
-    uvm = uvm_plain_6_1
     uvm.spawn()
     uvm.memory_monitor = None
-    uvm.basic_config(
-        boot_args="console=ttyS0 reboot=k panic=1 memhp_default_state=online_movable"
-    )
+    if vhost_user:
+        # We need to setup ssh keys manually because we did not specify rootfs
+        # in microvm_factory.build method
+        ssh_key = rootfs.with_suffix(".id_rsa")
+        uvm.ssh_key = ssh_key
+        uvm.basic_config(
+            boot_args=MEMHP_BOOTARGS, add_root_device=False, huge_pages=huge_pages
+        )
+        uvm.add_vhost_user_drive(
+            "rootfs", rootfs, is_root_device=True, is_read_only=True
+        )
+    else:
+        uvm.basic_config(boot_args=MEMHP_BOOTARGS, huge_pages=huge_pages)
+
+    uvm.api.memory_hotplug.put(**memhp_config)
     uvm.add_net_iface()
-    uvm.api.memory_hotplug.put(total_size_mib=1024)
     uvm.start()
+    return uvm
+
+
+def uvm_resumed_memhp(
+    uvm_plain,
+    rootfs,
+    microvm_factory,
+    vhost_user,
+    memhp_config,
+    huge_pages,
+    uffd_handler,
+):
+    """Boots a memory-hotplug VM, snapshots it and returns the restored clone; skips unsupported combinations"""
+    if vhost_user:
+        pytest.skip("vhost-user doesn't support snapshot/restore")
+    if huge_pages and huge_pages != HugePagesConfig.NONE and not uffd_handler:
+        pytest.skip("Hugepages requires a UFFD handler")
+    uvm = uvm_booted_memhp(
+        uvm_plain, rootfs, microvm_factory, vhost_user, memhp_config, huge_pages, None
+    )
+    return microvm_factory.clone_uvm(uvm, uffd_handler_name=uffd_handler)
+
+
+@pytest.fixture(
+    params=[
+        (uvm_booted_memhp, False, HugePagesConfig.NONE, None),
+        (uvm_booted_memhp, False, HugePagesConfig.HUGETLBFS_2MB, None),
+        (uvm_booted_memhp, True, HugePagesConfig.NONE, None),
+        (uvm_resumed_memhp, False, HugePagesConfig.NONE, None),
+        (uvm_resumed_memhp, False, HugePagesConfig.NONE, "on_demand"),
+        (uvm_resumed_memhp, False, HugePagesConfig.HUGETLBFS_2MB, "on_demand"),
+    ],
+    ids=[
+        "booted",
+        "booted-huge-pages",
+        "booted-vhost-user",
+        "resumed",
+        "resumed-uffd",
+        "resumed-uffd-huge-pages",
+    ],
+)
+def uvm_any_memhp(request, uvm_plain_6_1, rootfs, microvm_factory):
+    """Parametrized fixture yielding a booted or snapshot-resumed VM with memory hotplugging (DEFAULT_CONFIG)"""
+    ctor, vhost_user, huge_pages, uffd_handler = request.param
+    yield ctor(
+        uvm_plain_6_1,
+        rootfs,
+        microvm_factory,
+        vhost_user,
+        DEFAULT_CONFIG,
+        huge_pages,
+        uffd_handler,
+    )
+
+
+def supports_hugetlbfs_discard():
+    """Returns True if get_kernel_version() is >= 5.18.0, taken here as the first with hugetlbfs discard"""
+    return version.parse(get_kernel_version()) >= version.parse("5.18.0")
+
+
+def validate_metrics(uvm):
+    """Flushes metrics and asserts that every checked memory_hotplug failure counter is zero"""
+    metrics_to_check = ["plug_fails", "unplug_fails", "unplug_all_fails", "state_fails"]
+    if supports_hugetlbfs_discard():
+        metrics_to_check.append("unplug_discard_fails")
+    uvm.flush_metrics()
+    for metrics in uvm.get_all_metrics():
+        for k in metrics_to_check:
+            assert (
+                metrics["memory_hotplug"][k] == 0
+            ), f"{k}={metrics['memory_hotplug'][k]} is greater than zero"
 
+
+def check_device_detected(uvm):
+    """
+    Check that the guest kernel has enabled virtio-mem.
+    """
+    hp_config = uvm.api.memory_hotplug.get().json()
     _, stdout, _ = uvm.ssh.check_output("dmesg | grep 'virtio_mem'")
     for line in stdout.splitlines():
         _, key, value = line.strip().split(":")
@@ -27,12 +126,162 @@ def test_virtio_mem_detected(uvm_plain_6_1):
             case "start address":
                 assert value == (512 << 30), "start address doesn't match"
             case "region size":
-                assert value == 1024 << 20, "region size doesn't match"
+                assert (
+                    value == hp_config["total_size_mib"] << 20
+                ), "region size doesn't match"
             case "device block size":
-                assert value == 2 << 20, "block size doesn't match"
+                assert (
+                    value == hp_config["block_size_mib"] << 20
+                ), "block size doesn't match"
             case "plugged size":
                 assert value == 0, "plugged size doesn't match"
             case "requested size":
                 assert value == 0, "requested size doesn't match"
             case _:
                 continue
+
+
+def check_memory_usable(uvm):
+    """Allocates 95% of the guest's available memory to verify it's usable (5% margin to avoid OOM-kill)"""
+    mem_available = MeminfoGuest(uvm).get().mem_available.bytes()
+    # element count assuming 8 bytes per list entry, sized to 95% of available memory
+    count = mem_available * 95 // 100 // 8
+
+    uvm.ssh.check_output(
+        f"python3 -c 'Q = 0x0123456789abcdef; a = [Q] * {count}; assert all(q == Q for q in a)'"
+    )
+
+
+def check_hotplug(uvm, requested_size_mib):
+    """Resizes hotplugged memory to requested_size_mib and verifies the guest's MemTotal follows"""
+    meminfo = MeminfoGuest(uvm)
+    mem_total_fixed = (
+        meminfo.get().mem_total.mib()
+        - uvm.api.memory_hotplug.get().json()["plugged_size_mib"]
+    )
+    uvm.hotplug_memory(requested_size_mib)
+
+    # verify guest driver received the request (last 'requested size' line in dmesg)
+    _, stdout, _ = uvm.ssh.check_output(
+        "dmesg | grep 'virtio_mem' | grep 'requested size' | tail -1"
+    )
+    assert (
+        int(stdout.strip().split(":")[-1].strip(), base=0) == requested_size_mib << 20
+    )
+
+    for attempt in Retrying(
+        retry=retry_if_exception_type(AssertionError),
+        stop=stop_after_delay(5),
+        wait=wait_fixed(1),
+        reraise=True,
+    ):
+        with attempt:
+            # verify guest driver executed the request
+            mem_total_after = meminfo.get().mem_total.mib()
+            assert mem_total_after == mem_total_fixed + requested_size_mib
+
+
+def check_hotunplug(uvm, requested_size_mib):
+    """Resizes to requested_size_mib; RSS is only asserted to shrink for 2MB hugetlbfs with discard support"""
+
+    rss_before = get_resident_memory(uvm.ps)
+
+    check_hotplug(uvm, requested_size_mib)
+
+    rss_after = get_resident_memory(uvm.ps)
+
+    print(f"RSS before: {rss_before}, after: {rss_after}")
+
+    huge_pages = HugePagesConfig(uvm.api.machine_config.get().json()["huge_pages"])
+    if huge_pages == HugePagesConfig.HUGETLBFS_2MB and supports_hugetlbfs_discard():
+        assert rss_after < rss_before, "RSS didn't decrease"
+
+
+def test_virtio_mem_hotplug_hotunplug(uvm_any_memhp):
+    """
+    Check that memory can be hotplugged, used, unplugged and hotplugged again.
+    """
+    uvm = uvm_any_memhp
+    check_device_detected(uvm)
+
+    check_hotplug(uvm, 1024)
+    check_memory_usable(uvm)
+
+    check_hotunplug(uvm, 0)
+
+    # Check it works again
+    check_hotplug(uvm, 1024)
+    check_memory_usable(uvm)
+
+    validate_metrics(uvm)
+
+
+@pytest.mark.parametrize(
+    "memhp_config",
+    [
+        {"total_size_mib": 256, "slot_size_mib": 128, "block_size_mib": 64},
+        {"total_size_mib": 256, "slot_size_mib": 128, "block_size_mib": 128},
+        {"total_size_mib": 256, "slot_size_mib": 256, "block_size_mib": 64},
+        {"total_size_mib": 256, "slot_size_mib": 256, "block_size_mib": 256},
+    ],
+    ids=["all_different", "slot_sized_block", "single_slot", "single_block"],
+)
+def test_virtio_mem_configs(uvm_plain_6_1, memhp_config):
+    """
+    Check that the virtio-mem device works for different configs (PCI only; MMIO is skipped before booting)
+    """
+    if not uvm_plain_6_1.pci_enabled:
+        pytest.skip(
+            "Skip tests on MMIO transport to save time as we don't expect any difference."
+        )
+    uvm = uvm_booted_memhp(uvm_plain_6_1, None, None, False, memhp_config, None, None)
+
+    check_device_detected(uvm)
+
+    for size in range(
+        0, memhp_config["total_size_mib"] + 1, memhp_config["block_size_mib"]
+    ):
+        check_hotplug(uvm, size)
+
+    check_memory_usable(uvm)
+
+    for size in range(
+        memhp_config["total_size_mib"] - memhp_config["block_size_mib"],
+        -1,
+        -memhp_config["block_size_mib"],
+    ):
+        check_hotunplug(uvm, size)
+
+    validate_metrics(uvm)
+
+
+def test_snapshot_restore_persistence(uvm_plain_6_1, microvm_factory):
+    """
+    Check that the contents of hotplugged memory are preserved across snapshot/restore.
+    """
+    if not uvm_plain_6_1.pci_enabled:
+        pytest.skip(
+            "Skip tests on MMIO transport to save time as we don't expect any difference."
+        )
+    uvm = uvm_booted_memhp(
+        uvm_plain_6_1, None, microvm_factory, False, DEFAULT_CONFIG, None, None
+    )
+
+    uvm.hotplug_memory(1024)
+
+    # Increase /dev/shm size as it defaults to half of the boot memory
+    uvm.ssh.check_output("mount -o remount,size=1024M -t tmpfs tmpfs /dev/shm")
+
+    uvm.ssh.check_output("dd if=/dev/urandom of=/dev/shm/mem_hp_test bs=1M count=1024")
+
+    _, checksum_before, _ = uvm.ssh.check_output("sha256sum /dev/shm/mem_hp_test")
+
+    restored_vm = microvm_factory.clone_uvm(uvm)
+
+    _, checksum_after, _ = restored_vm.ssh.check_output(
+        "sha256sum /dev/shm/mem_hp_test"
+    )
+
+    assert checksum_before == checksum_after, "Checksums didn't match"
+
+    validate_metrics(restored_vm)