test: Add post-restore population latency test

roypat · roypat · commit 2daa6bf7958e · 2025-03-04T09:21:41.000Z
While the post_restore_latency test measures the time on-demand faulting
takes from the guest's perspective, this test measures the actual time
it takes to fault in guest memory. We cannot collect this data by simply
using the fault_all handler in the post-restore test, because the
handler will get triggered way before the fast_page_fault_helper script
continues running (because the first page fault will be triggered by
sshd or the kernel, so by the time our helper runs, the uffd handler
will already be done and we won't notice any latency). Therefore, have
the fault_all handler print the time it took to populate all of guest
memory to its log file, and parse this number.

Signed-off-by: Patrick Roy &lt;roypat@amazon.co.uk&gt;
diff --git a/src/firecracker/examples/uffd/fault_all_handler.rs b/src/firecracker/examples/uffd/fault_all_handler.rs
@@ -11,6 +11,7 @@ use std::fs::File;
 use std::os::unix::net::UnixListener;
 
 use uffd_utils::{Runtime, UffdHandler};
+use utils::time::{get_time_us, ClockType};
 
 fn main() {
     let mut args = std::env::args();
@@ -34,9 +35,13 @@ fn main() {
 
         match event {
             userfaultfd::Event::Pagefault { .. } => {
+                let start = get_time_us(ClockType::Monotonic);
                 for region in uffd_handler.mem_regions.clone() {
                     uffd_handler.serve_pf(region.base_host_virt_addr as _, region.size);
                 }
+                let end = get_time_us(ClockType::Monotonic);
+
+                println!("Finished Faulting All: {}us", end - start);
             }
             _ => panic!("Unexpected event on userfaultfd"),
         }
diff --git a/tests/integration_tests/performance/test_snapshot_ab.py b/tests/integration_tests/performance/test_snapshot_ab.py
@@ -1,6 +1,7 @@
 # Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 # SPDX-License-Identifier: Apache-2.0
 """Performance benchmark for snapshot restore."""
+import re
 import signal
 import tempfile
 import time
@@ -172,3 +173,40 @@ def test_post_restore_latency(
         )
 
         metrics.put_metric("fault_latency", int(duration) / NS_IN_MSEC, "Milliseconds")
+
+
+@pytest.mark.nonci
+@pytest.mark.parametrize("huge_pages", HugePagesConfig)
+def test_population_latency(
+    microvm_factory, rootfs, guest_kernel_linux_5_10, metrics, huge_pages
+):
+    """Collects population latency metrics (e.g. how long it takes UFFD handler to fault in all memory)"""
+    test_setup = SnapshotRestoreTest(mem=128, vcpus=1, huge_pages=huge_pages)
+    vm = test_setup.configure_vm(
+        microvm_factory, guest_kernel_linux_5_10, rootfs, metrics
+    )
+    snapshot = vm.snapshot_full()
+    vm.kill()
+
+    for microvm in microvm_factory.build_n_from_snapshot(
+        snapshot, ITERATIONS, uffd_handler_name="fault_all"
+    ):
+        # do _something_ to trigger a pagefault, which will then cause the UFFD handler to fault in _everything_
+        microvm.ssh.check_output("true")
+
+        for _ in range(5):
+            time.sleep(1)
+
+            match = re.match(
+                r"Finished Faulting All: (\d+)us", microvm.uffd_handler.log_data
+            )
+
+            if match:
+                latency_us = int(match.group(1))
+
+                metrics.put_metric(
+                    "populate_latency", latency_us / 1000, "Milliseconds"
+                )
+                break
+        else:
+            raise RuntimeError("UFFD handler did not print population latency after 5s")

Original file line number	Diff line number	Diff line change
`@@ -11,6 +11,7 @@ use std::fs::File;`
`11`	`11`	`use std::os::unix::net::UnixListener;`
`12`	`12`
`13`	`13`	`use uffd_utils::{Runtime, UffdHandler};`
	`14`	`+use utils::time::{get_time_us, ClockType};`
`14`	`15`
`15`	`16`	`fn main() {`
`16`	`17`	`let mut args = std::env::args();`
`@@ -34,9 +35,13 @@ fn main() {`
`34`	`35`
`35`	`36`	`match event {`
`36`	`37`	`userfaultfd::Event::Pagefault { .. } => {`
	`38`	`+ let start = get_time_us(ClockType::Monotonic);`
`37`	`39`	`for region in uffd_handler.mem_regions.clone() {`
`38`	`40`	`uffd_handler.serve_pf(region.base_host_virt_addr as _, region.size);`
`39`	`41`	`}`
	`42`	`+ let end = get_time_us(ClockType::Monotonic);`
	`43`	`+`
	`44`	`+ println!("Finished Faulting All: {}us", end - start);`
`40`	`45`	`}`
`41`	`46`	`_ => panic!("Unexpected event on userfaultfd"),`
`42`	`47`	`}`