Skip to content

Commit 2daa6bf

Browse files
committed
test: Add post-restore population latency test
While the post_restore_latency test measures the time on-demand faulting takes from the guest's perspective, this test measures the actual time it takes to fault in guest memory. We cannot collect this data by simply using the fault_all handler in the post-restore test, because the handler will get triggered way before the fast_page_fault_helper script continues running: the first page fault will be triggered by sshd or the kernel, so by the time our helper runs, the uffd handler will already be done and we won't notice any latency. Therefore, have the fault_all handler print the time it took to populate all of guest memory to its log file, and parse this number. Signed-off-by: Patrick Roy <[email protected]>
1 parent 6a46105 commit 2daa6bf

File tree

2 files changed

+43
-0
lines changed

2 files changed

+43
-0
lines changed

src/firecracker/examples/uffd/fault_all_handler.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ use std::fs::File;
1111
use std::os::unix::net::UnixListener;
1212

1313
use uffd_utils::{Runtime, UffdHandler};
14+
use utils::time::{get_time_us, ClockType};
1415

1516
fn main() {
1617
let mut args = std::env::args();
@@ -34,9 +35,13 @@ fn main() {
3435

3536
match event {
3637
userfaultfd::Event::Pagefault { .. } => {
38+
let start = get_time_us(ClockType::Monotonic);
3739
for region in uffd_handler.mem_regions.clone() {
3840
uffd_handler.serve_pf(region.base_host_virt_addr as _, region.size);
3941
}
42+
let end = get_time_us(ClockType::Monotonic);
43+
44+
println!("Finished Faulting All: {}us", end - start);
4045
}
4146
_ => panic!("Unexpected event on userfaultfd"),
4247
}

tests/integration_tests/performance/test_snapshot_ab.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
22
# SPDX-License-Identifier: Apache-2.0
33
"""Performance benchmark for snapshot restore."""
4+
import re
45
import signal
56
import tempfile
67
import time
@@ -172,3 +173,40 @@ def test_post_restore_latency(
172173
)
173174

174175
metrics.put_metric("fault_latency", int(duration) / NS_IN_MSEC, "Milliseconds")
176+
177+
178+
@pytest.mark.nonci
179+
@pytest.mark.parametrize("huge_pages", HugePagesConfig)
180+
def test_population_latency(
181+
microvm_factory, rootfs, guest_kernel_linux_5_10, metrics, huge_pages
182+
):
183+
"""Collects population latency metrics (e.g. how long it takes UFFD handler to fault in all memory)"""
184+
test_setup = SnapshotRestoreTest(mem=128, vcpus=1, huge_pages=huge_pages)
185+
vm = test_setup.configure_vm(
186+
microvm_factory, guest_kernel_linux_5_10, rootfs, metrics
187+
)
188+
snapshot = vm.snapshot_full()
189+
vm.kill()
190+
191+
for microvm in microvm_factory.build_n_from_snapshot(
192+
snapshot, ITERATIONS, uffd_handler_name="fault_all"
193+
):
194+
# do _something_ to trigger a pagefault, which will then cause the UFFD handler to fault in _everything_
195+
microvm.ssh.check_output("true")
196+
197+
for _ in range(5):
198+
time.sleep(1)
199+
200+
match = re.match(
201+
r"Finished Faulting All: (\d+)us", microvm.uffd_handler.log_data
202+
)
203+
204+
if match:
205+
latency_us = int(match.group(1))
206+
207+
metrics.put_metric(
208+
"populate_latency", latency_us / 1000, "Milliseconds"
209+
)
210+
break
211+
else:
212+
raise RuntimeError("UFFD handler did not print population latency after 5s")

0 commit comments

Comments
 (0)