test(perf): add block latency test

roypat · roypat · commit 391331a71819 · 2025-04-23T18:17:21.000+01:00
fio emits latency metrics regarding how much time was spent inside the
guest operating system (submission latency, slat) or how much time was
spent in the device (clat). For firecracker, the latter could be
relevant, so add a test that specifically emits these.

We have to add a separate test, as we need to use a synchronous fio
worker to get non-volatile metrics. However, for throughput tests the
use of the async engine in the guest is required to get maximum
throughput. We can reduce the permutations slightly by using randrw
instead of separate randread and randwrite (with the idea being that if
all requests are synchronous and only measure latency, then doing
alternating reads and writes is okay, while for the throughput tests it's
not wanted).

Signed-off-by: Patrick Roy &lt;roypat@amazon.co.uk&gt;
diff --git a/.buildkite/pipeline_perf.py b/.buildkite/pipeline_perf.py
@@ -18,7 +18,7 @@
 perf_test = {
     "virtio-block": {
         "label": "💿 Virtio Block Performance",
-        "test_path": "integration_tests/performance/test_block_ab.py::test_block_performance",
+        "test_path": "integration_tests/performance/test_block_ab.py::test_block_performance integration_tests/performance/test_block_ab.py::test_block_latency",
         "devtool_opts": "-c 1-10 -m 0",
     },
     "vhost-user-block": {
diff --git a/tests/integration_tests/performance/test_block_ab.py b/tests/integration_tests/performance/test_block_ab.py
@@ -3,6 +3,7 @@
 """Performance benchmark for block device emulation."""
 
 import concurrent
+import glob
 import os
 import shutil
 from pathlib import Path
@@ -44,7 +45,7 @@ def prepare_microvm_for_test(microvm):
     check_output("echo 3 > /proc/sys/vm/drop_caches")
 
 
-def run_fio(microvm, mode, block_size):
+def run_fio(microvm, mode, block_size, *, fio_engine="libaio"):
     """Run a fio test in the specified mode with block size bs."""
     cmd = (
         CmdBuilder("fio")
@@ -59,7 +60,7 @@ def run_fio(microvm, mode, block_size):
         .with_arg("--randrepeat=0")
         .with_arg(f"--bs={block_size}")
         .with_arg(f"--size={BLOCK_DEVICE_SIZE_MB}M")
-        .with_arg("--ioengine=libaio")
+        .with_arg(f"--ioengine={fio_engine}")
         .with_arg("--iodepth=32")
         # Set affinity of the entire fio process to a set of vCPUs equal in size to number of workers
         .with_arg(
@@ -68,6 +69,7 @@ def run_fio(microvm, mode, block_size):
         # Instruct fio to pin one worker per vcpu
         .with_arg("--cpus_allowed_policy=split")
         .with_arg(f"--write_bw_log={mode}")
+        .with_arg(f"--write_lat_log={mode}")
         .with_arg("--log_avg_msec=1000")
         .build()
     )
@@ -101,40 +103,55 @@ def run_fio(microvm, mode, block_size):
         return logs_path, cpu_load_future.result()
 
 
-def process_fio_logs(vm, fio_mode, logs_dir, metrics):
-    """Parses the fio logs in `{logs_dir}/{fio_mode}_bw.*.log and emits their contents as CloudWatch metrics"""
-
+def process_fio_log_files(logs_glob):
+    """Parses all fio log files matching the given glob and yields, for each line position shared by all files, the lists of read and write values found at that position"""
     data = [
-        Path(f"{logs_dir}/{fio_mode}_bw.{job_id + 1}.log")
-        .read_text("UTF-8")
-        .splitlines()
-        for job_id in range(vm.vcpus_count)
+        Path(pathname).read_text("UTF-8").splitlines()
+        for pathname in glob.glob(logs_glob)
     ]
 
+    assert data, "no log files found!"
+
     for tup in zip(*data):
-        bw_read = 0
-        bw_write = 0
+        read_values = []
+        write_values = []
 
         for line in tup:
+            # See https://fio.readthedocs.io/en/latest/fio_doc.html#log-file-formats
             _, value, direction, _ = line.split(",", maxsplit=3)
             value = int(value.strip())
 
-            # See https://fio.readthedocs.io/en/latest/fio_doc.html#log-file-formats
             match direction.strip():
                 case "0":
-                    bw_read += value
+                    read_values.append(value)
                 case "1":
-                    bw_write += value
+                    write_values.append(value)
                 case _:
                     assert False
 
+        yield read_values, write_values
+
+
+def emit_fio_throughput_metrics(logs_dir, metrics):
+    """Parses the fio logs in `{logs_dir}/*_bw.*.log`, sums the read and write bandwidth values of each yielded group and emits the sums as CloudWatch metrics"""
+    for bw_read, bw_write in process_fio_log_files(f"{logs_dir}/*_bw.*.log"):
         if bw_read:
-            metrics.put_metric("bw_read", bw_read, "Kilobytes/Second")
+            metrics.put_metric("bw_read", sum(bw_read), "Kilobytes/Second")
         if bw_write:
-            metrics.put_metric("bw_write", bw_write, "Kilobytes/Second")
+            metrics.put_metric("bw_write", sum(bw_write), "Kilobytes/Second")
+
+
+def emit_fio_latency_metrics(logs_dir, metrics):
+    """Parses the fio logs in `{logs_dir}/*_clat.*.log` and emits each read and write latency value as a CloudWatch metric"""
+    for lat_read, lat_write in process_fio_log_files(f"{logs_dir}/*_clat.*.log"):
+        # latency values in fio logs are in nanoseconds, but cloudwatch only supports
+        # microseconds as its finest-grained unit, so we need to divide by 1000.
+        for value in lat_read:
+            metrics.put_metric("clat_read", value / 1000, "Microseconds")
+        for value in lat_write:
+            metrics.put_metric("clat_write", value / 1000, "Microseconds")
 
 
-@pytest.mark.timeout(120)
 @pytest.mark.nonci
 @pytest.mark.parametrize("vcpus", [1, 2], ids=["1vcpu", "2vcpu"])
 @pytest.mark.parametrize("fio_mode", ["randread", "randwrite"])
@@ -175,15 +192,61 @@ def test_block_performance(
 
     vm.pin_threads(0)
 
-    logs_dir, cpu_util = run_fio(vm, fio_mode, fio_block_size)
+    # throughput tests need the async engine in the guest to reach maximum throughput.
+    logs_dir, cpu_util = run_fio(vm, fio_mode, fio_block_size, fio_engine="libaio")
 
-    process_fio_logs(vm, fio_mode, logs_dir, metrics)
+    emit_fio_throughput_metrics(logs_dir, metrics)
 
     for thread_name, values in cpu_util.items():
         for value in values:
             metrics.put_metric(f"cpu_utilization_{thread_name}", value, "Percent")
 
 
+@pytest.mark.nonci
+@pytest.mark.parametrize("vcpus", [1, 2], ids=["1vcpu", "2vcpu"])
+@pytest.mark.parametrize("fio_mode", ["randrw"])
+@pytest.mark.parametrize("fio_block_size", [4096], ids=["bs4096"])
+def test_block_latency(
+    microvm_factory,
+    guest_kernel_acpi,
+    rootfs,
+    vcpus,
+    fio_mode,
+    fio_block_size,
+    io_engine,
+    metrics,
+):
+    """Emit fio completion latency (clat) metrics for the emulated block device."""
+    vm = microvm_factory.build(guest_kernel_acpi, rootfs, monitor_memory=False)
+    vm.spawn(log_level="Info", emit_metrics=True)
+    vm.basic_config(vcpu_count=vcpus, mem_size_mib=GUEST_MEM_MIB)
+    vm.add_net_iface()
+    # Add a secondary block device for benchmark tests.
+    fs = drive_tools.FilesystemFile(
+        os.path.join(vm.fsfiles, "scratch"), BLOCK_DEVICE_SIZE_MB
+    )
+    vm.add_drive("scratch", fs.path, io_engine=io_engine)
+    vm.start()
+
+    metrics.set_dimensions(
+        {
+            "performance_test": "test_block_latency",
+            "io_engine": io_engine,
+            "fio_mode": fio_mode,
+            "fio_block_size": str(fio_block_size),
+            **vm.dimensions,
+        }
+    )
+
+    vm.pin_threads(0)
+
+    # Latency metrics with the async engine are very volatile, so use fio's
+    # synchronous psync engine instead of run_fio's libaio default.
+    logs_dir, _ = run_fio(vm, fio_mode, fio_block_size, fio_engine="psync")
+
+    emit_fio_latency_metrics(logs_dir, metrics)
+
+
 @pytest.mark.nonci
 @pytest.mark.parametrize("vcpus", [1, 2], ids=["1vcpu", "2vcpu"])
 @pytest.mark.parametrize("fio_mode", ["randread"])
@@ -226,7 +289,7 @@ def test_block_vhost_user_performance(
 
     logs_dir, cpu_util = run_fio(vm, fio_mode, fio_block_size)
 
-    process_fio_logs(vm, fio_mode, logs_dir, metrics)
+    emit_fio_throughput_metrics(logs_dir, metrics)
 
     for thread_name, values in cpu_util.items():
         for value in values: