
Commit 00ae82a

test: Only emit metrics from PR CI perf tests
The snapshot performance tests for which regressions are critical have been moved to the nightly CI, where our nightly infrastructure will notify us of regressions. The remaining scenarios (creation latency and resume of old snapshot versions) are either not on any hotpath (creation) or simply not used anymore. Since these tests are also known to be very flaky in our CI, make them only emit metrics, but otherwise be infallible.

Signed-off-by: Patrick Roy <[email protected]>
1 parent 1dc5fc3 commit 00ae82a
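The mechanism the commit relies on is visible in the diff below: a measurement definition created with an empty list of statistical functions and an empty criteria dict never yields a pass/fail verdict, so the consumer still submits every latency sample as a metric but can no longer fail the test. Here is a minimal before/after sketch, assuming the framework.stats API exactly as it appears in this diff; the 250 ms baseline is a made-up placeholder, not a value from the repository.

from framework.stats import criteria, function, types

# Gated measurement (old shape): the consumer computes Max over the samples
# and fails the test if it exceeds the baseline target.
gated = types.MeasurementDef.create_measurement(
    "latency",
    "ms",
    [function.Max("max")],
    {"max": criteria.LowerThan({"target": 250})},  # placeholder baseline, not from the repo
)

# Ungated measurement (new shape): no functions, no criteria. Samples are
# still emitted as metrics, but there is nothing that can fail.
ungated = types.MeasurementDef.create_measurement("latency", "ms", [], {})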

File tree

1 file changed: +31 −148 lines

tests/integration_tests/performance/test_snapshot_perf.py

Lines changed: 31 additions & 148 deletions
@@ -11,150 +11,36 @@

 import host_tools.logging as log_tools
 from framework.artifacts import NetIfaceConfig
-from framework.builder import MicrovmBuilder, SnapshotBuilder, SnapshotType
-from framework.stats import consumer, criteria, function, producer, types
-from framework.utils import CpuMap, eager_map, get_kernel_version
-from framework.utils_cpuid import get_instance_type
+from framework.builder import MicrovmBuilder, SnapshotBuilder
+from framework.stats import consumer, producer, types
+from framework.utils import CpuMap

 # How many latencies do we sample per test.
 SAMPLE_COUNT = 3
 USEC_IN_MSEC = 1000
 PLATFORM = platform.machine()

-
-# Latencies in milliseconds.
-# The latency for snapshot creation has high variance due to scheduler noise.
-# The issue is tracked here:
-# https://github.com/firecracker-microvm/firecracker/issues/2346
-# TODO: Update baseline values after fix.
-CREATE_LATENCY_BASELINES = {
-    ("x86_64", "2vcpu_256mb.json", "FULL"): 180,
-    ("x86_64", "2vcpu_256mb.json", "DIFF"): 70,
-    ("x86_64", "2vcpu_512mb.json", "FULL"): 280,
-    ("x86_64", "2vcpu_512mb.json", "DIFF"): 90,
-    ("aarch64", "2vcpu_256mb.json", "FULL"): 160,
-    ("aarch64", "2vcpu_256mb.json", "DIFF"): 70,
-    ("aarch64", "2vcpu_512mb.json", "FULL"): 300,
-    ("aarch64", "2vcpu_512mb.json", "DIFF"): 75,
-}
-
-# The latencies for x86 are pretty high due to a design
-# in the cgroups V1 implementation in the kernel. We recommend
-# switching to cgroups v2 for much lower snap resume latencies.
-# More details on this:
-# https://github.com/firecracker-microvm/firecracker/issues/2027
-# Latencies for snap resume on cgroups V2 can be found in our
-# long-running performance configs (i.e. integration_tests/performance/configs).
-LOAD_LATENCY_BASELINES = {
-    ("m5d.metal", "4.14", "sync", "2vcpu_256mb.json"): 9,
-    ("m5d.metal", "4.14", "sync", "2vcpu_512mb.json"): 9,
-    ("m5d.metal", "5.10", "sync", "2vcpu_256mb.json"): 70,
-    ("m5d.metal", "5.10", "sync", "2vcpu_512mb.json"): 90,
-    ("m5d.metal", "5.10", "async", "2vcpu_256mb.json"): 210,
-    ("m5d.metal", "5.10", "async", "2vcpu_512mb.json"): 210,
-    ("m5d.metal", "6.1", "sync", "2vcpu_256mb.json"): 255,
-    ("m5d.metal", "6.1", "sync", "2vcpu_512mb.json"): 245,
-    ("m5d.metal", "6.1", "async", "2vcpu_256mb.json"): 245,
-    ("m5d.metal", "6.1", "async", "2vcpu_512mb.json"): 225,
-    ("m6a.metal", "4.14", "sync", "2vcpu_256mb.json"): 15,
-    ("m6a.metal", "4.14", "sync", "2vcpu_512mb.json"): 19,
-    ("m6a.metal", "5.10", "sync", "2vcpu_256mb.json"): 75,
-    ("m6a.metal", "5.10", "sync", "2vcpu_512mb.json"): 75,
-    ("m6a.metal", "5.10", "async", "2vcpu_256mb.json"): 220,
-    ("m6a.metal", "5.10", "async", "2vcpu_512mb.json"): 220,
-    ("m6a.metal", "6.1", "sync", "2vcpu_256mb.json"): 250,
-    ("m6a.metal", "6.1", "sync", "2vcpu_512mb.json"): 250,
-    ("m6a.metal", "6.1", "async", "2vcpu_256mb.json"): 250,
-    ("m6a.metal", "6.1", "async", "2vcpu_512mb.json"): 300,
-    ("m6i.metal", "4.14", "sync", "2vcpu_256mb.json"): 9,
-    ("m6i.metal", "4.14", "sync", "2vcpu_512mb.json"): 9,
-    ("m6i.metal", "5.10", "sync", "2vcpu_256mb.json"): 70,
-    ("m6i.metal", "5.10", "sync", "2vcpu_512mb.json"): 70,
-    ("m6i.metal", "5.10", "async", "2vcpu_256mb.json"): 245,
-    ("m6i.metal", "5.10", "async", "2vcpu_512mb.json"): 245,
-    ("m6i.metal", "6.1", "sync", "2vcpu_256mb.json"): 220,
-    ("m6i.metal", "6.1", "sync", "2vcpu_512mb.json"): 250,
-    ("m6i.metal", "6.1", "async", "2vcpu_256mb.json"): 220,
-    ("m6i.metal", "6.1", "async", "2vcpu_512mb.json"): 220,
-    ("m6g.metal", "4.14", "sync", "2vcpu_256mb.json"): 3,
-    ("m6g.metal", "4.14", "sync", "2vcpu_512mb.json"): 3,
-    ("m6g.metal", "5.10", "sync", "2vcpu_256mb.json"): 3,
-    ("m6g.metal", "5.10", "sync", "2vcpu_512mb.json"): 3,
-    ("m6g.metal", "5.10", "async", "2vcpu_256mb.json"): 320,
-    ("m6g.metal", "5.10", "async", "2vcpu_512mb.json"): 380,
-    ("m6g.metal", "6.1", "sync", "2vcpu_256mb.json"): 2,
-    ("m6g.metal", "6.1", "sync", "2vcpu_512mb.json"): 3,
-    ("m6g.metal", "6.1", "async", "2vcpu_256mb.json"): 2,
-    ("m6g.metal", "6.1", "async", "2vcpu_512mb.json"): 3,
-    ("c7g.metal", "4.14", "sync", "2vcpu_256mb.json"): 2,
-    ("c7g.metal", "4.14", "sync", "2vcpu_512mb.json"): 2,
-    ("c7g.metal", "5.10", "sync", "2vcpu_256mb.json"): 2,
-    ("c7g.metal", "5.10", "sync", "2vcpu_512mb.json"): 3,
-    ("c7g.metal", "5.10", "async", "2vcpu_256mb.json"): 320,
-    ("c7g.metal", "5.10", "async", "2vcpu_512mb.json"): 360,
-    ("c7g.metal", "6.1", "sync", "2vcpu_256mb.json"): 2,
-    ("c7g.metal", "6.1", "sync", "2vcpu_512mb.json"): 3,
-    ("c7g.metal", "6.1", "async", "2vcpu_256mb.json"): 2,
-    ("c7g.metal", "6.1", "async", "2vcpu_512mb.json"): 3,
-}
-
-
-def snapshot_create_measurements(vm_type, snapshot_type):
-    """Define measurements for snapshot create tests."""
-    lower_than = {
-        "target": CREATE_LATENCY_BASELINES[
-            platform.machine(),
-            vm_type,
-            "FULL" if snapshot_type == SnapshotType.FULL else "DIFF",
-        ]
-    }
-
-    latency = types.MeasurementDef.create_measurement(
-        "latency",
-        "ms",
-        [function.Max("max")],
-        {"max": criteria.LowerThan(lower_than)},
-    )
-
-    return [latency]
+# measurement without pass criteria = test is infallible but still submits metrics. Nice!
+LATENCY_MEASUREMENT = types.MeasurementDef.create_measurement(
+    "latency",
+    "ms",
+    [],
+    {},
+)


-def snapshot_resume_measurements(vm_type, io_engine):
-    """Define measurements for snapshot resume tests."""
-    load_latency = {
-        "target": LOAD_LATENCY_BASELINES[
-            get_instance_type(), get_kernel_version(level=1), io_engine, vm_type
-        ]
-    }
-
-    latency = types.MeasurementDef.create_measurement(
-        "latency",
-        "ms",
-        [function.Max("max")],
-        {"max": criteria.LowerThan(load_latency)},
-    )
-
-    return [latency]
-
-
-def snapshot_create_producer(
-    logger, vm, disks, ssh_key, target_version, metrics_fifo, snapshot_type
-):
+def snapshot_create_producer(logger, vm, disks, ssh_key, target_version, metrics_fifo):
     """Produce results for snapshot create tests."""
     snapshot_builder = SnapshotBuilder(vm)
     snapshot_builder.create(
         disks=disks,
         ssh_key=ssh_key,
-        snapshot_type=snapshot_type,
         target_version=target_version,
         use_ramdisk=True,
     )
     metrics = vm.flush_metrics(metrics_fifo)

-    if snapshot_type == SnapshotType.FULL:
-        value = metrics["latencies_us"]["full_create_snapshot"] / USEC_IN_MSEC
-    else:
-        value = metrics["latencies_us"]["diff_create_snapshot"] / USEC_IN_MSEC
+    value = metrics["latencies_us"]["full_create_snapshot"] / USEC_IN_MSEC

     logger.info("Latency {} ms".format(value))

@@ -203,6 +89,12 @@ def test_older_snapshot_resume_latency(
     With each previous firecracker version, create a snapshot and try to
     restore in current version.
     """
+
+    # The guest kernel does not "participate" in snapshot restore, so just pick some
+    # arbitrary one
+    if "4.14" not in guest_kernel.name():
+        pytest.skip()
+
     logger = logging.getLogger("old_snapshot_load")
     jailer = firecracker_release.jailer()
     fc_version = firecracker_release.base_name()[1:]
@@ -252,39 +144,37 @@ def test_older_snapshot_resume_latency(
         ),
         func_kwargs={},
     )
-    eager_map(
-        cons.set_measurement_def,
-        snapshot_resume_measurements(microvm_cfg, io_engine.lower()),
-    )
+    cons.set_measurement_def(LATENCY_MEASUREMENT)

     st_core.add_pipe(producer=prod, consumer=cons, tag=microvm_cfg)
     # Gather results and verify pass criteria.
     st_core.run_exercise()


-@pytest.mark.parametrize("guest_mem_mib", [256, 512])
-@pytest.mark.parametrize("snapshot_type", [SnapshotType.FULL, SnapshotType.DIFF])
 def test_snapshot_create_latency(
     microvm_factory,
     guest_kernel,
     rootfs,
-    guest_mem_mib,
-    snapshot_type,
     firecracker_release,
     st_core,
 ):
     """
-    Test scenario: Full/Diff snapshot create performance measurement.
+    Test scenario: Full snapshot create performance measurement.

     Testing matrix:
     - Guest kernel: all supported ones
     - Rootfs: Ubuntu 18.04
-    - Microvm: 2vCPU with 256/512 MB RAM
-    TODO: Multiple microvm sizes must be tested in the async pipeline.
+    - Microvm: 2vCPU with 512 MB RAM
     """
+
+    # The guest kernel does not "participate" in snapshot restore, so just pick some
+    # arbitrary one
+    if "4.14" not in guest_kernel.name():
+        pytest.skip()
+
     logger = logging.getLogger("snapshot_sequence")

-    diff_snapshots = snapshot_type == SnapshotType.DIFF
+    guest_mem_mib = 512
     vcpus = 2
     microvm_cfg = f"{vcpus}vcpu_{guest_mem_mib}mb.json"
     vm = microvm_factory.build(guest_kernel, rootfs, monitor_memory=False)
@@ -293,7 +183,6 @@ def test_snapshot_create_latency(
         vcpu_count=vcpus,
         mem_size_mib=guest_mem_mib,
         use_initrd=True,
-        track_dirty_pages=diff_snapshots,
     )

     # Configure metrics system.
@@ -319,12 +208,10 @@ def test_snapshot_create_latency(
             idx_vcpu, current_cpu_id + idx_vcpu
         ), f"Failed to pin fc_vcpu {idx_vcpu} thread."

-    st_core.name = f"snapshot_create_{snapshot_type}_latency"
+    st_core.name = f"snapshot_create_SnapshotType.FULL_latency"
     st_core.iterations = SAMPLE_COUNT
     st_core.custom["guest_config"] = microvm_cfg.strip(".json")
-    st_core.custom["snapshot_type"] = (
-        "FULL" if snapshot_type == SnapshotType.FULL else "DIFF"
-    )
+    st_core.custom["snapshot_type"] = "FULL"

     prod = producer.LambdaProducer(
         func=snapshot_create_producer,
@@ -335,7 +222,6 @@ def test_snapshot_create_latency(
             "ssh_key": rootfs.ssh_key(),
             "target_version": firecracker_release.snapshot_version,
             "metrics_fifo": metrics_fifo,
-            "snapshot_type": snapshot_type,
         },
     )

@@ -345,10 +231,7 @@ def test_snapshot_create_latency(
         ),
         func_kwargs={},
     )
-    eager_map(
-        cons.set_measurement_def,
-        snapshot_create_measurements(microvm_cfg, snapshot_type),
-    )
+    cons.set_measurement_def(LATENCY_MEASUREMENT)

     st_core.add_pipe(producer=prod, consumer=cons, tag=microvm_cfg)
     # Gather results and verify pass criteria.
