From 0265aea8b1969374457487fd9f5030032a3fd546 Mon Sep 17 00:00:00 2001
From: Patrick Roy
Date: Fri, 2 May 2025 12:25:21 +0100
Subject: [PATCH 1/3] doc: fix arguments to ab_test.py in README

It's --pytest-opts, not --test anymore.

Fixes: 316d9554da08 ("test(ab): generalize --test to --pytest-opts")
Signed-off-by: Patrick Roy
---
 tests/README.md | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/README.md b/tests/README.md
index c306566392f..677544889ad 100644
--- a/tests/README.md
+++ b/tests/README.md
@@ -184,7 +184,7 @@ function to [`.buildkite/pipeline_perf.py`](../.buildkite/pipeline_perf.py).
 To manually run an A/B-Test, use
 
 ```sh
-tools/devtool -y test --ab [optional arguments to ab_test.py] run <dir A> <dir B> --test <test specification>
+tools/devtool -y test --ab [optional arguments to ab_test.py] run <dir A> <dir B> --pytest-opts <test specification>
 ```
 
 Here, _dir A_ and _dir B_ are directories containing firecracker and jailer
@@ -198,7 +198,7 @@ branch and the `HEAD` of your current branch, run
 ```sh
 tools/devtool -y build --rev main --release
 tools/devtool -y build --rev HEAD --release
-tools/devtool -y test --no-build --ab -- run build/main build/HEAD --test integration_tests/performance/test_boottime.py::test_boottime
+tools/devtool -y test --no-build --ab -- run build/main build/HEAD --pytest-opts integration_tests/performance/test_boottime.py::test_boottime
 ```
 
 #### How to Write an A/B-Compatible Test and Common Pitfalls
@@ -213,9 +213,9 @@ dimension to match up data series between two test runs. It only matches up two
 data series with the same name if their dimensions match.
 
 Special care needs to be taken when pytest expands the argument passed to
-`tools/ab_test.py`'s `--test` option into multiple individual test cases. If two
-test cases use the same dimensions for different data series, the script will
-fail and print out the names of the violating data series. For this reason,
+`tools/ab_test.py`'s `--pytest-opts` option into multiple individual test cases.
+If two test cases use the same dimensions for different data series, the script
+will fail and print out the names of the violating data series. For this reason,
 **A/B-Compatible tests should include a `performance_test` key in their
 dimension set whose value is set to the name of the test**.
 

From 4c63a6d46a43d23f734646c0e6afe102165904f9 Mon Sep 17 00:00:00 2001
From: Patrick Roy
Date: Fri, 2 May 2025 12:26:26 +0100
Subject: [PATCH 2/3] fix(ab): only reduce dimension set for printing

By reducing the dimension set right at parsing time to eliminate all
dimensions that only ever take on a single value, we broke the ignore
list, which requires all dimensions to be present. Fix this by moving
the "dimension reduction" so that it only happens when printing
failure messages.
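
To illustrate the breakage with a minimal, self-contained sketch (the
entry contents and the is_ignored helper below are hypothetical
stand-ins for the actual ignore-list handling in ab_test.py, which
matches an IGNORED entry against the full dimension set of a data
series):

    IGNORED = [{"instance": "m6g.metal", "fio_engine": "libaio"}]

    def is_ignored(dimensions: dict) -> bool:
        # an entry matches only if it is a subset of the series' dimensions
        return any(entry.items() <= dimensions.items() for entry in IGNORED)

    # full dimension set, as emitted by a test
    full = {"instance": "m6g.metal", "fio_engine": "libaio", "vcpus": "2"}
    # reduced set: "instance" and "fio_engine" were dropped at parse time
    # because every series in the run shared the same values for them
    reduced = {"vcpus": "2"}

    assert is_ignored(full)         # matches, as intended
    assert not is_ignored(reduced)  # the entry silently stops matching
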
Fixes: fcb39a68b19c ("test(ab): do not print dimension that are the same across all metrics")
Signed-off-by: Patrick Roy
---
 tools/ab_test.py | 37 ++++++++++++++++++-------------------
 1 file changed, 18 insertions(+), 19 deletions(-)

diff --git a/tools/ab_test.py b/tools/ab_test.py
index 063f795072a..10857a35480 100755
--- a/tools/ab_test.py
+++ b/tools/ab_test.py
@@ -119,8 +119,6 @@ def load_data_series(report_path: Path, tag=None, *, reemit: bool = False):
     # Dictionary mapping EMF dimensions to A/B-testable metrics/properties
     processed_emf = {}
 
-    distinct_values_per_dimenson = defaultdict(set)
-
     report = json.loads(report_path.read_text("UTF-8"))
     for test in report["tests"]:
         for line in test["teardown"]["stdout"].splitlines():
@@ -140,9 +138,6 @@ def load_data_series(report_path: Path, tag=None, *, reemit: bool = False):
             if not dimensions:
                 continue
 
-            for dimension, value in dimensions.items():
-                distinct_values_per_dimenson[dimension].add(value)
-
             dimension_set = frozenset(dimensions.items())
 
             if dimension_set not in processed_emf:
@@ -159,24 +154,27 @@ def load_data_series(report_path: Path, tag=None, *, reemit: bool = False):
 
                 values.extend(result[metric][0])
 
-    irrelevant_dimensions = set()
+    return processed_emf
 
-    for dimension, distinct_values in distinct_values_per_dimenson.items():
-        if len(distinct_values) == 1:
-            irrelevant_dimensions.add(dimension)
 
-    post_processed_emf = {}
+def uninteresting_dimensions(processed_emf):
+    """
+    Compute the set of CloudWatch dimensions that only ever take on a
+    single value across the entire dataset.
+    """
+    values_per_dimension = defaultdict(set)
+
+    for dimension_set in processed_emf:
+        for dimension, value in dimension_set:
+            values_per_dimension[dimension].add(value)
 
-    for dimension_set, metrics in processed_emf.items():
-        processed_key = frozenset(
-            (dim, value)
-            for (dim, value) in dimension_set
-            if dim not in irrelevant_dimensions
-        )
+    uninteresting = set()
 
-        post_processed_emf[processed_key] = metrics
+    for dimension, distinct_values in values_per_dimension.items():
+        if len(distinct_values) == 1:
+            uninteresting.add(dimension)
 
-    return post_processed_emf
+    return uninteresting
 
 
 def collect_data(binary_dir: Path, pytest_opts: str):
@@ -304,6 +302,7 @@ def analyze_data(
     )
 
     messages = []
+    do_not_print_list = uninteresting_dimensions(processed_emf_a)
     for dimension_set, metric, result, unit in failures:
         # Sanity check as described above
         if abs(statistics.mean(relative_changes_by_metric[metric])) <= noise_threshold:
@@ -325,7 +324,7 @@ def analyze_data(
             f"for metric \033[1m{metric}\033[0m with \033[0;31m\033[1mp={result.pvalue}\033[0m. "
             f"This means that observing a change of this magnitude or worse, assuming that performance "
             f"characteristics did not change across the tested commits, has a probability of {result.pvalue:.2%}. "
-            f"Tested Dimensions:\n{json.dumps(dict(dimension_set), indent=2, sort_keys=True)}"
+            f"Tested Dimensions:\n{json.dumps({k: v for k, v in dimension_set if k not in do_not_print_list}, indent=2, sort_keys=True)}"
         )
 
         messages.append(msg)

From 5ea08b60fe9eb1fa8e7ea8276c98f43ae0713a68 Mon Sep 17 00:00:00 2001
From: Patrick Roy
Date: Fri, 2 May 2025 12:29:16 +0100
Subject: [PATCH 3/3] ab: ignore some block throughput metrics on m8g

Block throughput metrics on m8g.metal instances are volatile for test
scenarios that use the async fio engine (libaio) and more than 1 vcpu,
so exclude them from A/B-testing.
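
For illustration, a data series with a dimension set like the
following is now skipped entirely, regardless of which block metric it
carries, because it contains all key/value pairs of one of the new
entries (all names except the three keys from the entries are made-up
examples):

    dims = {
        "performance_test": "test_block_performance",  # assumed test name
        "instance": "m8g.metal-24xl",
        "fio_engine": "libaio",
        "vcpus": 2,
        "metric": "throughput",
    }
    # superset of {"fio_engine": "libaio", "vcpus": 2,
    #              "instance": "m8g.metal-24xl"} => ignored
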
Suggested-by: Riccardo Mancini
Signed-off-by: Patrick Roy
---
 tools/ab_test.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tools/ab_test.py b/tools/ab_test.py
index 10857a35480..2ad3037e86d 100755
--- a/tools/ab_test.py
+++ b/tools/ab_test.py
@@ -51,6 +51,9 @@
     {"fio_engine": "libaio", "metric": "clat_write"},
     # boot time metrics
     {"performance_test": "test_boottime", "metric": "resume_time"},
+    # block throughput on m8g
+    {"fio_engine": "libaio", "vcpus": 2, "instance": "m8g.metal-24xl"},
+    {"fio_engine": "libaio", "vcpus": 2, "instance": "m8g.metal-48xl"},
 ]