Commit d2939ef

feat: store test results for A/B runs
Currently, when an A/B test is run, only the results of the B run end up in the `test_results` dir: the dir is shared by both runs, so the second run overwrites the data of the first. Now each run stores its results in a separate dir.

Signed-off-by: Egor Lazarchuk <[email protected]>
1 parent a619613 commit d2939ef

2 files changed (+15, -11 lines)


tests/framework/ab_test.py

Lines changed: 3 additions & 3 deletions
@@ -103,7 +103,7 @@ def git_ab_test(
 
 
 def binary_ab_test(
-    test_runner: Callable[[Path, bool], T],
+    test_runner: Callable[[str, Path, bool], T],
     comparator: Callable[[T, T], U] = default_comparator,
     *,
     a_directory: Path = DEFAULT_A_DIRECTORY,
@@ -113,8 +113,8 @@ def binary_ab_test(
     Similar to `git_ab_test`, but instead of locally checking out different revisions, it operates on
     directories containing firecracker/jailer binaries
     """
-    result_a = test_runner(a_directory, True)
-    result_b = test_runner(b_directory, False)
+    result_a = test_runner("A", a_directory, True)
+    result_b = test_runner("B", b_directory, False)
 
     return result_a, result_b, comparator(result_a, result_b)
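
For reference, any runner passed to `binary_ab_test` now receives the A/B tag as its first argument. A minimal sketch of the new calling convention (the runner below is a hypothetical example, not part of the commit):

from pathlib import Path

# Hypothetical runner matching the new (tag, binary_dir, is_a) signature.
def my_runner(tag: str, binary_dir: Path, is_a: bool) -> str:
    # `tag` is "A" or "B", so per-run output can be kept under test_results/{tag}
    return f"{tag}: ran binaries from {binary_dir} (A side: {is_a})"

# binary_ab_test calls test_runner("A", a_directory, True) first,
# then test_runner("B", b_directory, False), and compares the two results.
print(my_runner("A", Path("/tmp/a"), True))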

tools/ab_test.py

Lines changed: 12 additions & 8 deletions
@@ -19,6 +19,7 @@
 between the two runs, performing statistical regression test across all the list-
 valued properties collected.
 """
+
 import argparse
 import json
 import os
@@ -180,13 +181,17 @@ def uninteresting_dimensions(processed_emf):
     return uninteresting
 
 
-def collect_data(binary_dir: Path, pytest_opts: str):
-    """Executes the specified test using the provided firecracker binaries"""
+def collect_data(tag: str, binary_dir: Path, pytest_opts: str):
+    """
+    Executes the specified test using the provided firecracker binaries and
+    stores results into the `test_results/tag` directory
+    """
     binary_dir = binary_dir.resolve()
 
     print(f"Collecting samples with {binary_dir}")
+    test_report_path = f"test_results/{tag}/test-report.json"
     subprocess.run(
-        f"./tools/test.sh --binary-dir={binary_dir} {pytest_opts} -m ''",
+        f"./tools/test.sh --binary-dir={binary_dir} {pytest_opts} -m '' --json-report-file=../{test_report_path}",
         env=os.environ
         | {
             "AWS_EMF_ENVIRONMENT": "local",
@@ -195,9 +200,8 @@ def collect_data(binary_dir: Path, pytest_opts: str):
         check=True,
         shell=True,
     )
-    return load_data_series(
-        Path("test_results/test-report.json"), binary_dir, reemit=True
-    )
+
+    return load_data_series(Path(test_report_path), binary_dir, reemit=True)
 
 
 def analyze_data(
@@ -327,7 +331,7 @@ def analyze_data(
             f"for metric \033[1m{metric}\033[0m with \033[0;31m\033[1mp={result.pvalue}\033[0m. "
             f"This means that observing a change of this magnitude or worse, assuming that performance "
             f"characteristics did not change across the tested commits, has a probability of {result.pvalue:.2%}. "
-            f"Tested Dimensions:\n{json.dumps({k: v for k,v in dimension_set if k not in do_not_print_list}, indent=2, sort_keys=True)}"
+            f"Tested Dimensions:\n{json.dumps({k: v for k, v in dimension_set if k not in do_not_print_list}, indent=2, sort_keys=True)}"
         )
         messages.append(msg)
 
@@ -346,7 +350,7 @@ def ab_performance_test(
     """Does an A/B-test of the specified test with the given firecracker/jailer binaries"""
 
     return binary_ab_test(
-        lambda bin_dir, _: collect_data(bin_dir, pytest_opts),
+        lambda tag, bin_dir, _: collect_data(tag, bin_dir, pytest_opts),
         lambda ah, be: analyze_data(
             ah,
             be,
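
Taken together, the tag is threaded from `binary_ab_test` into `collect_data`, so each side of an A/B run writes its own pytest report. A self-contained sketch of the resulting flow (simplified stand-ins, not the real framework code):

from pathlib import Path
from typing import Callable, TypeVar

T = TypeVar("T")

def run_both(test_runner: Callable[[str, Path, bool], T]) -> tuple:
    # Mirrors binary_ab_test: the tag is passed through to the runner.
    result_a = test_runner("A", Path("build/A"), True)
    result_b = test_runner("B", Path("build/B"), False)
    return result_a, result_b

def fake_collect_data(tag: str, binary_dir: Path) -> str:
    # Mirrors collect_data: each tag gets its own report path, so the
    # B run no longer overwrites the A run's report.
    return f"test_results/{tag}/test-report.json"

print(run_both(lambda tag, bin_dir, _: fake_collect_data(tag, bin_dir)))
# -> ('test_results/A/test-report.json', 'test_results/B/test-report.json')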
