test: ab: operate on directories instead of commit SHAs

roypat · roypat · commit 5122c4d7da8a · 2024-11-04T09:50:06.000Z
The tools/ab_test.py script relies on precompiled binaries existing in
well-defined locations, and refuses to compile firecracker itself if
they're missing. It derives these locations from commit SHAs, so
conceptually, it makes more sense to cut out this middle step and just
directly pass in directories instead of SHAs (with the expectation that
the directories contain firecracker and jailer binaries).

However, the commit SHAs were still used to print the commit ranges over
which A/B-tests were done. It turns out that this was not working in the
vast majority of cases though, as the commit log printing logic did not
contain all the resolution logic that goes into compilation step (e.g.
for parsing revisions and such). So just remove that part.

In the EMF metrics, we now tag each report with the directory path
instead of the commit SHA. For the post-merge runs, this makes no
difference, as the SHA is part of the path.

Signed-off-by: Patrick Roy &lt;roypat@amazon.co.uk&gt;
diff --git a/.buildkite/pipeline_perf.py b/.buildkite/pipeline_perf.py
@@ -98,7 +98,7 @@
     pytest_opts = ""
     if REVISION_A:
         devtool_opts += " --ab"
-        pytest_opts = f"{ab_opts} run {REVISION_A} {REVISION_B} --test {test_path}"
+        pytest_opts = f"{ab_opts} run build/{REVISION_A}/build/cargo_target/$(uname -m)-unknown-linux-musl/release build/{REVISION_B}/build/cargo_target/$(uname -m)-unknown-linux-musl/release --test {test_path}"
     else:
         # Passing `-m ''` below instructs pytest to collect tests regardless of
         # their markers (e.g. it will collect both tests marked as nonci, and
diff --git a/tools/ab_test.py b/tools/ab_test.py
@@ -32,10 +32,8 @@
 sys.path.append(str(Path(__file__).parent.parent / "tests"))
 
 # pylint:disable=wrong-import-position
-from framework import utils
-from framework.ab_test import check_regression, git_ab_test
+from framework.ab_test import binary_ab_test, check_regression
 from framework.properties import global_props
-from host_tools.cargo_build import get_binary
 from host_tools.metrics import (
     emit_raw_emf,
     format_with_reduced_unit,
@@ -107,7 +105,7 @@ def find_unit(emf: dict, metric: str):
     return metrics.get(metric, "None")
 
 
-def load_data_series(report_path: Path, revision: str = None, *, reemit: bool = False):
+def load_data_series(report_path: Path, tag=None, *, reemit: bool = False):
     """Loads the data series relevant for A/B-testing from test_results/test-report.json
     into a dictionary mapping each message's cloudwatch dimensions to a dictionary of
     its list-valued properties/metrics.
@@ -126,10 +124,9 @@ def load_data_series(report_path: Path, revision: str = None, *, reemit: bool =
                 emf = json.loads(line)
 
                 if reemit:
-                    assert revision is not None
+                    assert tag is not None
 
-                    # These will show up in Cloudwatch, so canonicalize to long commit SHAs
-                    emf["git_commit_id"] = canonicalize_revision(revision)
+                    emf["git_commit_id"] = str(tag)
                     emit_raw_emf(emf)
 
                 dimensions, result = process_log_entry(emf)
@@ -159,7 +156,6 @@ def load_data_series(report_path: Path, revision: str = None, *, reemit: bool =
 def collect_data(binary_dir: Path, tests: list[str]):
     """Executes the specified test using the provided firecracker binaries"""
     # Example binary_dir: ../build/main/build/cargo_target/x86_64-unknown-linux-musl/release
-    revision = binary_dir.parents[3].name
 
     print(f"Collecting samples with {binary_dir}")
     subprocess.run(
@@ -172,7 +168,7 @@ def collect_data(binary_dir: Path, tests: list[str]):
         check=True,
     )
     return load_data_series(
-        Path("test_results/test-report.json"), revision, reemit=True
+        Path("test_results/test-report.json"), binary_dir, reemit=True
     )
 
 
@@ -311,23 +307,17 @@ def analyze_data(
 
 
 def ab_performance_test(
-    a_revision, b_revision, tests, p_thresh, strength_abs_thresh, noise_threshold
+    a_revision: Path,
+    b_revision: Path,
+    tests,
+    p_thresh,
+    strength_abs_thresh,
+    noise_threshold,
 ):
-    """Does an A/B-test of the specified test across the given revisions"""
-    _, commit_list, _ = utils.check_output(
-        f"git --no-pager log --oneline {a_revision}..{b_revision}"
-    )
-    print(
-        f"Performance A/B-test across {a_revision}..{b_revision}. This includes the following commits:"
-    )
-    print(commit_list.strip())
+    """Does an A/B-test of the specified test with the given firecracker/jailer binaries"""
 
-    def test_runner(workspace, _is_ab: bool):
-        bin_dir = get_binary("firecracker", workspace_dir=workspace).parent
-        return collect_data(bin_dir, tests)
-
-    return git_ab_test(
-        test_runner,
+    return binary_ab_test(
+        lambda bin_dir, _: collect_data(bin_dir, tests),
         lambda ah, be: analyze_data(
             ah,
             be,
@@ -336,16 +326,11 @@ def test_runner(workspace, _is_ab: bool):
             noise_threshold,
             n_resamples=int(100 / p_thresh),
         ),
-        a_revision=a_revision,
-        b_revision=b_revision,
+        a_directory=a_revision,
+        b_directory=b_revision,
     )
 
 
-def canonicalize_revision(revision):
-    """Canonicalizes the given revision to a 40 digit hex SHA"""
-    return utils.check_output(f"git rev-parse {revision}").stdout.strip()
-
-
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
         description="Executes Firecracker's A/B testsuite across the specified commits"
@@ -357,11 +342,13 @@ def canonicalize_revision(revision):
     )
     run_parser.add_argument(
         "a_revision",
-        help="The baseline revision compared to which we want to avoid regressing",
+        help="Directory containing firecracker and jailer binaries to be considered the performance baseline",
+        type=Path,
     )
     run_parser.add_argument(
         "b_revision",
-        help="The revision whose performance we want to compare against the results from a_revision",
+        help="Directory containing firecracker and jailer binaries whose performance we want to compare against the results from a_revision",
+        type=Path,
     )
     run_parser.add_argument("--test", help="The test to run", nargs="+", required=True)
     analyze_parser = subparsers.add_parser(