Commit 04630dd

feat: add ab_plot.py script
Add a script to create tables and plots for performance runs. It works for both
single and multiple runs and can generate pdf and html output.

Signed-off-by: Egor Lazarchuk <[email protected]>
1 parent ceeca6a commit 04630dd

File tree

tests/README.md
tools/ab_plot.py

2 files changed: 311 additions & 0 deletions

tests/README.md

Lines changed: 9 additions & 0 deletions
@@ -251,6 +251,15 @@ schedule an A/B-Test in buildkite, the `REVISION_A` and `REVISION_B` environment
variables need to be set in the "Environment Variables" field under "Options" in
buildkite's "New Build" modal.

### A/B visualization

To create visualizations of A/B runs, use the `tools/ab_plot.py` script. Example
usage:

```sh
./tools/ab_plot.py a_path b_path --output_type pdf
```
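The script can also emit per-test HTML tables instead of PDFs; a sketch with the same placeholder paths:

```sh
./tools/ab_plot.py a_path b_path --output_type table
```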
### Beyond commit comparisons

While our automated A/B-Testing suite only supports A/B-Tests across commit

tools/ab_plot.py

Lines changed: 302 additions & 0 deletions
@@ -0,0 +1,302 @@
#!/usr/bin/env python3
# Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
"""
Script for creating visualizations for A/B runs.

Usage:
    ab_plot.py path_to_run_a path_to_run_b path_to_run_c ... --output_type pdf/table
"""

import argparse
import glob
import json
import time
from pathlib import Path
from typing import Callable, List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages

pd.set_option("display.float_format", "{:.2f}".format)


def check_regression(
    a_samples: List[float],
    b_samples: List[float],
    statistic: Callable = np.mean,
    *,
    n_resamples=9999,
):
    """
    Check whether two sample groups have a statistically significant difference.

    Returns a (pvalue, relative difference, absolute difference) tuple, where
    the differences are computed as statistic(B) - statistic(A).
    """
    result = scipy.stats.permutation_test(
        (a_samples, b_samples),
        lambda x, y: statistic(y) - statistic(x),
        vectorized=False,
        n_resamples=n_resamples,
    )
    statistic_a = statistic(a_samples)

    return result.pvalue, result.statistic / statistic_a, result.statistic
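
# Illustrative sketch (not part of the committed script): with B roughly 10%
# above A, the permutation test should report a small p-value and a relative
# difference close to +0.10:
#
#   a = [100.0, 101.0, 99.5, 100.5, 100.2]
#   b = [110.0, 111.5, 109.0, 110.8, 110.2]
#   pvalue, rel, diff = check_regression(a, b, n_resamples=999)
#   # -> pvalue well below 0.05, rel ~ +0.10, diff ~ +10.0
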
def load_data(data_path: Path):
    """
    Recursively collect `metrics.json` files from the provided path
    """
    data = []
    for name in glob.glob(f"{data_path}/**/metrics.json", recursive=True):
        with open(name, encoding="utf-8") as f:
            j = json.load(f)

        if "performance_test" not in j["dimensions"]:
            print(f"skipping: {name}")
            continue

        metrics = j["metrics"]
        perf_test = j["dimensions"]["performance_test"]
        del j["dimensions"]["performance_test"]
        # Use a hashable, order-independent key for the remaining dimensions
        dimensions = frozenset(j["dimensions"].items())

        for m in metrics:
            if "cpu_utilization" in m:
                continue
            mm = metrics[m]
            unit = mm["unit"]
            values = mm["values"]
            for i, v in enumerate(values):
                data.append(
                    {
                        "index": i,
                        "test": perf_test,
                        "metric": m,
                        "value": v,
                        "unit": unit,
                        "dimensions": dimensions,
                    }
                )

    return data
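
# Assumed shape of each metrics.json, reconstructed from the parsing in
# load_data() above (the metric name and unit below are illustrative):
#
# {
#   "dimensions": {"performance_test": "test_name", "...": "..."},
#   "metrics": {
#     "some_metric": {"unit": "ms", "values": [1.0, 2.0, 3.0]}
#   }
# }
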
def p50(a):
    """Returns 50th percentile of 1d-array a"""
    return np.percentile(a, 50)


def p90(a):
    """Returns 90th percentile of 1d-array a"""
    return np.percentile(a, 90)


def create_table(df: pd.DataFrame):
    """Create an html table per test in the data frame"""

    for test_value in df["test"].unique():
        df_test = df[df["test"] == test_value]

        # Split dimensions into separate columns
        df_expanded = df_test.copy()
        dim_data = []
        for _, row in df_expanded.iterrows():
            dim_dict = dict(row["dimensions"])
            dim_data.append(dim_dict)

        # Need to reset indexes because otherwise `pd.concat` will add NaN in all
        # rows where indexes differ
        dim_df = pd.DataFrame(dim_data).reset_index(drop=True)
        df_data = df_expanded.drop("dimensions", axis=1).reset_index(drop=True)
        df_expanded = pd.concat([df_data, dim_df], axis=1)

        # Use dimension columns as index
        dim_cols = sorted(list(dim_df.columns))
        df_pivoted = df_expanded.pivot_table(
            values=["value"],
            index=["metric", "unit"] + dim_cols,
            columns="group",
            aggfunc=[p50, p90],
        )

        # Add comparison columns for each pair of groups
        groups = sorted(df_test["group"].unique())
        for baseline in groups:
            for group in groups:
                if group == baseline:
                    continue
                for stat in ["p50", "p90"]:
                    diff_col = (stat, "value", f"{baseline}->{group} %")
                    df_pivoted[diff_col] = (
                        (
                            df_pivoted[(stat, "value", group)]
                            - df_pivoted[(stat, "value", baseline)]
                        )
                        / df_pivoted[(stat, "value", baseline)]
                        * 100.0
                    )
                    diff_col = (stat, "value", f"{baseline}->{group} abs")
                    df_pivoted[diff_col] = (
                        df_pivoted[(stat, "value", group)]
                        - df_pivoted[(stat, "value", baseline)]
                    )

        # Sort columns to have a persistent table representation
        df_pivoted = df_pivoted[sorted(df_pivoted.columns)]

        test_output_path = f"{test_value}.html"
        with open(test_output_path, "w", encoding="UTF-8") as writer:
            writer.write("<br>")
            styled = df_pivoted.style.format(precision=2)
            styled = styled.set_table_attributes("border=1")
            styled = styled.set_table_styles(
                [{"selector": 'th:contains("->")', "props": [("min-width", "80px")]}]
            )

            # Apply color gradient to all comparison columns
            for baseline in groups:
                for group in groups:
                    if group == baseline:
                        continue
                    for stat in ["p50", "p90"]:
                        diff_col = (stat, "value", f"{baseline}->{group} %")
                        styled = styled.background_gradient(
                            subset=[diff_col], cmap="RdYlGn"
                        )

            writer.write(styled.to_html())
            writer.write("<br>")
        print(f"Ready: {test_output_path}")
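
# Note (not part of the committed script): the pivot above produces MultiIndex
# columns of the form (stat, "value", group), e.g. ("p50", "value", "A"), and
# the loop then appends derived comparison columns such as
# ("p50", "value", "A->B %") and ("p90", "value", "A->B abs").
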
def create_pdf(args, df: pd.DataFrame):
    """Create a pdf per test in the data frame"""

    sns.set_style("whitegrid")
    metrics = df["metric"].unique()
    n_groups = len(df["group"].unique())

    for test_value in df["test"].unique():
        test_output_path = f"{test_value}.pdf"
        with PdfPages(test_output_path) as pdf:
            df_test = df[df["test"] == test_value]
            for dim_value in df_test["dimensions"].unique():
                for metric in metrics:
                    metric_data = df_test[
                        (df_test["metric"] == metric)
                        & (df_test["dimensions"] == dim_value)
                    ]

                    if len(metric_data) == 0:
                        continue

                    additional_title = ""
                    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
                    if n_groups == 2:
                        # Check if the difference is significant
                        a_values = metric_data[metric_data["group"] == "A"][
                            "value"
                        ].values
                        b_values = metric_data[metric_data["group"] == "B"][
                            "value"
                        ].values
                        pvalue, diff_rel, diff_abs = check_regression(
                            a_values, b_values
                        )

                        # Highlight pages where p <= 0.1 and the relative
                        # difference is at least 5%; the absolute-difference
                        # threshold is currently a no-op placeholder.
                        if (
                            pvalue <= 0.1
                            and abs(diff_rel) >= 0.05
                            and abs(diff_abs) >= 0.0
                        ):
                            fig.patch.set_facecolor("lightcoral")
                            additional_title = (
                                f"{diff_rel * 100:+.2f}% ({diff_abs:+.2f}) difference"
                            )

                    # Make a multi-line title since a single line would be too long
                    dim_items = sorted(str(item) for item in dim_value)
                    dim_chunks = [
                        ", ".join(dim_items[i : i + 4])
                        for i in range(0, len(dim_items), 4)
                    ]
                    dim_str = "\n".join(dim_chunks)
                    title = f"{metric}\n{dim_str}\n{additional_title}"
                    if additional_title:
                        weight = "bold"
                    else:
                        weight = "normal"
                    fig.suptitle(title, fontsize=10, weight=weight)

                    sns.boxenplot(data=metric_data, x="group", y="value", ax=ax1)
                    ax1.set_ylabel(f"{metric} ({metric_data['unit'].iloc[0]})")

                    metric_data_indexed = metric_data.reset_index()
                    errorbar = (args.errorbar[0], int(args.errorbar[1]))
                    sns.lineplot(
                        data=metric_data_indexed,
                        x="index",
                        y="value",
                        hue="group",
                        ax=ax2,
                        errorbar=errorbar,
                    )
                    ax2.set_ylabel(f"{metric} ({metric_data['unit'].iloc[0]})")

                    plt.tight_layout()
                    pdf.savefig()
                    plt.close()
        print(f"Ready: {test_output_path}")
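
# Illustrative ad-hoc use (hypothetical, not part of the committed script):
# create_pdf() only reads `args.errorbar`, so it can be driven without the CLI:
#
#   create_pdf(argparse.Namespace(errorbar=["pi", "95"]), df_combined)
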
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Creates tables and plots for A/B performance runs"
    )
    parser.add_argument(
        "paths",
        nargs="+",
        help="Paths to directories with test runs",
        type=Path,
    )
    parser.add_argument(
        "--errorbar",
        nargs=2,
        default=["pi", "95"],
        help="Errorbar configuration for lineplot (type, value)",
    )
    parser.add_argument(
        "--output_type",
        default="pdf",
        choices=["pdf", "table"],
        help="Type of the output to generate",
    )
    args = parser.parse_args()

    # Data retrieval
    start_time = time.time()
    all_data = []
    for i, path in enumerate(args.paths):
        data = load_data(path)
        print(f"getting data {i} from {path}: {len(data)}")
        df = pd.DataFrame(data)
        df["group"] = chr(65 + i)  # A, B, C, D, ...
        all_data.append(df)
    print(f"Data retrieval: {time.time() - start_time:.2f}s")

    # Data processing
    start_time = time.time()
    df_combined = pd.concat(all_data, ignore_index=True)
    print(f"Data processing: {time.time() - start_time:.2f}s")

    # Plotting
    start_time = time.time()
    if args.output_type == "pdf":
        create_pdf(args, df_combined)
    elif args.output_type == "table":
        create_table(df_combined)

    print(f"Plotting: {time.time() - start_time:.2f}s")
