Skip to content

Commit cac5e79

Browse files
committed
feat: add ab_plot.py script
Add a script to create visualizations of performance data. - works for both single and multiple inputs - allows comparison of different hosts - allows comparison of multiple tests - can generate pdf and html outputs Signed-off-by: Egor Lazarchuk <[email protected]>
1 parent fa2627a commit cac5e79

File tree

2 files changed

+319
-0
lines changed

2 files changed

+319
-0
lines changed

tests/README.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -251,6 +251,15 @@ schedule an A/B-Test in buildkite, the `REVISION_A` and `REVISION_B` environment
251251
variables need to be set in the "Environment Variables" field under "Options" in
252252
buildkite's "New Build" modal.
253253

254+
### A/B visualization
255+
256+
To create a visualization of A/B runs, use the `tools/ab_plot.py` script. Example
257+
usage:
258+
259+
```sh
260+
./tools/ab_plot.py a_path b_path --output_type pdf
261+
```
262+
254263
### Beyond commit comparisons
255264

256265
While our automated A/B-Testing suite only supports A/B-Tests across commit

tools/ab_plot.py

Lines changed: 310 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,310 @@
1+
#!/usr/bin/env python3
2+
# Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3+
# SPDX-License-Identifier: Apache-2.0
4+
"""
5+
Script for creating visualizations for A/B runs.
6+
7+
Usage:
8+
ab_plot.py path_to_run_a path_to_run_b path_to_run_c ... --output_type pdf/table
9+
"""
10+
11+
import argparse
12+
import glob
13+
import json
14+
import time
15+
from pathlib import Path
16+
from typing import Callable, List
17+
18+
import matplotlib.pyplot as plt
19+
import numpy as np
20+
import pandas as pd
21+
import scipy
22+
import seaborn as sns
23+
from matplotlib.backends.backend_pdf import PdfPages
24+
25+
pd.set_option("display.float_format", "{:.2f}".format)
26+
27+
28+
def check_regression(
    a_samples: List[float],
    b_samples: List[float],
    statistic: Callable = np.mean,
    *,
    n_resamples=9999,
):
    """
    Run a permutation test on the difference of `statistic` between two
    sample groups.

    Returns a `(pvalue, relative_difference, absolute_difference)` tuple where
    the absolute difference is `statistic(b_samples) - statistic(a_samples)`
    and the relative difference is that value divided by
    `statistic(a_samples)`.
    """

    def difference(x, y):
        # Positive when the second group scores higher than the first one
        return statistic(y) - statistic(x)

    outcome = scipy.stats.permutation_test(
        (a_samples, b_samples),
        difference,
        vectorized=False,
        n_resamples=n_resamples,
    )
    baseline = statistic(a_samples)
    absolute = outcome.statistic
    relative = absolute / baseline

    return outcome.pvalue, relative, absolute
47+
48+
49+
def load_data(data_path: Path):
    """
    Recursively collect `metrics.json` files under `data_path` and flatten
    them into a list of per-sample records.

    Files without a `performance_test` dimension are skipped (a message is
    printed for each of them). Metrics whose name contains `cpu_utilization`
    are ignored.

    Every record is a dict with the keys:
      - `index`: position of the sample inside the metric's `values` list
      - `test`: value of the `performance_test` dimension
      - `metric`: metric name
      - `value`: the sample itself
      - `unit`: the metric's `unit`
      - `dimensions`: frozenset of the remaining `(name, value)` dimension
        pairs, with the test name and host specific entries removed
    """
    # The test name is stored in its own column. The other entries are host
    # specific and would prevent comparison of different hosts.
    excluded = {"performance_test", "instance", "cpu_model", "host_kernel"}

    data = []
    # Sorted so that the output does not depend on file system ordering
    for name in sorted(glob.glob(f"{data_path}/**/metrics.json", recursive=True)):
        with open(name, encoding="utf-8") as f:
            j = json.load(f)

        all_dimensions = j.get("dimensions", {})
        if "performance_test" not in all_dimensions:
            print(f"skipping: {name}")
            continue

        perf_test = all_dimensions["performance_test"]
        # Filter instead of `del`: not every file is guaranteed to carry all
        # of the excluded keys, and a missing one must not abort the load
        dimensions = frozenset(
            (key, value)
            for key, value in all_dimensions.items()
            if key not in excluded
        )

        for metric_name, metric in j["metrics"].items():
            if "cpu_utilization" in metric_name:
                continue
            unit = metric["unit"]
            data.extend(
                {
                    "index": i,
                    "test": perf_test,
                    "metric": metric_name,
                    "value": v,
                    "unit": unit,
                    "dimensions": dimensions,
                }
                for i, v in enumerate(metric["values"])
            )

    return data
94+
95+
96+
def p50(a):
    """Median (50th percentile) of the 1d-array `a`"""
    median = np.percentile(a, q=50)
    return median
99+
100+
101+
def p90(a):
    """90th percentile of the 1d-array `a`"""
    upper = np.percentile(a, q=90)
    return upper
104+
105+
106+
def create_table(df: pd.DataFrame):
    """
    Write one html table per test found in the data frame.

    For every test the samples are pivoted into p50/p90 aggregates with one
    row per (metric, unit, dimensions...) combination and one column per
    group. Relative (`%`) and absolute (`abs`) differences are added for every
    ordered pair of groups and the result is saved as `<test name>.html`.
    """

    stats = ["p50", "p90"]

    for test_value in df["test"].unique():
        df_test = df[df["test"] == test_value]

        # Turn every `dimensions` frozenset into a dict so that each
        # dimension ends up in a column of its own
        dim_df = pd.DataFrame([dict(dims) for dims in df_test["dimensions"]])

        # Both halves get a fresh 0..n-1 index, otherwise `pd.concat` would
        # align on the old labels and put NaN in rows where they differ
        dim_df = dim_df.reset_index(drop=True)
        df_data = df_test.drop("dimensions", axis=1).reset_index(drop=True)
        df_expanded = pd.concat([df_data, dim_df], axis=1)

        # Rows are keyed by metric, unit and every dimension column
        dim_cols = sorted(dim_df.columns)
        df_pivoted = df_expanded.pivot_table(
            values=["value"],
            index=["metric", "unit"] + dim_cols,
            columns="group",
            aggfunc=[p50, p90],
        )

        # Every ordered (baseline, group) pair of distinct groups gets its
        # own set of comparison columns
        groups = sorted(df_test["group"].unique())
        pairs = [
            (baseline, group)
            for baseline in groups
            for group in groups
            if group != baseline
        ]
        for baseline, group in pairs:
            for stat in stats:
                base_col = (stat, "value", baseline)
                group_col = (stat, "value", group)

                pct_col = (stat, "value", f"{baseline}->{group} %")
                df_pivoted[pct_col] = (
                    (df_pivoted[group_col] - df_pivoted[base_col])
                    / df_pivoted[base_col]
                    * 100.0
                )
                abs_col = (stat, "value", f"{baseline}->{group} abs")
                df_pivoted[abs_col] = df_pivoted[group_col] - df_pivoted[base_col]

        # Sort columns to have a persistent table representation
        df_pivoted = df_pivoted[sorted(df_pivoted.columns)]

        test_output_path = f"{test_value}.html"
        with open(test_output_path, "w", encoding="UTF-8") as writer:
            writer.write("<br>")

            # NOTE(review): `:contains` is not a standard CSS selector, so
            # browsers will most likely ignore this min-width rule - confirm
            styled = (
                df_pivoted.style.format(precision=2)
                .set_table_attributes("border=1")
                .set_table_styles(
                    [{"selector": 'th:contains("->")', "props": [("min-width", "80px")]}]
                )
            )

            # Color gradient on every relative comparison column
            for baseline, group in pairs:
                for stat in stats:
                    pct_col = (stat, "value", f"{baseline}->{group} %")
                    styled = styled.background_gradient(
                        subset=[pct_col], cmap="RdYlGn"
                    )

            writer.write(styled.to_html())
            writer.write("<br>")
        print(f"Ready: {test_output_path}")
182+
183+
184+
def _format_dimensions(dim_value, per_line=4):
    """Render dimension items as sorted strings, `per_line` items per line"""
    dim_items = sorted(str(item) for item in dim_value)
    dim_chunks = [
        ", ".join(dim_items[i : i + per_line])
        for i in range(0, len(dim_items), per_line)
    ]
    return "\n".join(dim_chunks)


def _describe_difference(metric_data, baseline, candidate):
    """
    Return a title fragment describing the `baseline` -> `candidate`
    difference if it is considered significant, otherwise an empty string
    """
    # Thresholds for reporting a difference. The absolute threshold of 0.0
    # never filters anything out.
    max_pvalue = 0.1
    min_rel_diff = 0.05
    min_abs_diff = 0.0

    a_values = metric_data[metric_data["group"] == baseline]["value"].values
    b_values = metric_data[metric_data["group"] == candidate]["value"].values
    # A metric/dimension combination may exist in only one of the runs. There
    # is nothing to compare then, and the permutation test cannot work with
    # an empty sample.
    if len(a_values) == 0 or len(b_values) == 0:
        return ""

    pvalue, diff_rel, diff_abs = check_regression(a_values, b_values)
    if (
        pvalue <= max_pvalue
        and abs(diff_rel) >= min_rel_diff
        and abs(diff_abs) >= min_abs_diff
    ):
        return f"{diff_rel * 100:+.2f}% ({diff_abs:+.2f}) difference"
    return ""


def create_pdf(args, df: pd.DataFrame):
    """
    Create a `<test name>.pdf` per test in the data frame.

    Every page covers one (dimensions, metric) combination and shows a boxen
    plot of the values per group next to a line plot of the values over the
    sample index. When the data contains exactly two groups, the difference
    between them is checked with `check_regression`; pages with a significant
    difference get a colored background and a bold title with the difference.

    `args.errorbar` must be a `(type, value)` pair; it is passed to the
    seaborn line plot as its `errorbar` configuration.
    """

    sns.set_style("whitegrid")
    metrics = df["metric"].unique()
    groups = sorted(df["group"].unique())
    # Same for every page, so build it once. `float` also admits fractional
    # values such as `sd 1.5`.
    errorbar = (args.errorbar[0], float(args.errorbar[1]))

    for test_value in df["test"].unique():
        test_output_path = f"{test_value}.pdf"
        with PdfPages(test_output_path) as pdf:
            df_test = df[df["test"] == test_value]
            for dim_value in df_test["dimensions"].unique():
                for metric in metrics:
                    metric_data = df_test[
                        (df_test["metric"] == metric)
                        & (df_test["dimensions"] == dim_value)
                    ]

                    if len(metric_data) == 0:
                        continue

                    additional_title = ""
                    if len(groups) == 2:
                        additional_title = _describe_difference(
                            metric_data, groups[0], groups[1]
                        )

                    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
                    if additional_title:
                        fig.patch.set_facecolor("lightcoral")

                    # Make a multi-line title since single line will be too long
                    dim_str = _format_dimensions(dim_value)
                    title = f"{metric}\n{dim_str}\n{additional_title}"
                    weight = "bold" if additional_title else "normal"
                    fig.suptitle(title, fontsize=10, weight=weight)

                    y_label = f"{metric} ({metric_data['unit'].iloc[0]})"

                    sns.boxenplot(data=metric_data, x="group", y="value", ax=ax1)
                    ax1.set_ylabel(y_label)

                    sns.lineplot(
                        data=metric_data.reset_index(drop=True),
                        x="index",
                        y="value",
                        hue="group",
                        ax=ax2,
                        errorbar=errorbar,
                    )
                    ax2.set_ylabel(y_label)

                    # Operate on this page's figure explicitly rather than on
                    # whatever pyplot considers the current one
                    fig.tight_layout()
                    pdf.savefig(fig)
                    plt.close(fig)
        print(f"Ready: {test_output_path}")
262+
263+
264+
def parse_args(argv=None):
    """
    Parse command line arguments.

    `argv` is the list of arguments to parse; when it is None argparse falls
    back to `sys.argv[1:]`.
    """
    parser = argparse.ArgumentParser(
        description="Creates visualizations (pdf plots or html tables) of A/B test runs"
    )
    parser.add_argument(
        "paths",
        nargs="+",
        help="Paths to directories with test runs",
        type=Path,
    )
    parser.add_argument(
        "--errorbar",
        nargs=2,
        default=["pi", "95"],
        help="Errorbar configuration for lineplot (type, value)",
    )
    parser.add_argument(
        "--output_type",
        # The default has to be a plain string: it is compared against
        # "pdf"/"table" below, and a list would never match either of them
        choices=["pdf", "table"],
        default="pdf",
        help="Type of the output to generate",
    )
    return parser.parse_args(argv)


def main(argv=None):
    """Load all runs, combine them and generate the requested output"""
    args = parse_args(argv)

    # Data retrieval
    start_time = time.time()
    all_data = []
    for i, path in enumerate(args.paths):
        data = load_data(path)
        print(f"getting data {i} from {path}: {len(data)}")
        df = pd.DataFrame(data)
        df["group"] = chr(65 + i)  # A, B, C, D, ...
        all_data.append(df)
    print(f"Data retrieval: {time.time() - start_time:.2f}s")

    # Data processing
    start_time = time.time()
    df_combined = pd.concat(all_data, ignore_index=True)
    print(f"Data processing: {time.time() - start_time:.2f}s")

    # Without any samples the frame has no `test`/`metric` columns and the
    # plotting code would fail with a KeyError
    if df_combined.empty:
        raise SystemExit("No performance metrics found in the provided paths")

    # Plotting
    start_time = time.time()
    if args.output_type == "pdf":
        create_pdf(args, df_combined)
    if args.output_type == "table":
        create_table(df_combined)

    print(f"Plotting: {time.time() - start_time:.2f}s")


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)