tests/README.md (10 changes: 5 additions & 5 deletions)
@@ -184,7 +184,7 @@ function to [`.buildkite/pipeline_perf.py`](../.buildkite/pipeline_perf.py). To
 manually run an A/B-Test, use
 
 ```sh
-tools/devtool -y test --ab [optional arguments to ab_test.py] run <dir A> <dir B> --test <test specification>
+tools/devtool -y test --ab [optional arguments to ab_test.py] run <dir A> <dir B> --pytest-opts <test specification>
 ```
 
 Here, _dir A_ and _dir B_ are directories containing firecracker and jailer
@@ -198,7 +198,7 @@ branch and the `HEAD` of your current branch, run
 ```sh
 tools/devtool -y build --rev main --release
 tools/devtool -y build --rev HEAD --release
-tools/devtool -y test --no-build --ab -- run build/main build/HEAD --test integration_tests/performance/test_boottime.py::test_boottime
+tools/devtool -y test --no-build --ab -- run build/main build/HEAD --pytest-opts integration_tests/performance/test_boottime.py::test_boottime
 ```
 
 #### How to Write an A/B-Compatible Test and Common Pitfalls
@@ -213,9 +213,9 @@ dimension to match up data series between two test runs. It only matches up two
 data series with the same name if their dimensions match.
 
 Special care needs to be taken when pytest expands the argument passed to
-`tools/ab_test.py`'s `--test` option into multiple individual test cases. If two
-test cases use the same dimensions for different data series, the script will
-fail and print out the names of the violating data series. For this reason,
+`tools/ab_test.py`'s `--pytest-opts` option into multiple individual test cases.
+If two test cases use the same dimensions for different data series, the script
+will fail and print out the names of the violating data series. For this reason,
 **A/B-Compatible tests should include a `performance_test` key in their
 dimension set whose value is set to the name of the test**.
 
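To illustrate the pitfall the README change above describes, here is a minimal, self-contained sketch of how `ab_test.py` keys data series by their dimension set. The helper and the dimension names and values below are hypothetical, made up for illustration rather than taken from the Firecracker test suite; only the keying scheme (a frozenset of dimension/value pairs, as in `load_data_series`) reflects the real script.

```python
# Hypothetical sketch: mirrors the keying scheme visible in load_data_series(),
# where each data series is identified by a frozenset of (dimension, value) pairs.
from collections import defaultdict


def index_series(samples):
    """Group (dimensions, metric, value) samples by their dimension set and metric."""
    series = defaultdict(list)
    for dimensions, metric, value in samples:
        series[(frozenset(dimensions.items()), metric)].append(value)
    return series


# Two different test cases emitting the same dimensions collapse into one series ...
ambiguous = index_series(
    [
        ({"instance": "m6i.metal", "vcpus": 2}, "duration", 1.0),  # e.g. a boot-time test
        ({"instance": "m6i.metal", "vcpus": 2}, "duration", 9.0),  # e.g. a snapshot test
    ]
)
assert len(ambiguous) == 1  # the situation ab_test.py refuses to process

# ... while a performance_test dimension keeps them apart.
distinct = index_series(
    [
        ({"performance_test": "test_a", "instance": "m6i.metal", "vcpus": 2}, "duration", 1.0),
        ({"performance_test": "test_b", "instance": "m6i.metal", "vcpus": 2}, "duration", 9.0),
    ]
)
assert len(distinct) == 2
```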
tools/ab_test.py (40 changes: 21 additions & 19 deletions)
@@ -51,6 +51,9 @@
     {"fio_engine": "libaio", "metric": "clat_write"},
     # boot time metrics
     {"performance_test": "test_boottime", "metric": "resume_time"},
+    # block throughput on m8g
+    {"fio_engine": "libaio", "vcpus": 2, "instance": "m8g.metal-24xl"},
+    {"fio_engine": "libaio", "vcpus": 2, "instance": "m8g.metal-48xl"},
 ]
 
 
@@ -119,8 +122,6 @@ def load_data_series(report_path: Path, tag=None, *, reemit: bool = False):
     # Dictionary mapping EMF dimensions to A/B-testable metrics/properties
     processed_emf = {}
 
-    distinct_values_per_dimenson = defaultdict(set)
-
     report = json.loads(report_path.read_text("UTF-8"))
     for test in report["tests"]:
         for line in test["teardown"]["stdout"].splitlines():
@@ -140,9 +141,6 @@ def load_data_series(report_path: Path, tag=None, *, reemit: bool = False):
             if not dimensions:
                 continue
 
-            for dimension, value in dimensions.items():
-                distinct_values_per_dimenson[dimension].add(value)
-
             dimension_set = frozenset(dimensions.items())
 
             if dimension_set not in processed_emf:
@@ -159,24 +157,27 @@
 
                 values.extend(result[metric][0])
 
-    irrelevant_dimensions = set()
+    return processed_emf
+
 
-    for dimension, distinct_values in distinct_values_per_dimenson.items():
-        if len(distinct_values) == 1:
-            irrelevant_dimensions.add(dimension)
+def uninteresting_dimensions(processed_emf):
+    """
+    Computes the set of cloudwatch dimensions that only ever take on a
+    single value across the entire dataset.
+    """
+    values_per_dimension = defaultdict(set)
 
-    post_processed_emf = {}
+    for dimension_set in processed_emf:
+        for dimension, value in dimension_set:
+            values_per_dimension[dimension].add(value)
 
-    for dimension_set, metrics in processed_emf.items():
-        processed_key = frozenset(
-            (dim, value)
-            for (dim, value) in dimension_set
-            if dim not in irrelevant_dimensions
-        )
+    uninteresting = set()
 
-        post_processed_emf[processed_key] = metrics
+    for dimension, distinct_values in values_per_dimension.items():
+        if len(distinct_values) == 1:
+            uninteresting.add(dimension)
 
-    return post_processed_emf
+    return uninteresting
 
 
 def collect_data(binary_dir: Path, pytest_opts: str):
@@ -304,6 +305,7 @@ def analyze_data(
     )
 
     messages = []
+    do_not_print_list = uninteresting_dimensions(processed_emf_a)
     for dimension_set, metric, result, unit in failures:
        # Sanity check as described above
        if abs(statistics.mean(relative_changes_by_metric[metric])) <= noise_threshold:
@@ -325,7 +327,7 @@
            f"for metric \033[1m{metric}\033[0m with \033[0;31m\033[1mp={result.pvalue}\033[0m. "
            f"This means that observing a change of this magnitude or worse, assuming that performance "
            f"characteristics did not change across the tested commits, has a probability of {result.pvalue:.2%}. "
-           f"Tested Dimensions:\n{json.dumps(dict(dimension_set), indent=2, sort_keys=True)}"
+           f"Tested Dimensions:\n{json.dumps(dict({k: v for k,v in dimension_set.items() if k not in do_not_print_list}), indent=2, sort_keys=True)}"
        )
        messages.append(msg)
 
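A quick usage sketch of the new helper, using toy dimension sets rather than real test output: dimensions that only ever take a single value across the dataset are the ones suppressed from the "Tested Dimensions" block of a failure message, so only the dimensions that actually vary get printed.

```python
# Toy data shaped like processed_emf: keys are frozensets of (dimension, value)
# pairs, values are the per-series metrics (left empty here for brevity).
from collections import defaultdict


def uninteresting_dimensions(processed_emf):
    """Condensed restatement of the helper added in the diff above."""
    values_per_dimension = defaultdict(set)
    for dimension_set in processed_emf:
        for dimension, value in dimension_set:
            values_per_dimension[dimension].add(value)
    return {dim for dim, values in values_per_dimension.items() if len(values) == 1}


processed_emf = {
    frozenset({"performance_test": "test_block", "fio_engine": "libaio", "host_kernel": "6.1"}.items()): {},
    frozenset({"performance_test": "test_block", "fio_engine": "io_uring", "host_kernel": "6.1"}.items()): {},
}

# Only fio_engine varies across the dataset, so the other two dimensions are
# dropped from the printed failure message.
assert uninteresting_dimensions(processed_emf) == {"performance_test", "host_kernel"}
```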