diff --git a/README.md b/README.md
index 668b516d..11f88285 100644
--- a/README.md
+++ b/README.md
@@ -214,11 +214,26 @@
 If you don't want the results to be summarized and displayed in a
 human-readable format, you can get raw JSON or CSV via the `--raw` flag:
 
 ```
-$ cargo run -- benchmark --raw --output-format csv -- benchmark.wasm
+$ cargo run -- benchmark --raw --output-format json -- benchmark.wasm
 ```
 
 Then you can use your own R/Python/spreadsheets/etc. to analyze and visualize
-the benchmark results.
+the benchmark results. The JSON output is also compatible with the in-tree
+`viz.py` script.
+
+### Benchmark Results Visualization (viz.py)
+
+For results collected in the JSON format, an in-tree tool can render the
+benchmark results as an HTML document with embedded graphics. The tool
+requires that the Python
+[`uv`](https://docs.astral.sh/uv/getting-started/installation/) tool be
+installed.
+
+```
+$ ./scripts/viz.py -o experiment.html baseline.json experiment1.json experiment2.json
+```
+
+Run the tool with `--help` to see additional usage options.
 
 ### Adding a New Benchmark
diff --git a/scripts/viz.jinja b/scripts/viz.jinja
new file mode 100644
index 00000000..6d5fe992
--- /dev/null
+++ b/scripts/viz.jinja
@@ -0,0 +1,97 @@
+<!DOCTYPE html>
+<html>
+<head>
+<title>Sightglass Benchmark Results</title>
+<script src="https://cdn.jsdelivr.net/npm/vega@5"></script>
+<script src="https://cdn.jsdelivr.net/npm/vega-lite@5"></script>
+<script src="https://cdn.jsdelivr.net/npm/vega-embed@6"></script>
+<style>
+  .inconsistent { background-color: khaki; }
+  .slower { background-color: salmon; }
+  .faster { background-color: lightgreen; }
+</style>
+</head>
+<body>
+<h1>Sightglass Benchmark Results</h1>
+<div>
+  Key:
+  <span class="inconsistent">Baseline CV > 5% -- Inconsistent Benchmark</span>
+  <span class="slower">Test Slower by CV%+ from baseline</span>
+  <span class="faster">Test Faster by CV%+ from baseline</span>
+</div>
+<div>
+  CV is the Coefficient of Variation for the baseline benchmark run.
+</div>
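+<!-- One row per benchmark, one column per prefix (baseline first, marked
+     with *): baseline cells show p25 +/- CV%, other cells show the percent
+     delta of their p25 from the baseline p25, colored whenever the delta
+     exceeds the baseline CV. -->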
+<table>
+  <tr>
+    <th>Benchmark</th>
+    {% for prefix in prefixes %}
+    <th>{% if prefix == baseline %}*{% endif %}{{ prefix }}</th>
+    {% endfor %}
+  </tr>
+  {% for benchmark in benchmarks %}
+  <tr>
+    <td>{{ benchmark.name }}</td>
+    {% for prefix in prefixes %}
+    {% set stats = benchmark.stats if prefix == baseline else benchmark.stats.relative[prefix] %}
+    {% if prefix == baseline %}
+    {% set class = "inconsistent" if stats.cv > 5 else "" %}
+    <td class="{{ class }}">{{ "%.2f"|format(stats.p25) }} +/- {{ "%.2f"|format(stats.cv) }}%</td>
+    {% else %}
+    {% set class = "slower" if stats.p25_delta_pct > benchmark.stats.cv else "faster" if stats.p25_delta_pct < -1 * benchmark.stats.cv else "" %}
+    <td class="{{ class }}">{{ "%.2f"|format(stats.p25_delta_pct) }}%</td>
+    {% endif %}
+    {% endfor %}
+  </tr>
+  {% endfor %}
+</table>
+
+<div>
+  {% for benchmark in benchmarks %}
+  <div>
+    <h2>{{ benchmark.name }}</h2>
+    <div id="{{ benchmark.chart.id }}"></div>
+  </div>
+  {% endfor %}
+</div>
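+<!-- Embed each Altair chart (serialized to JSON by viz.py's chart.to_json())
+     into its benchmark's placeholder <div> above. -->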
+<script type="text/javascript">
+  {% for benchmark in benchmarks %}
+  vegaEmbed("#{{ benchmark.chart.id }}", {{ benchmark.chart.json }});
+  {% endfor %}
+</script>
+</body>
+</html>
diff --git a/scripts/viz.py b/scripts/viz.py
new file mode 100644
index 00000000..5ced0017
--- /dev/null
+++ b/scripts/viz.py
@@ -0,0 +1,216 @@
+#!/usr/bin/env python3
+#
+# /// script
+# dependencies = [
+#     "pandas",
+#     "altair",
+#     "jinja2"
+# ]
+# ///
+
+import argparse
+import os
+import pandas as pd
+import altair as alt
+from dataclasses import dataclass
+import re
+import sys
+import pathlib
+
+# our usage isn't abusing chained assignment; suppress the noisy
+# SettingWithCopyWarning
+pd.options.mode.chained_assignment = None
+
+
+@dataclass
+class RelativeStats:
+    cv: float
+    p25: float
+    p25_delta_pct: float
+
+
+@dataclass
+class Stats:
+    cv: float
+    p25: float
+    relative: dict[str, RelativeStats]
+
+
+def wasm_path_to_benchmark_name(wasm_path: str) -> str:
+    splits = wasm_path.split("/")
+    if splits[-1] == "benchmark.wasm":
+        # E.g. noop/benchmark.wasm -> noop
+        return splits[-2]
+    else:
+        # E.g. libsodium/libsodium-box7.wasm -> libsodium-box7
+        return splits[-1].replace(".wasm", "")
+
+
+def parse_single_input(path, prefix, pass_num=1, measure="cycles"):
+    """Load one raw Sightglass JSON file, keeping only rows for the given
+    measure and tagging them with the pass number and prefix."""
+    df = pd.read_json(path)
+    df = df[df["event"] == measure]
+    df["pass"] = pass_num
+    df["prefix"] = prefix
+    return df
+
+
+def parse_inputs(inputs, measure="cycles"):
+    """Parse all input files into a single DataFrame, grouping numbered
+    files (e.g. test-1.json, test-2.json) under a common prefix."""
+    if len(inputs) == 1 and not inputs[0].endswith(".json"):
+        # assume this is a directory; try to use all JSON files
+        import glob
+
+        file_inputs = list(sorted(glob.glob(f"{inputs[0]}/*.json")))
+        return parse_inputs(file_inputs, measure)
+
+    # list of files now; organize by prefix if detected
+    RE_NUMBERED = re.compile(r"(?P<prefix>.+)-(?P<number>\d+).json")
+    df = pd.DataFrame()
+    for path in inputs:
+        path = pathlib.Path(path)
+        if not path.exists():
+            print(f"{path} not found!")
+            sys.exit(1)
+
+        match = RE_NUMBERED.match(path.name)
+        if match:
+            prefix = match["prefix"]
+            pass_num = int(match["number"])
+            df = pd.concat([df, parse_single_input(path, prefix, pass_num)])
+        else:
+            prefix = path.stem
+            df = pd.concat([df, parse_single_input(path, prefix)])
+
+    return df
+
+
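+# A note on the statistics below: CV is the coefficient of variation
+# (standard deviation over mean, as a percentage) and p25 is the 25th
+# percentile of the measured counts. Relative deltas are symmetric percent
+# differences -- the difference between the two p25 values divided by their
+# mean -- so regressions and improvements are scaled comparably.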
+def compute_stats(df, benchmark, baseline):
+    # select only rows for this benchmark
+    df = df[df["benchmark"] == benchmark]
+
+    baseline_df = df[df["prefix"] == baseline]
+    prefixes = df[df["prefix"] != baseline]["prefix"].unique()
+
+    baseline_p25 = baseline_df["count"].quantile(0.25)
+    baseline_mean = baseline_df["count"].mean()
+    baseline_cv = (baseline_df["count"].std() / baseline_mean) * 100
+    stats = Stats(cv=baseline_cv, p25=baseline_p25, relative={})
+    for prefix in prefixes:
+        prefix_df = df[df["prefix"] == prefix]
+        prefix_p25 = prefix_df["count"].quantile(0.25)
+        prefix_mean = prefix_df["count"].mean()
+        prefix_cv = (prefix_df["count"].std() / prefix_mean) * 100
+        p25_delta_pct = (
+            (prefix_p25 - baseline_p25) / ((baseline_p25 + prefix_p25) / 2)
+        ) * 100
+        rel_stats = RelativeStats(
+            cv=prefix_cv, p25=prefix_p25, p25_delta_pct=p25_delta_pct
+        )
+        stats.relative[prefix] = rel_stats
+
+    return stats
+
+
+def plot_benchmark(df, baseline, benchmark):
+    """Render two box plots side by side: raw counts per prefix, and percent
+    difference from the baseline's 25th percentile."""
+    # select only rows for this benchmark
+    df = df[df["benchmark"] == benchmark]
+
+    chart1 = (
+        alt.Chart(df)
+        .mark_boxplot()
+        .encode(
+            y=alt.Y("prefix", title=None),
+            x=alt.X("count:Q", title="Count (Cycles)"),
+            color=alt.Color("prefix"),
+        )
+    )
+
+    df_baseline = df[df["prefix"] == baseline]
+    baseline_p25 = df_baseline["count"].quantile(0.25)
+    df["pct_diff_from_p25"] = (df["count"] - baseline_p25) / baseline_p25 * 100
+
+    chart2 = (
+        alt.Chart(df)
+        .mark_boxplot()
+        .encode(
+            y=alt.Y("prefix", title=None),
+            x=alt.X("pct_diff_from_p25", title="Percent Difference from Baseline p25"),
+            color=alt.Color("prefix"),
+        )
+    )
+
+    return alt.hconcat(chart1, chart2).properties(title=f"{benchmark}")
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("inputs", nargs="+", help="Directory or JSON files to analyze")
+    parser.add_argument(
+        "-b",
+        "--baseline",
+        default=None,
+        help="prefix of baseline to use (by default look for one containing 'baseline')",
+    )
+    parser.add_argument(
+        "-o", "--output", default="viz.html", help="HTML file to output"
+    )
+    args = parser.parse_args()
+    return args
+
+
+def render(args, tmpl_data):
+    from jinja2 import Environment, FileSystemLoader
+
+    env = Environment(
+        loader=FileSystemLoader(os.path.abspath(os.path.dirname(__file__)))
+    )
+    viz_template = env.get_template("viz.jinja")
+    rendered = viz_template.render(**tmpl_data)
+    with open(args.output, "w") as out:
+        out.write(rendered)
+    print(f"Wrote {args.output}")
+
+
+def main():
+    args = parse_args()
+    df = parse_inputs(args.inputs)
+
+    # add column for benchmark
+    df["benchmark"] = df.apply(
+        lambda row: wasm_path_to_benchmark_name(row["wasm"]), axis=1
+    )
+
+    tmpl_data = {}
+    benchmarks = df["benchmark"].unique()
+    # use the explicitly requested baseline, else the first prefix that
+    # mentions "baseline", else fall back to the first prefix
+    baseline = args.baseline or (
+        next((p for p in df["prefix"].unique() if "baseline" in p), None)
+        or df["prefix"].unique()[0]
+    )
+
+    # reorg prefixes so that baseline is first
+    prefixes = df["prefix"].unique()
+    for i, prefix in enumerate(prefixes):
+        if prefix == baseline:
+            prefixes[i] = prefixes[0]
+            prefixes[0] = baseline
+            break
+    tmpl_data["prefixes"] = prefixes
+    tmpl_data["baseline"] = baseline
+    tmpl_data["benchmarks"] = []
+    for benchmark in benchmarks:
+        chart = plot_benchmark(df, baseline, benchmark)
+        tmpl_data["benchmarks"].append(
+            {
+                "name": benchmark,
+                "baseline": baseline,
+                "stats": compute_stats(df, benchmark, baseline),
+                "chart": {
+                    "id": benchmark,
+                    "json": chart.to_json(),
+                },
+            }
+        )
+    render(args, tmpl_data)
+
+
+if __name__ == "__main__":
+    main()
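
A possible multi-pass workflow, assuming the `<prefix>-<number>.json` naming
convention that `parse_inputs` detects (the file and directory names below are
illustrative):

```
$ ls results/
baseline-1.json  baseline-2.json  baseline-3.json
change-1.json    change-2.json    change-3.json
$ ./scripts/viz.py -o viz.html results
Wrote viz.html
```

Files sharing a prefix are aggregated across passes, a single non-`.json`
argument is expanded to every `*.json` file in that directory, and the first
prefix containing "baseline" is used as the baseline unless `-b` overrides it.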