Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 17 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -214,11 +214,26 @@ If you don't want the results to be summarized and displayed in a human-readable
format, you can get raw JSON or CSV via the `--raw` flag:

```
$ cargo run -- benchmark --raw --output-format csv -- benchmark.wasm
$ cargo run -- benchmark --raw --output-format json -- benchmark.wasm
```

Then you can use your own R/Python/spreadsheets/etc. to analyze and visualize
the benchmark results.
the benchmark results. The JSON output is also compatible with the in-tree
`scripts/viz.py` visualization script.

### Benchmark Results Visualization (viz.py)

For results collected in the JSON format, an in-tree tool is provided that can
visualize benchmark results as an HTML document with embedded graphics. This
tool requires that the Python
[`uv`](https://docs.astral.sh/uv/getting-started/installation/) tool be
installed.

```
$ ./scripts/viz.py -o experiment.html baseline.json experiment1.json experiment2.json
```

See the `--help` option on the tool for additional usage options.

### Adding a New Benchmark

Expand Down
97 changes: 97 additions & 0 deletions scripts/viz.jinja
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
<!doctype html>
<html lang="en-us">
  <head>
    <meta charset="utf-8">
    <meta http-equiv="x-ua-compatible" content="ie=edge">
    <title>Sightglass Benchmark Results</title>
    <meta name="description" content="Sightglass Benchmark Results Visualization">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <script src="https://cdn.jsdelivr.net/npm/vega@6"></script>
    <script src="https://cdn.jsdelivr.net/npm/vega-lite@6"></script>
    <script src="https://cdn.jsdelivr.net/npm/vega-embed@6"></script>
    {# One embedded Vega-Lite spec per benchmark, rendered by renderViz() below. #}
    {% for benchmark in benchmarks %}
    <script type="application/json" id="vega-lite-{{ benchmark.chart.id }}">
      {{ benchmark.chart.json }}
    </script>
    {% endfor %}
    {# These styles are loop-invariant: emit them once, not once per benchmark. #}
    <style>
      .slower {
        background: lightcoral;
        font-weight: bold;
      }

      .faster {
        background: lightgreen;
        font-weight: bold;
      }

      .inconsistent {
        background: lightgray;
      }
    </style>
  </head>
  <body>
    <h1>Sightglass Benchmark Results</h1>
    <div>
      <span style="font-weight:bold">Key:</span>
      <span class="inconsistent">Baseline CV > 5% -- Inconsistent Benchmark</span>
      <span class="slower">Test Slower by CV%+ from baseline</span>
      <span class="faster">Test Faster by CV%+ from baseline</span>
    </div>
    <div>
      CV is the <a href="https://en.wikipedia.org/wiki/Coefficient_of_variation">Coefficient of Variation</a> for the baseline benchmark run.
    </div>
    {# Summary table: one row per benchmark, one column per prefix; the
       baseline column is marked with "*" and shown first. #}
    <table style="padding-top:20px">
      <thead>
        <tr>
          <th scope="col">Benchmark</th>
          {% for prefix in prefixes %}
          <th scope="col">{% if prefix == baseline %}*{% endif %}{{ prefix }}</th>
          {% endfor %}
        </tr>
      </thead>
      <tbody>
        {% for benchmark in benchmarks %}
        <tr>
          <td><a href="#{{ benchmark.name }}">{{ benchmark.name }}</a></td>
          {% for prefix in prefixes %}
          {% set stats = benchmark.stats if prefix == baseline else benchmark.stats.relative[prefix] %}
          {% if prefix == baseline %}
          {% set class = "inconsistent" if stats.cv > 5 else "" %}
          <td class="{{ class }}">{{ "%.2f"|format(stats.p25) }} +/- {{ "%.2f"|format(stats.cv) }}%</td>
          {% else %}
          {# A run is slower/faster only when its p25 delta exceeds the baseline's CV. #}
          {% set class = "slower" if stats.p25_delta_pct > benchmark.stats.cv else "faster" if stats.p25_delta_pct < -1 * benchmark.stats.cv else "" %}
          <td class="{{ class }}">{{ "%.2f"|format(stats.p25_delta_pct) }}%</td>
          {% endif %}
          {% endfor %}
        </tr>
        {% endfor %}
      </tbody>
    </table>

    <div id="benchmark-details">
      {% for benchmark in benchmarks %}
      <h2 id="{{ benchmark.name }}">{{ benchmark.name }}</h2>
      <ul>
        <li><strong {% if benchmark.stats.cv > 5 %}class="inconsistent"{% endif %}>Baseline CV:</strong> {{ "%.2f"|format(benchmark.stats.cv) }}%</li>
        {% for prefix, stats in benchmark.stats.relative.items() %}
        {% set class = "slower" if stats.p25_delta_pct > benchmark.stats.cv else "faster" if stats.p25_delta_pct < -1 * benchmark.stats.cv else "" %}
        <li>{{ prefix }}: <span class="{{ class }}">{{ "%.2f"|format(stats.p25_delta_pct) }}%</span></li>
        {% endfor %}
      </ul>
      <div id="viz-{{ benchmark.chart.id }}"></div>
      {% endfor %}
    </div>

    <script type="text/javascript">
      // Parse the embedded Vega-Lite JSON for `chart_id` and render it into
      // the matching "viz-<id>" placeholder div.
      function renderViz(chart_id) {
        let vl_text = document.getElementById(`vega-lite-${chart_id}`).textContent;
        let vl_data = JSON.parse(vl_text);
        vegaEmbed(`#viz-${chart_id}`, vl_data);
      }
      {% for benchmark in benchmarks %}
      renderViz("{{ benchmark.chart.id }}");
      {% endfor %}
    </script>
  </body>
</html>
216 changes: 216 additions & 0 deletions scripts/viz.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,216 @@
#!/usr/bin/env python3
#
# /// script
# dependencies = [
# "pandas",
# "altair",
# "jinja2"
# ]
# ///

import argparse
import os
import pandas as pd
import altair as alt
from dataclasses import dataclass
import re
import sys
import pathlib

# our usage isn't abusing this, suppress noisy warnings
pd.options.mode.chained_assignment = None


@dataclass
class RelativeStats:
    """Statistics for one non-baseline prefix, relative to the baseline run."""

    cv: float  # coefficient of variation of this prefix's counts, as a percent
    p25: float  # 25th percentile of this prefix's measured counts
    p25_delta_pct: float  # percent difference of p25 from the baseline p25


@dataclass
class Stats:
    """Baseline statistics for one benchmark, plus per-prefix relative stats."""

    cv: float  # coefficient of variation of the baseline counts, as a percent
    p25: float  # 25th percentile of the baseline counts
    relative: dict[str, RelativeStats]  # keyed by non-baseline prefix name


def wasm_path_to_benchmark_name(wasm_path: str) -> str:
    """Derive a human-readable benchmark name from a .wasm file path."""
    *parents, leaf = wasm_path.split("/")
    if leaf == "benchmark.wasm":
        # Generic file name: the parent directory names the benchmark,
        # e.g. noop/benchmark.wasm -> noop
        return parents[-1]
    # Specific file name: strip the extension,
    # e.g. libsodium/libsodium-box7.wasm -> libsodium-box7
    return leaf.replace(".wasm", "")


def parse_single_input(path, prefix, pass_num=1, measure="cycles"):
    """Load one benchmark-results JSON file into a DataFrame.

    Only rows whose "event" equals `measure` are kept; "pass" and
    "prefix" columns are added to identify the run.
    """
    frame = pd.read_json(path)
    frame = frame.loc[frame["event"] == measure]
    # "pass" is a Python keyword, so it can't be a plain assign() kwarg.
    return frame.assign(**{"pass": pass_num, "prefix": prefix})


def parse_inputs(inputs, measure="cycles"):
    """Parse benchmark-result JSON files into a single DataFrame.

    `inputs` is either a list of JSON file paths or a single directory
    containing them.  Files named `<prefix>-<N>.json` are grouped under
    `prefix` with pass number N; any other file's stem becomes its prefix
    with pass number 1.  Exits the process if a file is missing.
    """
    if len(inputs) == 1 and not inputs[0].endswith(".json"):
        # assume this is a directory; try to use all JSON files
        import glob

        file_inputs = list(sorted(glob.glob(f"{inputs[0]}/*.json")))
        return parse_inputs(file_inputs, measure)

    # list of files now; organize by prefix if detected
    RE_NUMBERED = re.compile(r"(?P<prefix>.+)-(?P<number>\d+).json")
    frames = []
    for path in inputs:
        path = pathlib.Path(path)
        if not path.exists():
            # Diagnostics belong on stderr; sys.exit() raises, no return needed.
            print(f"{path} not found!", file=sys.stderr)
            sys.exit(1)

        match = RE_NUMBERED.match(path.name)
        if match:
            prefix = match["prefix"]
            pass_num = int(match["number"])
        else:
            prefix = path.stem
            pass_num = 1
        # Forward `measure` (it was previously dropped, so a non-default
        # measure was silently ignored).
        frames.append(parse_single_input(path, prefix, pass_num, measure))

    # Concatenate once instead of growing a DataFrame per file (quadratic).
    return pd.concat(frames) if frames else pd.DataFrame()


def compute_stats(df, benchmark, baseline):
    """Compute baseline statistics for `benchmark` plus stats for every
    other prefix relative to the baseline prefix."""

    def _summary(counts):
        # 25th percentile and coefficient of variation (as a percent).
        return counts.quantile(0.25), (counts.std() / counts.mean()) * 100

    rows = df[df["benchmark"] == benchmark]

    base_counts = rows.loc[rows["prefix"] == baseline, "count"]
    base_p25, base_cv = _summary(base_counts)
    stats = Stats(cv=base_cv, p25=base_p25, relative={})

    for name in rows.loc[rows["prefix"] != baseline, "prefix"].unique():
        counts = rows.loc[rows["prefix"] == name, "count"]
        p25, cv = _summary(counts)
        # Symmetric percent difference of p25 against the baseline p25.
        delta_pct = ((p25 - base_p25) / ((base_p25 + p25) / 2)) * 100
        stats.relative[name] = RelativeStats(cv=cv, p25=p25, p25_delta_pct=delta_pct)

    return stats


def plot_benchmark(df, baseline, benchmark):
    """Build a pair of box plots for one benchmark: absolute counts and
    percent difference from the baseline's 25th percentile."""
    rows = df[df["benchmark"] == benchmark]

    absolute = (
        alt.Chart(rows)
        .mark_boxplot()
        .encode(
            y=alt.Y("prefix", title=None),
            x=alt.X("count:Q", title="Count (Cycles)"),
            color=alt.Color("prefix"),
        )
    )

    # Normalize every sample against the baseline's 25th percentile.
    base_p25 = rows[rows["prefix"] == baseline]["count"].quantile(0.25)
    rows["pct_diff_from_p25"] = (rows["count"] - base_p25) / base_p25 * 100

    relative = (
        alt.Chart(rows)
        .mark_boxplot()
        .encode(
            y=alt.Y("prefix", title=None),
            x=alt.X("pct_diff_from_p25", title="Percent Difference from Baseline p25"),
            color=alt.Color("prefix"),
        )
    )

    return alt.hconcat(absolute, relative).properties(title=f"{benchmark}")


def parse_args():
    """Parse and return the command-line arguments for the visualizer."""
    cli = argparse.ArgumentParser()
    cli.add_argument("inputs", nargs="+", help="Directory or JSON files to analyze")
    cli.add_argument(
        "-b",
        "--baseline",
        default=None,
        help="prefix of baseline to use (by default look for one containing 'baseline')",
    )
    cli.add_argument("-o", "--output", default="viz.html", help="HTML File to output")
    return cli.parse_args()


def render(args, tmpl_data):
    """Render the viz.jinja template (next to this script) with
    `tmpl_data` and write the resulting HTML to args.output."""
    from jinja2 import Environment, FileSystemLoader

    # Look up the template in the directory containing this script.
    script_dir = os.path.abspath(os.path.dirname(__file__))
    env = Environment(loader=FileSystemLoader(script_dir))
    html = env.get_template("viz.jinja").render(**tmpl_data)
    with open(args.output, "w") as out:
        out.write(html)
    print(f"Wrote {args.output}")


def main():
    """Entry point: load results, compute stats and charts per benchmark,
    and render the HTML report."""
    args = parse_args()
    df = parse_inputs(args.inputs)

    # add a column naming the benchmark each row belongs to
    df["benchmark"] = df.apply(
        lambda row: wasm_path_to_benchmark_name(row["wasm"]), axis=1
    )

    tmpl_data = {}
    benchmarks = df["benchmark"].unique()
    prefixes = list(df["prefix"].unique())

    # Pick the baseline prefix: an explicit --baseline wins; otherwise use
    # the first prefix containing "baseline", falling back to the first
    # prefix seen.  (Previously --baseline was ignored and the fallback was
    # benchmarks[0] -- a benchmark name, which can never match a prefix.)
    baseline = (
        args.baseline
        or next((p for p in prefixes if "baseline" in p), None)
        or prefixes[0]
    )

    # reorganize prefixes so that the baseline is first
    if baseline in prefixes:
        prefixes.remove(baseline)
        prefixes.insert(0, baseline)

    tmpl_data["prefixes"] = prefixes
    tmpl_data["baseline"] = baseline
    tmpl_data["benchmarks"] = []
    for benchmark in benchmarks:
        chart = plot_benchmark(df, baseline, benchmark)
        tmpl_data["benchmarks"].append(
            {
                "name": benchmark,
                "baseline": baseline,
                "stats": compute_stats(df, benchmark, baseline),
                "chart": {
                    "id": benchmark,
                    "json": chart.to_json(),
                },
            }
        )
    render(args, tmpl_data)


if __name__ == "__main__":
main()