
Commit 837f89a

feat: add Radar setup (#346)
This PR adds a Radar setup so we can track the performance of doc-gen4 builds. The benchmarks are:

* `own-docs` measures the time and maxrss for building the docs for doc-gen itself. Here, it is a convenient stand-in for a moderately-sized Lean project with a moderate number of dependencies.
* `mathlib-docs` measures the time and maxrss for building Mathlib's docs. We can use it to ensure that from-scratch Mathlib documentation builds don't get slower.

1 parent 5b5f81c commit 837f89a

File tree

9 files changed, +425 -0 lines changed


scripts/bench/README.md

Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
# doc-gen4 benchmark suite

This directory contains the `doc-gen4` benchmark suite.
It is built around [radar](https://github.com/leanprover/radar),
and benchmark results can be viewed
on the [Lean FRO radar instance](https://radar.lean-lang.org/repos/doc-gen4).

To execute the entire suite, run `scripts/bench/run` in the repo root.
To execute an individual benchmark, run `scripts/bench/<benchmark>/run` in the repo root.
All scripts output their measurements into the file `measurements.jsonl`.

Radar sums any duplicated measurements with matching metrics.
To post-process the `measurements.jsonl` file this way in-place,
run `scripts/bench/combine.py` in the repo root after executing the benchmark suite.

The `*.py` symlinks exist only so the Python files are a bit nicer to edit
in text editors that rely on the file extension.

## Adding a benchmark

To add a benchmark to the suite, follow these steps:

1. Create a new folder containing a `run` script and a `README.md` file describing the benchmark,
   as well as any other files required for the benchmark.
2. Edit `scripts/bench/run` to call the `run` script of your new benchmark.

## How radar executes the benchmark suite

Radar requires a _bench repo_ to be configured for each repo.
The bench repo contains scripts that execute benchmarks and present the results to radar,
following the
[bench repo specification](https://github.com/leanprover/radar/blob/62bffab39025a1c2039499ae7a85b1ad446286d9/README.md#bench-repo-specification).

The bench repo for `doc-gen4` is
[leanprover/radar-bench-doc-gen4](https://github.com/leanprover/radar-bench-doc-gen4).
It calls the bench suite inside the doc-gen4 repository and passes the results on to radar.
It expects all measurements to be presented through the `measurements.jsonl` file,
_not_ through stdout/stderr (even though this would be allowed by the bench script specification).
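
As an aside, not part of the README above: a minimal Python sketch of the `measurements.jsonl` record shape the suite works with. The topic and metric names follow the flags used by the run scripts and the `topic//category` naming in `measure.py`; the values are made up.

```python
import json

# Illustrative records only: the "topic//category" metric naming and the optional
# "unit" field follow measure.py; the values themselves are made up.
records = [
    {"metric": "own-docs//wall-clock", "value": 123.4, "unit": "s"},
    {"metric": "own-docs//maxrss", "value": 2.5e9, "unit": "B"},
    {"metric": "own-docs//instructions", "value": 1.2e12},  # "unit" is optional
]

# One JSON object per line (JSON Lines), appended to measurements.jsonl.
with open("measurements.jsonl", "a") as f:
    for record in records:
        f.write(json.dumps(record) + "\n")
```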

scripts/bench/combine.py

Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
#!/usr/bin/env python3

import argparse
import json
from pathlib import Path

OUTFILE = Path() / "measurements.jsonl"

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description=f"Combine duplicated measurements in {OUTFILE.name} the way radar does, by summing their values."
    )
    args = parser.parse_args()

    values: dict[str, float] = {}
    units: dict[str, str | None] = {}

    with open(OUTFILE, "r") as f:
        for line in f:
            data = json.loads(line)
            metric = data["metric"]
            values[metric] = values.get(metric, 0) + data["value"]
            units[metric] = data.get("unit")

    with open(OUTFILE, "w") as f:
        for metric, value in values.items():
            unit = units.get(metric)
            data = {"metric": metric, "value": value}
            if unit is not None:
                data["unit"] = unit
            f.write(f"{json.dumps(data)}\n")

scripts/bench/mathlib-docs/README.md

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
# The `mathlib-docs` benchmark

This benchmark measures the time taken for `doc-gen4` to generate
documentation for Mathlib.

scripts/bench/mathlib-docs/run

Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
#!/usr/bin/env bash
set -euxo pipefail

BENCH="scripts/bench"
REPO_ROOT="$(pwd)"

# Look up the Lean toolchain version from lean-toolchain
TOOLCHAIN_FULL=$(cat "$REPO_ROOT/lean-toolchain")
TOOLCHAIN=${TOOLCHAIN_FULL#*:}

# Validate that it's a release or RC version (e.g., 4.27.0, v4.27.0, 4.27.0-rc1, v4.27.0-rc1)
if [[ ! "$TOOLCHAIN" =~ ^v?[0-9]+\.[0-9]+\.[0-9]+(-rc[0-9]+)?$ ]]; then
    echo "Error: Toolchain '$TOOLCHAIN' is not a Lean release or RC version" >&2
    exit 1
fi

# Create a temp directory and set up cleanup
TMPDIR=$(mktemp -d)
trap 'rm -rf "$TMPDIR"' EXIT

pushd "$TMPDIR"

# Create a new Mathlib project using the math-lax template
lake +"$TOOLCHAIN" new mathproject math-lax

cd mathproject

# Add a dependency to the doc-gen4 checkout
cat >> lakefile.toml <<EOF

[[require]]
name = "doc-gen4"
path = "$REPO_ROOT"
EOF

# Update doc-gen4 dependency
MATHLIB_NO_CACHE_ON_UPDATE=1 lake update doc-gen4

# Get Mathlib cache
lake exe cache get

# Build DocGen4 and its executable first (we want to measure docs generation, not tool building)
lake build DocGen4
lake build doc-gen4

popd

# Benchmark documentation generation
env DOCGEN_SRC="file" "$REPO_ROOT/$BENCH/measure.py" -t mathlib-docs -m instructions -m maxrss -m task-clock -m wall-clock -- \
    lake --dir "$TMPDIR/mathproject" build Mathlib:docs
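
The toolchain check in the script above accepts only release or release-candidate versions. An equivalent check sketched in Python, with illustrative version strings:

```python
import re

# Same shape as the bash pattern above: optional "v", three numeric components,
# optional "-rcN" suffix. Anything else (e.g. a nightly toolchain) is rejected.
RELEASE_OR_RC = re.compile(r"v?\d+\.\d+\.\d+(-rc\d+)?")

for toolchain in ["4.27.0", "v4.27.0-rc1", "nightly-2025-10-01"]:
    print(toolchain, RELEASE_OR_RC.fullmatch(toolchain) is not None)
# 4.27.0 True
# v4.27.0-rc1 True
# nightly-2025-10-01 False
```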

scripts/bench/measure.py

Lines changed: 160 additions & 0 deletions
@@ -0,0 +1,160 @@
#!/usr/bin/env python3

import argparse
import json
import os
import resource
import subprocess
import sys
import tempfile
from dataclasses import dataclass
from pathlib import Path

OUTFILE = Path() / "measurements.jsonl"


@dataclass
class PerfMetric:
    event: str
    factor: float = 1
    unit: str | None = None


@dataclass
class RusageMetric:
    name: str
    factor: float = 1
    unit: str | None = None


PERF_METRICS = {
    "task-clock": PerfMetric("task-clock", factor=1e-9, unit="s"),
    "wall-clock": PerfMetric("duration_time", factor=1e-9, unit="s"),
    "instructions": PerfMetric("instructions"),
}

PERF_UNITS = {
    "msec": 1e-3,
    "ns": 1e-9,
}

RUSAGE_METRICS = {
    "maxrss": RusageMetric("ru_maxrss", factor=1000, unit="B"),  # KiB on linux
}

ALL_METRICS = {**PERF_METRICS, **RUSAGE_METRICS}


def measure_perf(cmd: list[str], events: list[str]) -> dict[str, tuple[float, str]]:
    with tempfile.NamedTemporaryFile() as tmp:
        cmd = [
            *["perf", "stat", "-j", "-o", tmp.name],
            *[arg for event in events for arg in ["-e", event]],
            *["--", *cmd],
        ]

        # Execute command
        env = os.environ.copy()
        env["LC_ALL"] = "C"  # or else perf may output syntactically invalid json
        result = subprocess.run(cmd, env=env)
        if result.returncode != 0:
            sys.exit(result.returncode)

        # Collect results
        perf = {}
        for line in tmp:
            data = json.loads(line)
            if "event" in data and "counter-value" in data:
                perf[data["event"]] = float(data["counter-value"]), data["unit"]

        return perf


@dataclass
class Result:
    category: str
    value: float
    unit: str | None

    def fmt(self, topic: str) -> str:
        metric = f"{topic}//{self.category}"
        if self.unit is None:
            return json.dumps({"metric": metric, "value": self.value})
        return json.dumps({"metric": metric, "value": self.value, "unit": self.unit})


def measure(cmd: list[str], metrics: list[str]) -> list[Result]:
    # Check args
    unknown_metrics = []
    for metric in metrics:
        if metric not in RUSAGE_METRICS and metric not in PERF_METRICS:
            unknown_metrics.append(metric)
    if unknown_metrics:
        raise Exception(f"unknown metrics: {', '.join(unknown_metrics)}")

    # Prepare perf events
    events: list[str] = []
    for metric in metrics:
        if info := PERF_METRICS.get(metric):
            events.append(info.event)

    # Measure
    perf = measure_perf(cmd, events)
    rusage = resource.getrusage(resource.RUSAGE_CHILDREN)

    # Extract results
    results = []
    for metric in metrics:
        if info := PERF_METRICS.get(metric):
            if info.event in perf:
                value, unit = perf[info.event]
            else:
                # Without the corresponding permissions,
                # we only get access to the userspace versions of the counters.
                value, unit = perf[f"{info.event}:u"]

            value *= PERF_UNITS.get(unit, info.factor)
            results.append(Result(metric, value, info.unit))

        if info := RUSAGE_METRICS.get(metric):
            value = getattr(rusage, info.name) * info.factor
            results.append(Result(metric, value, info.unit))

    return results


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description=f"Measure resource usage of a command using perf and rusage. The results are appended to {OUTFILE.name}.",
    )
    parser.add_argument(
        "-t",
        "--topic",
        action="append",
        default=[],
        help="topic prefix for the metrics",
    )
    parser.add_argument(
        "-m",
        "--metric",
        action="append",
        default=[],
        help=f"metrics to measure. Can be specified multiple times. Available metrics: {', '.join(sorted(ALL_METRICS))}",
    )
    parser.add_argument(
        "cmd",
        nargs="*",
        help="command to measure the resource usage of",
    )
    args = parser.parse_args()

    topics: list[str] = args.topic
    metrics: list[str] = args.metric
    cmd: list[str] = args.cmd

    results = measure(cmd, metrics)

    with open(OUTFILE, "a+") as f:
        for result in results:
            for topic in topics:
                f.write(f"{result.fmt(topic)}\n")

scripts/bench/own-docs/README.md

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
# The `own-docs` benchmark

This benchmark measures the time taken for `doc-gen4` to generate its
own documentation. This is a convenient stand-in for moderately-sized
Lean projects without too many dependencies. When this benchmark was
created, the time was dominated by building documentation for core
Lean, but this may change over time and it's good to track it.

scripts/bench/own-docs/run

Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
#!/usr/bin/env bash
set -euxo pipefail

BENCH="scripts/bench"

# Prepare build

lake clean

# We want to measure the time taken to generate documentation, not to
# build the tool, so we build it and the library first:
lake build DocGen4
lake build doc-gen4

env DOCGEN_SRC="file" "$BENCH/measure.py" -t own-docs -m instructions -m maxrss -m task-clock -m wall-clock -- \
    lake build DocGen4:docs
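
Tying the run scripts back to the "Adding a benchmark" steps in the README: a new benchmark gets its own folder with a `run` script and a `README.md`, and is hooked into `scripts/bench/run`. Below is a hypothetical sketch of such a run script; the benchmark name `example-docs` and the `SomeLibrary:docs` target are invented, and it is written in Python purely for illustration (the run scripts in this PR are bash).

```python
#!/usr/bin/env python3
# Hypothetical scripts/bench/example-docs/run -- illustration only; the real run
# scripts in this PR are bash, and "example-docs" and its build target are made up.
import os
import subprocess

BENCH = "scripts/bench"

# Build the doc-gen4 executable first so that tool building is not measured.
subprocess.run(["lake", "build", "doc-gen4"], check=True)

# Measure only docs generation and append the results to measurements.jsonl
# under the (hypothetical) topic "example-docs".
env = {**os.environ, "DOCGEN_SRC": "file"}
subprocess.run(
    [f"{BENCH}/measure.py", "-t", "example-docs",
     "-m", "wall-clock", "-m", "maxrss",
     "--", "lake", "build", "SomeLibrary:docs"],  # placeholder target
    check=True,
    env=env,
)
```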
