
Commit 7eba7f1

d-v-b and maxrjones authored

add benchmarks using pytest-benchmark and codspeed (#3562)

* add benchmarks
* remove failing zipstore
* don't do benchmarking in default pytest runs
* changelog
* codspeed workflow
* lint
* remove pedantic mode
* only run benchmarks in one environment
* use better string id for test params, make test data 1MB, and simplify params
* move layout to an external file
* get workloads to resemble recent sharding perf tests
* test ids
* tweak tests
* tweak tests
* fix typo
* add slice indexing benchmarks
* remove readme
* add docs documentation
* simplify pytest benchmark options
* use --codspeed flag in benchmark ci
* measure walltime in ci
* Update .github/workflows/codspeed.yml
* Apply suggestion from @maxrjones
* add --ignore option to main test and gpu test invocations
* add comment
* ignore codspeed warnings
* update workflow

Co-authored-by: Max Jones <[email protected]>

1 parent c7b166e commit 7eba7f1

File tree

9 files changed: +203 −4 lines

.github/workflows/codspeed.yml

Lines changed: 35 additions & 0 deletions (new file)

```yaml
name: CodSpeed Benchmarks

on:
  push:
    branches:
      - "main"
  pull_request:
  # `workflow_dispatch` allows CodSpeed to trigger backtest
  # performance analysis in order to generate initial data.
  workflow_dispatch:

permissions:
  contents: read

jobs:
  benchmarks:
    name: Run benchmarks
    runs-on: codspeed-macro
    steps:
      - uses: actions/checkout@v5
        with:
          fetch-depth: 0 # grab all branches and tags
      - name: Set up Python
        uses: actions/setup-python@v6
        with:
          python-version: "3.11"
      - name: Install Hatch
        run: |
          python -m pip install --upgrade pip
          pip install hatch
      - name: Run the benchmarks
        uses: CodSpeedHQ/action@v4
        with:
          mode: walltime
          run: hatch run test.py3.11-2.0-minimal:pytest tests/benchmarks --codspeed
```

changes/3562.misc.md

Lines changed: 1 addition & 0 deletions (new file)

Add continuous performance benchmarking infrastructure.

docs/contributing.md

Lines changed: 10 additions & 1 deletion

```diff
@@ -264,4 +264,13 @@ If an existing Zarr format version changes, or a new version of the Zarr format
 ## Release procedure

 Open an issue on GitHub announcing the release using the release checklist template:
-[https://github.com/zarr-developers/zarr-python/issues/new?template=release-checklist.md](https://github.com/zarr-developers/zarr-python/issues/new?template=release-checklist.md>). The release checklist includes all steps necessary for the release.
+[https://github.com/zarr-developers/zarr-python/issues/new?template=release-checklist.md](https://github.com/zarr-developers/zarr-python/issues/new?template=release-checklist.md). The release checklist includes all steps necessary for the release.
+
+## Benchmarks
+
+Zarr uses [pytest-benchmark](https://pytest-benchmark.readthedocs.io/en/latest/) for running
+performance benchmarks as part of our test suite. The benchmarks can be found in `tests/benchmarks`.
+By default pytest is configured to run these benchmarks as plain tests (i.e., no benchmarking). To run
+a benchmark with timing measurements, use the `--benchmark-enable` flag when invoking `pytest`.
+
+The benchmarks are run as part of the continuous integration suite through [CodSpeed](https://codspeed.io/zarr-developers/zarr-python).
```

pyproject.toml

Lines changed: 9 additions & 3 deletions

```diff
@@ -82,6 +82,8 @@ test = [
     'numpydoc',
     "hypothesis",
     "pytest-xdist",
+    "pytest-benchmark",
+    "pytest-codspeed",
     "packaging",
     "tomlkit",
     "uv",
@@ -175,11 +177,12 @@ matrix.deps.dependencies = [
 run-coverage = "pytest --cov-config=pyproject.toml --cov=src --cov-append --cov-report xml --junitxml=junit.xml -o junit_family=legacy"
 run-coverage-html = "pytest --cov-config=pyproject.toml --cov=src --cov-append --cov-report html"
 run-coverage-gpu = "pip install cupy-cuda12x && pytest -m gpu --cov-config=pyproject.toml --cov=src --cov-append --cov-report xml --junitxml=junit.xml -o junit_family=legacy"
-run = "run-coverage --no-cov"
+run = "run-coverage --no-cov --ignore tests/benchmarks"
 run-pytest = "run"
 run-verbose = "run-coverage --verbose"
 run-mypy = "mypy src"
 run-hypothesis = "run-coverage -nauto --run-slow-hypothesis tests/test_properties.py tests/test_store/test_stateful*"
+run-benchmark = "pytest --benchmark-enable tests/benchmarks"
 list-env = "pip list"

 [tool.hatch.envs.gputest]
@@ -196,7 +199,7 @@ numpy = ["2.0", "2.2"]
 version = ["minimal"]

 [tool.hatch.envs.gputest.scripts]
-run-coverage = "pytest -m gpu --cov-config=pyproject.toml --cov=pkg --cov-report xml --cov=src --junitxml=junit.xml -o junit_family=legacy"
+run-coverage = "pytest -m gpu --cov-config=pyproject.toml --cov=pkg --cov-report xml --cov=src --junitxml=junit.xml -o junit_family=legacy --ignore tests/benchmarks"
 run = "run-coverage --no-cov"
 run-verbose = "run-coverage --verbose"
 run-mypy = "mypy src"
@@ -405,7 +408,10 @@ doctest_optionflags = [
     "IGNORE_EXCEPTION_DETAIL",
 ]
 addopts = [
-    "--durations=10", "-ra", "--strict-config", "--strict-markers",
+    "--benchmark-columns", "min,mean,stddev,outliers,rounds,iterations",
+    "--benchmark-disable", # benchmark routines run as tests without benchmarking instrumentation
+    "--durations", "10",
+    "-ra", "--strict-config", "--strict-markers",
 ]
 filterwarnings = [
     "error",
```

tests/benchmarks/__init__.py

Whitespace-only changes.

tests/benchmarks/common.py

Lines changed: 8 additions & 0 deletions (new file)

```python
from dataclasses import dataclass


@dataclass(kw_only=True, frozen=True)
class Layout:
    shape: tuple[int, ...]
    chunks: tuple[int, ...]
    shards: tuple[int, ...] | None
```

tests/benchmarks/conftest.py

Lines changed: 15 additions & 0 deletions (new file)

```python
"""Pytest configuration for benchmark tests."""

import pytest

# Filter CodSpeed instrumentation warnings that can occur intermittently
# when registering benchmark results. This is a known issue with the
# CodSpeed walltime instrumentation hooks.
# See: https://github.com/CodSpeedHQ/pytest-codspeed


def pytest_configure(config: pytest.Config) -> None:
    config.addinivalue_line(
        "filterwarnings",
        "ignore:Failed to set executed benchmark:RuntimeWarning",
    )
```
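The filter string uses the standard `action:message:category` form from Python's warnings machinery, where the message part is a regex matched against the start of the warning text. A minimal sketch of the same rule applied directly with the `warnings` module:

```python
import warnings

with warnings.catch_warnings(record=True) as caught:
    # Equivalent to the project's filterwarnings = ["error"] default:
    warnings.simplefilter("error")
    # Same ignore rule as in conftest.py; the message is a regex matched
    # against the beginning of the warning text.
    warnings.filterwarnings(
        "ignore", message="Failed to set executed benchmark", category=RuntimeWarning
    )
    warnings.warn("Failed to set executed benchmark for test", RuntimeWarning)

# The matching warning was suppressed rather than raised or recorded.
assert caught == []
```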

tests/benchmarks/test_e2e.py

Lines changed: 82 additions & 0 deletions (new file)

```python
"""
Benchmarks for end-to-end read/write performance of Zarr
"""

from __future__ import annotations

from typing import TYPE_CHECKING

from tests.benchmarks.common import Layout

if TYPE_CHECKING:
    from pytest_benchmark.fixture import BenchmarkFixture

    from zarr.abc.store import Store
    from zarr.core.common import NamedConfig

from operator import getitem, setitem
from typing import Any, Literal

import pytest

from zarr import create_array

CompressorName = Literal["gzip"] | None

compressors: dict[CompressorName, NamedConfig[Any, Any] | None] = {
    None: None,
    "gzip": {"name": "gzip", "configuration": {"level": 1}},
}


layouts: tuple[Layout, ...] = (
    # No shards, just 1000 chunks
    Layout(shape=(1_000_000,), chunks=(1000,), shards=None),
    # 1:1 chunk:shard shape, should measure overhead of sharding
    Layout(shape=(1_000_000,), chunks=(1000,), shards=(1000,)),
    # One shard with all the chunks, should measure overhead of handling inner shard chunks
    Layout(shape=(1_000_000,), chunks=(100,), shards=(10000 * 100,)),
)


@pytest.mark.parametrize("compression_name", [None, "gzip"])
@pytest.mark.parametrize("layout", layouts, ids=str)
@pytest.mark.parametrize("store", ["memory", "local"], indirect=["store"])
def test_write_array(
    store: Store, layout: Layout, compression_name: CompressorName, benchmark: BenchmarkFixture
) -> None:
    """
    Test the time required to fill an array with a single value
    """
    arr = create_array(
        store,
        dtype="uint8",
        shape=layout.shape,
        chunks=layout.chunks,
        shards=layout.shards,
        compressors=compressors[compression_name],  # type: ignore[arg-type]
        fill_value=0,
    )

    benchmark(setitem, arr, Ellipsis, 1)


@pytest.mark.parametrize("compression_name", [None, "gzip"])
@pytest.mark.parametrize("layout", layouts, ids=str)
@pytest.mark.parametrize("store", ["memory", "local"], indirect=["store"])
def test_read_array(
    store: Store, layout: Layout, compression_name: CompressorName, benchmark: BenchmarkFixture
) -> None:
    """
    Test the time required to read an entire array
    """
    arr = create_array(
        store,
        dtype="uint8",
        shape=layout.shape,
        chunks=layout.chunks,
        shards=layout.shards,
        compressors=compressors[compression_name],  # type: ignore[arg-type]
        fill_value=0,
    )
    arr[:] = 1
    benchmark(getitem, arr, Ellipsis)
```
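The `benchmark(setitem, arr, Ellipsis, 1)` pattern uses `operator.setitem`/`operator.getitem` so that only the subscript operation itself sits inside the timed call. A minimal sketch of the equivalence with plain Python lists (zarr arrays additionally accept `Ellipsis` as the key):

```python
from operator import getitem, setitem

data = [0, 0, 0, 0]

# setitem(obj, key, value) is equivalent to obj[key] = value
setitem(data, slice(None), [1, 1, 1, 1])
assert data == [1, 1, 1, 1]

# getitem(obj, key) is equivalent to obj[key]
assert getitem(data, slice(1, 3)) == [1, 1]
```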

tests/benchmarks/test_indexing.py

Lines changed: 43 additions & 0 deletions (new file)

```python
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from pytest_benchmark.fixture import BenchmarkFixture

    from zarr.abc.store import Store

from operator import getitem

import pytest

from zarr import create_array

indexers = (
    (0,) * 3,
    (slice(None),) * 3,
    (slice(0, None, 4),) * 3,
    (slice(10),) * 3,
    (slice(10, -10, 4),) * 3,
    (slice(None), slice(0, 3, 2), slice(0, 10)),
)


@pytest.mark.parametrize("store", ["memory"], indirect=["store"])
@pytest.mark.parametrize("indexer", indexers, ids=str)
def test_slice_indexing(
    store: Store, indexer: tuple[int | slice, ...], benchmark: BenchmarkFixture
) -> None:
    data = create_array(
        store=store,
        shape=(105,) * 3,
        dtype="uint8",
        chunks=(10,) * 3,
        shards=None,
        compressors=None,
        filters=None,
        fill_value=0,
    )

    data[:] = 1
    benchmark(getitem, data, indexer)
```
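To illustrate what each indexer selects, here are the same tuples applied to a NumPy array of the benchmark's shape (an illustration only; the benchmark applies them to a zarr array backed by a store):

```python
import numpy as np

data = np.ones((105,) * 3, dtype="uint8")

# Each indexer tuple exercises a different access pattern over the 105^3 array.
assert data[(0,) * 3] == 1                                   # single element
assert data[(slice(None),) * 3].shape == (105, 105, 105)     # full read
assert data[(slice(0, None, 4),) * 3].shape == (27, 27, 27)  # strided read
assert data[(slice(10),) * 3].shape == (10, 10, 10)          # corner block
assert data[(slice(10, -10, 4),) * 3].shape == (22, 22, 22)  # strided interior
assert data[slice(None), slice(0, 3, 2), slice(0, 10)].shape == (105, 2, 10)
```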

0 commit comments