diff --git a/.github/workflows/benchmark.yaml b/.github/workflows/benchmark.yaml
new file mode 100644
index 00000000..33622d26
--- /dev/null
+++ b/.github/workflows/benchmark.yaml
@@ -0,0 +1,169 @@
+name: Benchmarks
+
+on:
+ pull_request:
+ types: [labeled, synchronize]
+ workflow_dispatch:
+ inputs:
+ mode:
+ description: 'Benchmark mode'
+ required: true
+ type: choice
+ options:
+ - run
+ - compare
+ default: 'run'
+ comparison_ref:
+ description: 'Git ref to compare against (only used in compare mode)'
+ required: false
+ default: 'main'
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: true
+
+env:
+ FORCE_COLOR: "1"
+ MPLBACKEND: agg
+ UV_COMPILE_BYTECODE: "1"
+
+defaults:
+ run:
+ # Fail on errors in multiline statements (-e), in pipes (-o pipefail), and on unset variables (-u).
+ shell: bash -euo pipefail {0}
+
+jobs:
+ benchmark:
+ # Run on 'pytest-benchmark' label or manual dispatch
+ if: |
+ github.event_name == 'workflow_dispatch' ||
+ (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'pytest-benchmark'))
+ name: Run Benchmarks
+ runs-on: ubuntu-latest
+ permissions:
+ contents: write
+ pull-requests: write
+
+ steps:
+ - uses: actions/checkout@v5
+ with:
+ filter: blob:none
+ fetch-depth: 0
+
+ - name: Set up Python
+ uses: actions/setup-python@v5
+ with:
+ python-version: "3.13"
+
+ - name: Install uv
+ uses: astral-sh/setup-uv@v7
+ with:
+ enable-cache: true
+ cache-dependency-glob: pyproject.toml
+
+ - name: Install dependencies
+ run: uv pip install -e ".[test]" --system
+
+ - name: Run benchmarks
+ run: |
+ pytest tests/benchmarks/ \
+ --benchmark-enable \
+ --benchmark-only \
+ --benchmark-json=benchmark-results.json \
+ --benchmark-warmup=on \
+ --benchmark-disable-gc \
+ --benchmark-min-rounds=5 \
+ -v --color=yes
+
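+ # On pull_request events, inputs.mode is empty, so this step runs and posts the simple report.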
+ - name: Generate report (simple mode)
+ if: github.event.inputs.mode != 'compare'
+ run: |
+ python .scripts/ci/benchmark_report.py \
+ benchmark-results.json \
+ --pr-ref="${GITHUB_HEAD_REF:-$GITHUB_REF_NAME}" \
+ --output=benchmark-report.md \
+ --github-summary
+
+ - name: Run comparison benchmarks
+ if: github.event.inputs.mode == 'compare'
+ run: |
+ # Save current results
+ mv benchmark-results.json benchmark-pr.json
+
+ # Checkout base
+ git checkout ${{ github.event.inputs.comparison_ref || 'main' }}
+
+ # Reinstall in case dependencies changed
+ uv pip install -e ".[test]" --system
+
+ # Run benchmarks on base
+ pytest tests/benchmarks/ \
+ --benchmark-enable \
+ --benchmark-only \
+ --benchmark-json=benchmark-base.json \
+ --benchmark-warmup=on \
+ --benchmark-disable-gc \
+ --benchmark-min-rounds=5 \
+ -v --color=yes || true
+
+ # Checkout back
+ git checkout ${{ github.sha }}
+
+ # Rename for report
+ mv benchmark-pr.json benchmark-results.json
+
+ - name: Generate comparison report
+ if: github.event.inputs.mode == 'compare'
+ run: |
+ python .scripts/ci/benchmark_report.py \
+ benchmark-results.json \
+ --base-results=benchmark-base.json \
+ --pr-ref="${GITHUB_HEAD_REF:-$GITHUB_REF_NAME}" \
+ --base-ref="${{ github.event.inputs.comparison_ref || 'main' }}" \
+ --output=benchmark-report.md \
+ --github-summary
+
+ - name: Comment on PR
+ if: github.event_name == 'pull_request'
+ uses: actions/github-script@v7
+ with:
+ script: |
+ const fs = require('fs');
+ const report = fs.readFileSync('benchmark-report.md', 'utf8');
+
+ const { data: comments } = await github.rest.issues.listComments({
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ issue_number: context.issue.number,
+ });
+
+ const botComment = comments.find(comment =>
+ comment.user.type === 'Bot' &&
+ comment.body.includes('## 📊 Benchmark Results')
+ );
+
+ if (botComment) {
+ await github.rest.issues.updateComment({
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ comment_id: botComment.id,
+ body: report
+ });
+ } else {
+ await github.rest.issues.createComment({
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ issue_number: context.issue.number,
+ body: report
+ });
+ }
+
+ - name: Upload benchmark results
+ uses: actions/upload-artifact@v4
+ with:
+ name: benchmark-results
+ path: |
+ benchmark-results.json
+ benchmark-base.json
+ benchmark-report.md
+ if-no-files-found: ignore
diff --git a/.gitignore b/.gitignore
index b638264c..f632b140 100644
--- a/.gitignore
+++ b/.gitignore
@@ -144,3 +144,6 @@ data
# pixi
.pixi
pixi.lock
+
+# benchmarks
+benchmark*.json
diff --git a/.scripts/ci/benchmark_report.py b/.scripts/ci/benchmark_report.py
new file mode 100644
index 00000000..7477dd43
--- /dev/null
+++ b/.scripts/ci/benchmark_report.py
@@ -0,0 +1,245 @@
+#!/usr/bin/env python
+"""Generate benchmark reports from pytest-benchmark JSON output."""
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import sys
+from pathlib import Path
+
+
+def load_benchmarks(path: str | Path) -> dict:
+ """Load benchmarks from JSON file."""
+ path = Path(path)
+ if not path.exists():
+ return {}
+ with open(path) as f:
+ data = json.load(f)
+ return {b["name"]: b["stats"] for b in data.get("benchmarks", [])}
+
+
+def format_time(seconds: float) -> str:
+ """Format time in human-readable units."""
+ if seconds < 0.001:
+ return f"{seconds * 1_000_000:.2f}ยตs"
+ if seconds < 1:
+ return f"{seconds * 1000:.2f}ms"
+ return f"{seconds:.4f}s"
+
+
+def generate_report(
+ pr_path: str | Path,
+ base_path: str | Path | None = None,
+ pr_ref: str = "PR",
+ base_ref: str = "base",
+) -> str:
+ """
+ Generate markdown benchmark report.
+
+ Parameters
+ ----------
+ pr_path
+ Path to PR benchmark JSON file.
+ base_path
+ Path to base benchmark JSON file (optional, for comparison mode).
+ pr_ref
+ Name of PR branch/ref for display.
+ base_ref
+ Name of base branch/ref for display.
+
+ Returns
+ -------
+ Markdown formatted report string.
+ """
+ pr_benchmarks = load_benchmarks(pr_path)
+
+ if not pr_benchmarks:
+ return "โ No benchmark results found!"
+
+ # Comparison mode
+ if base_path:
+ base_benchmarks = load_benchmarks(base_path)
+ return _generate_comparison_report(pr_benchmarks, base_benchmarks, pr_ref, base_ref)
+
+ # Simple report mode (no comparison)
+ return _generate_simple_report(pr_benchmarks, pr_ref)
+
+
+def _generate_simple_report(benchmarks: dict, ref: str) -> str:
+ """Generate a simple benchmark report without comparison."""
+ lines = [
+ "## ๐ Benchmark Results\n",
+ f"Results for `{ref}`\n",
+ "| Benchmark | Mean | Std Dev | Min | Max | Rounds |",
+ "|-----------|------|---------|-----|-----|--------|",
+ ]
+
+ for name, stats in sorted(benchmarks.items()):
+ mean = format_time(stats["mean"])
+ stddev = format_time(stats["stddev"])
+ min_time = format_time(stats["min"])
+ max_time = format_time(stats["max"])
+ rounds = stats["rounds"]
+
+ lines.append(f"| `{name}` | {mean} | ยฑ{stddev} | {min_time} | {max_time} | {rounds} |")
+
+ lines.extend(
+ [
+ "",
+ "",
+ "๐ Raw Statistics
",
+ "",
+ "```",
+ ]
+ )
+
+ for name, stats in sorted(benchmarks.items()):
+ lines.append(f"\n{name}:")
+ lines.append(f" mean: {stats['mean']:.6f}s ยฑ {stats['stddev']:.6f}s")
+ lines.append(f" min: {stats['min']:.6f}s")
+ lines.append(f" max: {stats['max']:.6f}s")
+ lines.append(f" rounds: {stats['rounds']}")
+ if "iterations" in stats:
+ lines.append(f" iterations: {stats['iterations']}")
+
+ lines.extend(
+ [
+ "```",
+ " ",
+ ]
+ )
+
+ return "\n".join(lines)
+
+
+def _generate_comparison_report(
+ pr_benchmarks: dict,
+ base_benchmarks: dict,
+ pr_ref: str,
+ base_ref: str,
+) -> str:
+ """Generate a comparison benchmark report."""
+ lines = [
+ "## ๐ Benchmark Results\n",
+ f"Comparing `{pr_ref}` against `{base_ref}`\n",
+ "| Benchmark | PR (mean) | Base (mean) | Change |",
+ "|-----------|-----------|-------------|--------|",
+ ]
+
+ for name, pr_stats in sorted(pr_benchmarks.items()):
+ pr_mean = pr_stats["mean"]
+ pr_str = format_time(pr_mean)
+
+ if name in base_benchmarks:
+ base_mean = base_benchmarks[name]["mean"]
+ base_str = format_time(base_mean)
+ change = ((pr_mean - base_mean) / base_mean) * 100
+
+ if change > 10:
+ change_str = f"๐ด +{change:.1f}%"
+ elif change < -10:
+ change_str = f"๐ข {change:.1f}%"
+ else:
+ change_str = f"โช {change:+.1f}%"
+ else:
+ base_str = "N/A"
+ change_str = "๐ New"
+
+ lines.append(f"| `{name}` | {pr_str} | {base_str} | {change_str} |")
+
+ # Check for removed benchmarks
+ removed = set(base_benchmarks.keys()) - set(pr_benchmarks.keys())
+ if removed:
+ lines.append("")
+ lines.append("**Removed benchmarks:** " + ", ".join(f"`{n}`" for n in sorted(removed)))
+
+ lines.extend(
+ [
+ "",
+ "",
+ "๐ Detailed Statistics
",
+ "",
+ "```",
+ ]
+ )
+
+ for name, stats in sorted(pr_benchmarks.items()):
+ lines.append(f"\n{name}:")
+ lines.append(f" mean: {stats['mean']:.6f}s ยฑ {stats['stddev']:.6f}s")
+ lines.append(f" min: {stats['min']:.6f}s")
+ lines.append(f" max: {stats['max']:.6f}s")
+ lines.append(f" rounds: {stats['rounds']}")
+
+ lines.extend(
+ [
+ "```",
+ " ",
+ "",
+ "**Legend:** ๐ด >10% slower | ๐ข >10% faster | โช within 10% | ๐ new benchmark",
+ ]
+ )
+
+ return "\n".join(lines)
+
+
+def main() -> int:
+ """CLI entrypoint."""
+ parser = argparse.ArgumentParser(description="Generate benchmark reports")
+ parser.add_argument(
+ "pr_results",
+ help="Path to PR benchmark JSON file",
+ )
+ parser.add_argument(
+ "--base-results",
+ help="Path to base benchmark JSON file (enables comparison mode)",
+ )
+ parser.add_argument(
+ "--pr-ref",
+ default=os.environ.get("GITHUB_HEAD_REF", "PR"),
+ help="PR branch/ref name for display",
+ )
+ parser.add_argument(
+ "--base-ref",
+ default=os.environ.get("GITHUB_BASE_REF", "main"),
+ help="Base branch/ref name for display",
+ )
+ parser.add_argument(
+ "--output",
+ "-o",
+ help="Output file path (default: stdout)",
+ )
+ parser.add_argument(
+ "--github-summary",
+ action="store_true",
+ help="Also write to GITHUB_STEP_SUMMARY if available",
+ )
+
+ args = parser.parse_args()
+
+ report = generate_report(
+ pr_path=args.pr_results,
+ base_path=args.base_results,
+ pr_ref=args.pr_ref,
+ base_ref=args.base_ref,
+ )
+
+ # Output to file or stdout
+ if args.output:
+ Path(args.output).write_text(report)
+ print(f"Report written to {args.output}")
+ else:
+ print(report)
+
+ # Optionally write to GitHub step summary
+ if args.github_summary and "GITHUB_STEP_SUMMARY" in os.environ:
+ with open(os.environ["GITHUB_STEP_SUMMARY"], "a") as f:
+ f.write(report)
+ f.write("\n")
+
+ return 0
+
+
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/docs/notebooks b/docs/notebooks
index 1cbaf62a..98d97da4 160000
--- a/docs/notebooks
+++ b/docs/notebooks
@@ -1 +1 @@
-Subproject commit 1cbaf62a32f65b950552229d210b9884757ce116
+Subproject commit 98d97da49528e4229548a315a88569ebfc618942
diff --git a/pyproject.toml b/pyproject.toml
index bc36d037..de1a6ec3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -103,6 +103,7 @@ optional-dependencies.test = [
"coverage[toml]>=7",
"pytest>=7",
# Just for VS Code
+ "pytest-benchmark>=4",
"pytest-cov>=4",
"pytest-mock>=3.5",
"pytest-timeout>=2.1",
@@ -240,7 +241,13 @@ testpaths = [ "tests/" ]
xfail_strict = true
addopts = [
"--ignore=tests/plotting/test_interactive.py",
+ "--ignore=tests/benchmarks",
"--ignore=docs",
+ "--benchmark-disable",
+]
+markers = [
+ "benchmark: mark test as a benchmark test",
+ "slow: mark test as slow running",
]
[tool.coverage.run]
@@ -306,6 +313,9 @@ lint = "ruff check ."
format = "ruff format ."
pre-commit-install = "pre-commit install"
pre-commit = "pre-commit run"
+benchmark = "pytest tests/benchmarks/ --benchmark-enable --benchmark-only -v --color=yes"
+benchmark-save = "pytest tests/benchmarks/ --benchmark-enable --benchmark-only --benchmark-autosave -v --color=yes"
+benchmark-compare = "pytest tests/benchmarks/ --benchmark-enable --benchmark-only --benchmark-compare -v --color=yes"
[tool.cruft]
skip = [
diff --git a/tests/benchmarks/__init__.py b/tests/benchmarks/__init__.py
new file mode 100644
index 00000000..585436db
--- /dev/null
+++ b/tests/benchmarks/__init__.py
@@ -0,0 +1,11 @@
+"""Squidpy benchmarks package.
+
+This package contains performance benchmarks for squidpy functions.
+Benchmarks are excluded from regular pytest runs and can be executed with:
+
+    pytest tests/benchmarks/ --benchmark-enable --benchmark-only -v
+
+For more options, see pytest-benchmark documentation.
+"""
+
+from __future__ import annotations
diff --git a/tests/benchmarks/conftest.py b/tests/benchmarks/conftest.py
new file mode 100644
index 00000000..e5a5164b
--- /dev/null
+++ b/tests/benchmarks/conftest.py
@@ -0,0 +1,98 @@
+"""Benchmark fixtures and configuration for squidpy performance tests."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import numpy as np
+import pandas as pd
+import pytest
+from anndata import AnnData
+
+from squidpy._constants._pkg_constants import Key
+
+if TYPE_CHECKING:
+ from collections.abc import Callable
+
+
+def pytest_configure(config: pytest.Config) -> None:
+ """Register benchmark markers."""
+ config.addinivalue_line("markers", "benchmark: mark test as a benchmark test")
+
+
+@pytest.fixture(scope="session")
+def make_adata() -> Callable[[int, int], AnnData]:
+ """
+ Factory fixture to create synthetic AnnData objects for benchmarking.
+
+ Returns a function that generates AnnData with specified n_obs and n_clusters.
+ """
+
+ def _make_adata(n_obs: int, n_clusters: int = 10) -> AnnData:
+ """
+ Create a synthetic AnnData object for benchmarking.
+
+ Parameters
+ ----------
+ n_obs
+ Number of observations (cells).
+ n_clusters
+ Number of cluster categories.
+
+ Returns
+ -------
+ AnnData object with spatial coordinates and cluster labels.
+ """
+ rng = np.random.default_rng(42)
+
+ # Create random spatial coordinates
+ spatial_coords = rng.uniform(0, 1000, size=(n_obs, 2))
+
+ # Create random cluster assignments
+ cluster_labels = pd.Categorical(rng.choice([f"cluster_{i}" for i in range(n_clusters)], size=n_obs))
+
+ # Create minimal expression matrix
+ X = rng.random((n_obs, 50))
+
+ # Build AnnData
+ adata = AnnData(X=X)
+ adata.obsm[Key.obsm.spatial] = spatial_coords
+ adata.obs["cluster"] = cluster_labels
+
+ return adata
+
+ return _make_adata
+
+
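+# Example usage in a benchmark test (illustrative; assumes `from squidpy.gr import co_occurrence`):
+#
+#   def test_example(benchmark, make_adata):
+#       adata = make_adata(2_000, n_clusters=8)
+#       benchmark(co_occurrence, adata, cluster_key="cluster", copy=True)
+
+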
+# Pre-defined dataset sizes for parameterized benchmarks
+# Adjust these values to match your benchmarking needs
+BENCHMARK_SIZES = {
+ "1k": 1_000,
+ "5k": 5_000,
+ "10k": 10_000,
+ "50k": 50_000,
+ "100k": 100_000,
+}
+
+
+@pytest.fixture(params=list(BENCHMARK_SIZES.keys()), ids=list(BENCHMARK_SIZES.keys()))
+def adata_scaling(request: pytest.FixtureRequest, make_adata: Callable[[int, int], AnnData]) -> AnnData:
+ """
+ Parameterized fixture that provides AnnData objects of varying sizes.
+
+ The fixture name signals that the dataset size varies across test runs.
+ Sizes are defined in the BENCHMARK_SIZES dict; modify it to change the scale points.
+ """
+ size_name = request.param
+ n_obs = BENCHMARK_SIZES[size_name]
+ return make_adata(n_obs)
+
+
+# Default size for non-scaling benchmarks (uses first size in BENCHMARK_SIZES)
+DEFAULT_BENCHMARK_SIZE = next(iter(BENCHMARK_SIZES.values()))
+
+
+@pytest.fixture
+def adata_default(make_adata: Callable[[int, int], AnnData]) -> AnnData:
+ """Fixed dataset for non-scaling benchmarks. Size defined by DEFAULT_BENCHMARK_SIZE."""
+ return make_adata(DEFAULT_BENCHMARK_SIZE)
diff --git a/tests/benchmarks/test_co_occurrence.py b/tests/benchmarks/test_co_occurrence.py
new file mode 100644
index 00000000..f4265bf8
--- /dev/null
+++ b/tests/benchmarks/test_co_occurrence.py
@@ -0,0 +1,151 @@
+"""Benchmarks for squidpy.gr.co_occurrence function.
+
+Run benchmarks with:
+    pytest tests/benchmarks/ --benchmark-enable --benchmark-only -v
+
+Compare against a saved baseline:
+    pytest tests/benchmarks/ --benchmark-enable --benchmark-only --benchmark-compare
+
+Save benchmark results:
+    pytest tests/benchmarks/ --benchmark-enable --benchmark-only --benchmark-autosave
+
+Multithreading Behavior
+-----------------------
+pytest-benchmark runs each benchmark function sequentially (one at a time).
+Within each benchmark iteration, the function under test (e.g., co_occurrence)
+can use multiple threads/processes as configured by its parameters (n_jobs).
+
+This means:
+- Benchmarks do NOT run in parallel with each other
+- Each benchmark has full access to system resources
+- Functions using numba @njit(parallel=True) will use multiple threads
+- Functions using joblib/loky parallelization will spawn worker processes
+
+To benchmark single-threaded vs multi-threaded performance, create separate
+test cases with different n_jobs values, or use pytest parametrization (see
+the sketch at the end of this module).
+
+Note: pytest-xdist (-n auto) distributes tests across multiple worker
+processes, which skews timing results. For accurate benchmarks, avoid -n when
+running benchmark tests.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import pytest
+
+from squidpy.gr import co_occurrence
+
+if TYPE_CHECKING:
+ from collections.abc import Callable
+
+ from anndata import AnnData
+ from pytest_benchmark.fixture import BenchmarkFixture
+
+
+class TestCoOccurrenceBenchmarks:
+ """Benchmark suite for co_occurrence function scaling with dataset size."""
+
+ @pytest.mark.benchmark(group="co_occurrence_scaling")
+ def test_co_occurrence_scaling(
+ self,
+ benchmark: BenchmarkFixture,
+ adata_scaling: AnnData,
+ ) -> None:
+ """
+ Benchmark co_occurrence across different dataset sizes.
+
+ The adata_scaling fixture is parameterized with sizes defined in
+ conftest.BENCHMARK_SIZES. Modify that dict to change scale points.
+ """
+ benchmark.extra_info["n_obs"] = adata_scaling.n_obs
+ benchmark(co_occurrence, adata_scaling, cluster_key="cluster", copy=True)
+
+
+class TestCoOccurrenceIntervalBenchmarks:
+ """Benchmark suite for co_occurrence with different interval parameters."""
+
+ @pytest.mark.benchmark(group="co_occurrence_intervals")
+ @pytest.mark.parametrize("n_intervals", [10, 25, 50, 100])
+ def test_co_occurrence_intervals(
+ self,
+ benchmark: BenchmarkFixture,
+ adata_default: AnnData,
+ n_intervals: int,
+ ) -> None:
+ """Benchmark co_occurrence with different interval counts."""
+ benchmark.extra_info["n_intervals"] = n_intervals
+ benchmark.extra_info["n_obs"] = adata_default.n_obs
+ benchmark(
+ co_occurrence,
+ adata_default,
+ cluster_key="cluster",
+ interval=n_intervals,
+ copy=True,
+ )
+
+
+class TestCoOccurrenceNumbaCompilation:
+ """Benchmark numba compilation overhead."""
+
+ @pytest.mark.benchmark(group="co_occurrence_warmup")
+ def test_co_occurrence_first_run(
+ self,
+ benchmark: BenchmarkFixture,
+ make_adata: Callable[[int, int], AnnData],
+ ) -> None:
+ """
+ Benchmark first run to capture numba compilation overhead.
+
+ Note: this measures cold-start performance. Numba compilation happens only the
+ first time co_occurrence runs in a process, so run this test in isolation (or
+ first) and treat the max, not the mean, as the compilation cost.
+ """
+
+ # Create fresh adata each time to avoid caching effects
+ def run_co_occurrence() -> None:
+ adata = make_adata(100)
+ co_occurrence(adata, cluster_key="cluster", copy=True)
+
+ benchmark.pedantic(run_co_occurrence, warmup_rounds=0, rounds=3)
+
+
+# Parametrized benchmark for comprehensive scaling analysis
+@pytest.mark.benchmark(group="co_occurrence_comprehensive")
+@pytest.mark.parametrize(
+ "n_obs,n_clusters,n_intervals",
+ [
+ (1_000, 5, 25),
+ (1_000, 10, 50),
+ (5_000, 10, 50),
+ (10_000, 10, 50),
+ (10_000, 20, 50),
+ (50_000, 10, 50),
+ ],
+)
+def test_co_occurrence_comprehensive(
+ benchmark: BenchmarkFixture,
+ make_adata: Callable[[int, int], AnnData],
+ n_obs: int,
+ n_clusters: int,
+ n_intervals: int,
+) -> None:
+ """
+ Comprehensive parametrized benchmark for co_occurrence.
+
+ Tests various combinations of dataset size, cluster count, and intervals
+ to understand scaling behavior across multiple dimensions.
+ """
+ adata = make_adata(n_obs, n_clusters=n_clusters)
+ benchmark.extra_info.update(
+ {
+ "n_obs": n_obs,
+ "n_clusters": n_clusters,
+ "n_intervals": n_intervals,
+ }
+ )
+ benchmark(
+ co_occurrence,
+ adata,
+ cluster_key="cluster",
+ interval=n_intervals,
+ copy=True,
+ )
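+
+
+# Sketch of the single- vs multi-process comparison suggested in the module docstring.
+# Illustrative only: the sizes and n_jobs values are placeholders to adjust for your
+# hardware, and it assumes co_occurrence's `n_jobs` argument controls parallelism.
+@pytest.mark.benchmark(group="co_occurrence_n_jobs")
+@pytest.mark.parametrize("n_jobs", [1, 2, 4])
+def test_co_occurrence_n_jobs(
+    benchmark: BenchmarkFixture,
+    make_adata: Callable[[int, int], AnnData],
+    n_jobs: int,
+) -> None:
+    """Benchmark co_occurrence with different worker counts (illustrative sketch)."""
+    adata = make_adata(10_000)
+    benchmark.extra_info["n_obs"] = adata.n_obs
+    benchmark.extra_info["n_jobs"] = n_jobs
+    benchmark(
+        co_occurrence,
+        adata,
+        cluster_key="cluster",
+        n_jobs=n_jobs,
+        copy=True,
+    )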