diff --git a/.github/workflows/benchmark.yaml b/.github/workflows/benchmark.yaml
new file mode 100644
index 00000000..33622d26
--- /dev/null
+++ b/.github/workflows/benchmark.yaml
@@ -0,0 +1,169 @@
+name: Benchmarks
+
+on:
+  pull_request:
+    types: [labeled, synchronize]
+  workflow_dispatch:
+    inputs:
+      mode:
+        description: 'Benchmark mode'
+        required: true
+        type: choice
+        options:
+          - run
+          - compare
+        default: 'run'
+      comparison_ref:
+        description: 'Git ref to compare against (only used in compare mode)'
+        required: false
+        default: 'main'
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+env:
+  FORCE_COLOR: "1"
+  MPLBACKEND: agg
+  UV_COMPILE_BYTECODE: "1"
+
+defaults:
+  run:
+    # to fail on error in multiline statements (-e), in pipes (-o pipefail), and on unset variables (-u).
+    shell: bash -euo pipefail {0}
+
+jobs:
+  benchmark:
+    # Run on 'pytest-benchmark' label or manual dispatch
+    if: |
+      github.event_name == 'workflow_dispatch' ||
+      (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'pytest-benchmark'))
+    name: Run Benchmarks
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write
+      pull-requests: write
+
+    steps:
+      - uses: actions/checkout@v5
+        with:
+          filter: blob:none
+          fetch-depth: 0
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.13"
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v7
+        with:
+          enable-cache: true
+          cache-dependency-glob: pyproject.toml
+
+      - name: Install dependencies
+        run: uv pip install -e ".[test]" --system
+
+      - name: Run benchmarks
+        run: |
+          pytest tests/benchmarks/ \
+            --benchmark-enable \
+            --benchmark-only \
+            --benchmark-json=benchmark-results.json \
+            --benchmark-warmup=on \
+            --benchmark-disable-gc \
+            --benchmark-min-rounds=5 \
+            -v --color=yes
+
+      - name: Generate report (simple mode)
+        if: github.event.inputs.mode != 'compare'
+        run: |
+          python .scripts/ci/benchmark_report.py \
+            benchmark-results.json \
+            --pr-ref="${GITHUB_HEAD_REF:-$GITHUB_REF_NAME}" \
+            --output=benchmark-report.md \
+            --github-summary
+
+      - name: Run comparison benchmarks
+        if: github.event.inputs.mode == 'compare'
+        run: |
+          # Save current results
+          mv benchmark-results.json benchmark-pr.json
+
+          # Checkout base
+          git checkout ${{ github.event.inputs.comparison_ref || 'main' }}
+
+          # Reinstall in case dependencies changed
+          uv pip install -e ".[test]" --system
+
+          # Run benchmarks on base
+          pytest tests/benchmarks/ \
+            --benchmark-enable \
+            --benchmark-only \
+            --benchmark-json=benchmark-base.json \
+            --benchmark-warmup=on \
+            --benchmark-disable-gc \
+            --benchmark-min-rounds=5 \
+            -v --color=yes || true
+
+          # Checkout back
+          git checkout ${{ github.sha }}
+
+          # Rename for report
+          mv benchmark-pr.json benchmark-results.json
+
+      - name: Generate comparison report
+        if: github.event.inputs.mode == 'compare'
+        run: |
+          python .scripts/ci/benchmark_report.py \
+            benchmark-results.json \
+            --base-results=benchmark-base.json \
+            --pr-ref="${GITHUB_HEAD_REF:-$GITHUB_REF_NAME}" \
+            --base-ref="${{ github.event.inputs.comparison_ref || 'main' }}" \
+            --output=benchmark-report.md \
+            --github-summary
+
+      - name: Comment on PR
+        if: github.event_name == 'pull_request'
+        uses: actions/github-script@v7
+        with:
+          script: |
+            const fs = require('fs');
+            const report = fs.readFileSync('benchmark-report.md', 'utf8');
+
+            const { data: comments } = await github.rest.issues.listComments({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              issue_number: context.issue.number,
+            });
+
+            const botComment = comments.find(comment =>
+              comment.user.type === 'Bot' &&
+              comment.body.includes('## 📊 Benchmark Results')
+            );
+
+            if (botComment) {
+              await github.rest.issues.updateComment({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                comment_id: botComment.id,
+                body: report
+              });
+            } else {
+              await github.rest.issues.createComment({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                issue_number: context.issue.number,
+                body: report
+              });
+            }
+
+      - name: Upload benchmark results
+        uses: actions/upload-artifact@v4
+        with:
+          name: benchmark-results
+          path: |
+            benchmark-results.json
+            benchmark-base.json
+            benchmark-report.md
+          if-no-files-found: ignore
diff --git a/.gitignore b/.gitignore
index b638264c..f632b140 100644
--- a/.gitignore
+++ b/.gitignore
@@ -144,3 +144,6 @@ data
 # pixi
 .pixi
 pixi.lock
+
+# benchmarks
+benchmark*.json
diff --git a/.scripts/ci/benchmark_report.py b/.scripts/ci/benchmark_report.py
new file mode 100644
index 00000000..7477dd43
--- /dev/null
+++ b/.scripts/ci/benchmark_report.py
@@ -0,0 +1,245 @@
+#!/usr/bin/env python
+"""Generate benchmark reports from pytest-benchmark JSON output."""
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import sys
+from pathlib import Path
+
+
+def load_benchmarks(path: str | Path) -> dict:
+    """Load benchmarks from JSON file."""
+    path = Path(path)
+    if not path.exists():
+        return {}
+    with open(path) as f:
+        data = json.load(f)
+    return {b["name"]: b["stats"] for b in data.get("benchmarks", [])}
+
+
+def format_time(seconds: float) -> str:
+    """Format time in human-readable units."""
+    if seconds < 0.001:
+        return f"{seconds * 1_000_000:.2f}µs"
+    if seconds < 1:
+        return f"{seconds * 1000:.2f}ms"
+    return f"{seconds:.4f}s"
+
+
+def generate_report(
+    pr_path: str | Path,
+    base_path: str | Path | None = None,
+    pr_ref: str = "PR",
+    base_ref: str = "base",
+) -> str:
+    """
+    Generate markdown benchmark report.
+
+    Parameters
+    ----------
+    pr_path
+        Path to PR benchmark JSON file.
+    base_path
+        Path to base benchmark JSON file (optional, for comparison mode).
+    pr_ref
+        Name of PR branch/ref for display.
+    base_ref
+        Name of base branch/ref for display.
+
+    Returns
+    -------
+    Markdown formatted report string.
+    """
+    pr_benchmarks = load_benchmarks(pr_path)
+
+    if not pr_benchmarks:
+        return "❌ No benchmark results found!"
+
+    # Comparison mode
+    if base_path:
+        base_benchmarks = load_benchmarks(base_path)
+        return _generate_comparison_report(pr_benchmarks, base_benchmarks, pr_ref, base_ref)
+
+    # Simple report mode (no comparison)
+    return _generate_simple_report(pr_benchmarks, pr_ref)
+
+
+def _generate_simple_report(benchmarks: dict, ref: str) -> str:
+    """Generate a simple benchmark report without comparison."""
+    lines = [
+        "## 📊 Benchmark Results\n",
+        f"Results for `{ref}`\n",
+        "| Benchmark | Mean | Std Dev | Min | Max | Rounds |",
+        "|-----------|------|---------|-----|-----|--------|",
+    ]
+
+    for name, stats in sorted(benchmarks.items()):
+        mean = format_time(stats["mean"])
+        stddev = format_time(stats["stddev"])
+        min_time = format_time(stats["min"])
+        max_time = format_time(stats["max"])
+        rounds = stats["rounds"]
+
+        lines.append(f"| `{name}` | {mean} | ±{stddev} | {min_time} | {max_time} | {rounds} |")
+
+    lines.extend(
+        [
+            "",
+            "<details>",
+            "<summary>📈 Raw Statistics</summary>",
+            "",
+            "```",
+        ]
+    )
+
+    for name, stats in sorted(benchmarks.items()):
+        lines.append(f"\n{name}:")
+        lines.append(f"  mean: {stats['mean']:.6f}s ± {stats['stddev']:.6f}s")
+        lines.append(f"  min: {stats['min']:.6f}s")
+        lines.append(f"  max: {stats['max']:.6f}s")
+        lines.append(f"  rounds: {stats['rounds']}")
+        if "iterations" in stats:
+            lines.append(f"  iterations: {stats['iterations']}")
+
+    lines.extend(
+        [
+            "```",
+            "</details>",
+        ]
+    )
+
+    return "\n".join(lines)
+
+
+def _generate_comparison_report(
+    pr_benchmarks: dict,
+    base_benchmarks: dict,
+    pr_ref: str,
+    base_ref: str,
+) -> str:
+    """Generate a comparison benchmark report."""
+    lines = [
+        "## 📊 Benchmark Results\n",
+        f"Comparing `{pr_ref}` against `{base_ref}`\n",
+        "| Benchmark | PR (mean) | Base (mean) | Change |",
+        "|-----------|-----------|-------------|--------|",
+    ]
+
+    for name, pr_stats in sorted(pr_benchmarks.items()):
+        pr_mean = pr_stats["mean"]
+        pr_str = format_time(pr_mean)
+
+        if name in base_benchmarks:
+            base_mean = base_benchmarks[name]["mean"]
+            base_str = format_time(base_mean)
+            change = ((pr_mean - base_mean) / base_mean) * 100
+
+            if change > 10:
+                change_str = f"🔴 +{change:.1f}%"
+            elif change < -10:
+                change_str = f"🟢 {change:.1f}%"
+            else:
+                change_str = f"⚪ {change:+.1f}%"
+        else:
+            base_str = "N/A"
+            change_str = "🆕 New"
+
+        lines.append(f"| `{name}` | {pr_str} | {base_str} | {change_str} |")
+
+    # Check for removed benchmarks
+    removed = set(base_benchmarks.keys()) - set(pr_benchmarks.keys())
+    if removed:
+        lines.append("")
+        lines.append("**Removed benchmarks:** " + ", ".join(f"`{n}`" for n in sorted(removed)))
+
+    lines.extend(
+        [
+            "",
+            "<details>",
+            "<summary>📈 Detailed Statistics</summary>",
+            "",
+            "```",
+        ]
+    )
+
+    for name, stats in sorted(pr_benchmarks.items()):
+        lines.append(f"\n{name}:")
+        lines.append(f"  mean: {stats['mean']:.6f}s ± {stats['stddev']:.6f}s")
+        lines.append(f"  min: {stats['min']:.6f}s")
+        lines.append(f"  max: {stats['max']:.6f}s")
+        lines.append(f"  rounds: {stats['rounds']}")
+
+    lines.extend(
+        [
+            "```",
+            "</details>",
+            "",
+            "**Legend:** 🔴 >10% slower | 🟢 >10% faster | ⚪ within 10% | 🆕 new benchmark",
+        ]
+    )
+
+    return "\n".join(lines)
+
+
+def main() -> int:
+    """CLI entrypoint."""
+    parser = argparse.ArgumentParser(description="Generate benchmark reports")
+    parser.add_argument(
+        "pr_results",
+        help="Path to PR benchmark JSON file",
+    )
+    parser.add_argument(
+        "--base-results",
+        help="Path to base benchmark JSON file (enables comparison mode)",
+    )
+    parser.add_argument(
+        "--pr-ref",
+        default=os.environ.get("GITHUB_HEAD_REF", "PR"),
+        help="PR branch/ref name for display",
+    )
+    parser.add_argument(
+        "--base-ref",
+        default=os.environ.get("GITHUB_BASE_REF", "main"),
+        help="Base branch/ref name for display",
+    )
+    parser.add_argument(
+        "--output",
+        "-o",
+        help="Output file path (default: stdout)",
+    )
+    parser.add_argument(
+        "--github-summary",
+        action="store_true",
+        help="Also write to GITHUB_STEP_SUMMARY if available",
+    )
+
+    args = parser.parse_args()
+
+    report = generate_report(
+        pr_path=args.pr_results,
+        base_path=args.base_results,
+        pr_ref=args.pr_ref,
+        base_ref=args.base_ref,
+    )
+
+    # Output to file or stdout
+    if args.output:
+        Path(args.output).write_text(report)
+        print(f"Report written to {args.output}")
+    else:
+        print(report)
+
+    # Optionally write to GitHub step summary
+    if args.github_summary and "GITHUB_STEP_SUMMARY" in os.environ:
+        with open(os.environ["GITHUB_STEP_SUMMARY"], "a") as f:
+            f.write(report)
+            f.write("\n")
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/docs/notebooks b/docs/notebooks
index 1cbaf62a..98d97da4 160000
--- a/docs/notebooks
+++ b/docs/notebooks
@@ -1 +1 @@
-Subproject commit 1cbaf62a32f65b950552229d210b9884757ce116
+Subproject commit 98d97da49528e4229548a315a88569ebfc618942
diff --git a/pyproject.toml b/pyproject.toml
index bc36d037..de1a6ec3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -103,6 +103,7 @@ optional-dependencies.test = [
   "coverage[toml]>=7",
   "pytest>=7", # Just for VS Code
+  "pytest-benchmark>=4",
   "pytest-cov>=4",
   "pytest-mock>=3.5",
   "pytest-timeout>=2.1",
@@ -240,7 +241,13 @@ testpaths = [ "tests/" ]
 xfail_strict = true
 addopts = [
   "--ignore=tests/plotting/test_interactive.py",
+  "--ignore=tests/benchmarks",
   "--ignore=docs",
+  "--benchmark-disable",
+]
+markers = [
+  "benchmark: mark test as a benchmark test",
+  "slow: mark test as slow running",
 ]
 
 [tool.coverage.run]
@@ -306,6 +313,9 @@
 lint = "ruff check ."
 format = "ruff format ."
 pre-commit-install = "pre-commit install"
 pre-commit = "pre-commit run"
+benchmark = "pytest tests/benchmarks/ --benchmark-enable --benchmark-only -v --color=yes"
+benchmark-save = "pytest tests/benchmarks/ --benchmark-enable --benchmark-only --benchmark-autosave -v --color=yes"
+benchmark-compare = "pytest tests/benchmarks/ --benchmark-enable --benchmark-only --benchmark-compare -v --color=yes"
 
 [tool.cruft]
 skip = [
diff --git a/tests/benchmarks/__init__.py b/tests/benchmarks/__init__.py
new file mode 100644
index 00000000..585436db
--- /dev/null
+++ b/tests/benchmarks/__init__.py
@@ -0,0 +1,11 @@
+"""Squidpy benchmarks package.
+
+This package contains performance benchmarks for squidpy functions.
+Benchmarks are excluded from regular pytest runs and can be executed with:
+
+    pytest benchmarks/ --benchmark-only -v
+
+For more options, see pytest-benchmark documentation.
+"""
+
+from __future__ import annotations
diff --git a/tests/benchmarks/conftest.py b/tests/benchmarks/conftest.py
new file mode 100644
index 00000000..e5a5164b
--- /dev/null
+++ b/tests/benchmarks/conftest.py
@@ -0,0 +1,98 @@
+"""Benchmark fixtures and configuration for squidpy performance tests."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import numpy as np
+import pandas as pd
+import pytest
+from anndata import AnnData
+
+from squidpy._constants._pkg_constants import Key
+
+if TYPE_CHECKING:
+    from collections.abc import Callable
+
+
+def pytest_configure(config: pytest.Config) -> None:
+    """Register benchmark markers."""
+    config.addinivalue_line("markers", "benchmark: mark test as a benchmark test")
+
+
+@pytest.fixture(scope="session")
+def make_adata() -> Callable[[int, int], AnnData]:
+    """
+    Factory fixture to create synthetic AnnData objects for benchmarking.
+
+    Returns a function that generates AnnData with specified n_obs and n_clusters.
+    """
+
+    def _make_adata(n_obs: int, n_clusters: int = 10) -> AnnData:
+        """
+        Create a synthetic AnnData object for benchmarking.
+
+        Parameters
+        ----------
+        n_obs
+            Number of observations (cells).
+        n_clusters
+            Number of cluster categories.
+
+        Returns
+        -------
+        AnnData object with spatial coordinates and cluster labels.
+        """
+        rng = np.random.default_rng(42)
+
+        # Create random spatial coordinates
+        spatial_coords = rng.uniform(0, 1000, size=(n_obs, 2))
+
+        # Create random cluster assignments
+        cluster_labels = pd.Categorical(rng.choice([f"cluster_{i}" for i in range(n_clusters)], size=n_obs))
+
+        # Create minimal expression matrix
+        X = rng.random((n_obs, 50))
+
+        # Build AnnData
+        adata = AnnData(X=X)
+        adata.obsm[Key.obsm.spatial] = spatial_coords
+        adata.obs["cluster"] = cluster_labels
+
+        return adata
+
+    return _make_adata
+
+
+# Pre-defined dataset sizes for parameterized benchmarks
+# Adjust these values to match your benchmarking needs
+BENCHMARK_SIZES = {
+    "1k": 1_000,
+    "5k": 5_000,
+    "10k": 10_000,
+    "50k": 50_000,
+    "100k": 100_000,
+}
+
+
+@pytest.fixture(params=list(BENCHMARK_SIZES.keys()), ids=list(BENCHMARK_SIZES.keys()))
+def adata_scaling(request: pytest.FixtureRequest, make_adata: Callable[[int, int], AnnData]) -> AnnData:
+    """
+    Parameterized fixture that provides AnnData objects of varying sizes.
+
+    The fixture name makes it clear that the dataset size varies across test runs.
+    Sizes are defined in BENCHMARK_SIZES dict - modify that to change scale points.
+    """
+    size_name = request.param
+    n_obs = BENCHMARK_SIZES[size_name]
+    return make_adata(n_obs)
+
+
+# Default size for non-scaling benchmarks (uses first size in BENCHMARK_SIZES)
+DEFAULT_BENCHMARK_SIZE = next(iter(BENCHMARK_SIZES.values()))
+
+
+@pytest.fixture
+def adata_default(make_adata: Callable[[int, int], AnnData]) -> AnnData:
+    """Fixed dataset for non-scaling benchmarks. Size defined by DEFAULT_BENCHMARK_SIZE."""
+    return make_adata(DEFAULT_BENCHMARK_SIZE)
diff --git a/tests/benchmarks/test_co_occurrence.py b/tests/benchmarks/test_co_occurrence.py
new file mode 100644
index 00000000..f4265bf8
--- /dev/null
+++ b/tests/benchmarks/test_co_occurrence.py
@@ -0,0 +1,151 @@
+"""Benchmarks for squidpy.gr.co_occurrence function.
+
+Run benchmarks with:
+    pytest benchmarks/ --benchmark-only -v
+
+Compare against baseline:
+    pytest benchmarks/ --benchmark-only --benchmark-compare
+
+Save benchmark results:
+    pytest benchmarks/ --benchmark-only --benchmark-autosave
+
+Multithreading Behavior
+-----------------------
+pytest-benchmark runs each benchmark function sequentially (one at a time).
+Within each benchmark iteration, the function under test (e.g., co_occurrence)
+can use multiple threads/processes as configured by its parameters (n_jobs).
+
+This means:
+- Benchmarks do NOT run in parallel with each other
+- Each benchmark has full access to system resources
+- Functions using numba @njit(parallel=True) will use multiple threads
+- Functions using joblib/loky parallelization will spawn worker processes
+
+To benchmark single-threaded vs multi-threaded performance, create separate
+test cases with different n_jobs values, or use pytest parametrization.
+
+Note: pytest-xdist (-n auto) parallelizes test COLLECTION, not benchmark
+execution. For accurate benchmarks, avoid -n with benchmark tests.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import pytest
+
+from squidpy.gr import co_occurrence
+
+if TYPE_CHECKING:
+    from collections.abc import Callable
+
+    from anndata import AnnData
+
+
+class TestCoOccurrenceBenchmarks:
+    """Benchmark suite for co_occurrence function scaling with dataset size."""
+
+    @pytest.mark.benchmark(group="co_occurrence_scaling")
+    def test_co_occurrence_scaling(
+        self,
+        benchmark: pytest.benchmark.fixture.BenchmarkFixture,
+        adata_scaling: AnnData,
+    ) -> None:
+        """
+        Benchmark co_occurrence across different dataset sizes.
+
+        The adata_scaling fixture is parameterized with sizes defined in
+        conftest.BENCHMARK_SIZES. Modify that dict to change scale points.
+        """
+        benchmark.extra_info["n_obs"] = adata_scaling.n_obs
+        benchmark(co_occurrence, adata_scaling, cluster_key="cluster", copy=True)
+
+
+class TestCoOccurrenceIntervalBenchmarks:
+    """Benchmark suite for co_occurrence with different interval parameters."""
+
+    @pytest.mark.benchmark(group="co_occurrence_intervals")
+    @pytest.mark.parametrize("n_intervals", [10, 25, 50, 100])
+    def test_co_occurrence_intervals(
+        self,
+        benchmark: pytest.benchmark.fixture.BenchmarkFixture,
+        adata_default: AnnData,
+        n_intervals: int,
+    ) -> None:
+        """Benchmark co_occurrence with different interval counts."""
+        benchmark.extra_info["n_intervals"] = n_intervals
+        benchmark.extra_info["n_obs"] = adata_default.n_obs
+        benchmark(
+            co_occurrence,
+            adata_default,
+            cluster_key="cluster",
+            interval=n_intervals,
+            copy=True,
+        )
+
+
+class TestCoOccurrenceNumbaCompilation:
+    """Benchmark numba compilation overhead."""
+
+    @pytest.mark.benchmark(group="co_occurrence_warmup")
+    def test_co_occurrence_first_run(
+        self,
+        benchmark: pytest.benchmark.fixture.BenchmarkFixture,
+        make_adata: Callable[[int, int], AnnData],
+    ) -> None:
+        """
+        Benchmark first run to capture numba compilation overhead.
+
+        Note: This test measures cold-start performance including JIT compilation.
+        Run with --benchmark-warmup=off to capture compilation time.
+        """
+
+        # Create fresh adata each time to avoid caching effects
+        def run_co_occurrence() -> None:
+            adata = make_adata(100)
+            co_occurrence(adata, cluster_key="cluster", copy=True)
+
+        benchmark.pedantic(run_co_occurrence, warmup_rounds=0, rounds=3)
+
+
+# Parametrized benchmark for comprehensive scaling analysis
+@pytest.mark.benchmark(group="co_occurrence_comprehensive")
+@pytest.mark.parametrize(
+    "n_obs,n_clusters,n_intervals",
+    [
+        (1_000, 5, 25),
+        (1_000, 10, 50),
+        (5_000, 10, 50),
+        (10_000, 10, 50),
+        (10_000, 20, 50),
+        (50_000, 10, 50),
+    ],
+)
+def test_co_occurrence_comprehensive(
+    benchmark: pytest.benchmark.fixture.BenchmarkFixture,
+    make_adata: Callable[[int, int], AnnData],
+    n_obs: int,
+    n_clusters: int,
+    n_intervals: int,
+) -> None:
+    """
+    Comprehensive parametrized benchmark for co_occurrence.
+
+    Tests various combinations of dataset size, cluster count, and intervals
+    to understand scaling behavior across multiple dimensions.
+    """
+    adata = make_adata(n_obs, n_clusters=n_clusters)
+    benchmark.extra_info.update(
+        {
+            "n_obs": n_obs,
+            "n_clusters": n_clusters,
+            "n_intervals": n_intervals,
+        }
+    )
+    benchmark(
+        co_occurrence,
+        adata,
+        cluster_key="cluster",
+        interval=n_intervals,
+        copy=True,
+    )