diff --git a/.github/workflows/benchmark.yaml b/.github/workflows/benchmark.yaml
new file mode 100644
index 00000000..33622d26
--- /dev/null
+++ b/.github/workflows/benchmark.yaml
@@ -0,0 +1,169 @@
+name: Benchmarks
+
+on:
+  pull_request:
+    types: [labeled, synchronize]
+  workflow_dispatch:
+    inputs:
+      mode:
+        description: 'Benchmark mode'
+        required: true
+        type: choice
+        options:
+          - run
+          - compare
+        default: 'run'
+      comparison_ref:
+        description: 'Git ref to compare against (only used in compare mode)'
+        required: false
+        default: 'main'
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+env:
+  FORCE_COLOR: "1"
+  MPLBACKEND: agg
+  UV_COMPILE_BYTECODE: "1"
+
+defaults:
+  run:
+    # to fail on error in multiline statements (-e), in pipes (-o pipefail), and on unset variables (-u).
+    shell: bash -euo pipefail {0}
+
+jobs:
+  benchmark:
+    # Run on 'pytest-benchmark' label or manual dispatch
+    if: |
+      github.event_name == 'workflow_dispatch' ||
+      (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'pytest-benchmark'))
+    name: Run Benchmarks
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write
+      pull-requests: write
+
+    steps:
+      - uses: actions/checkout@v5
+        with:
+          filter: blob:none
+          fetch-depth: 0
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.13"
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v7
+        with:
+          enable-cache: true
+          cache-dependency-glob: pyproject.toml
+
+      - name: Install dependencies
+        run: uv pip install -e ".[test]" --system
+
+      - name: Run benchmarks
+        run: |
+          pytest tests/benchmarks/ \
+            --benchmark-enable \
+            --benchmark-only \
+            --benchmark-json=benchmark-results.json \
+            --benchmark-warmup=on \
+            --benchmark-disable-gc \
+            --benchmark-min-rounds=5 \
+            -v --color=yes
+
+      - name: Generate report (simple mode)
+        if: github.event.inputs.mode != 'compare'
+        run: |
+          python .scripts/ci/benchmark_report.py \
+            benchmark-results.json \
+            --pr-ref="${GITHUB_HEAD_REF:-$GITHUB_REF_NAME}" \
+            --output=benchmark-report.md \
+            --github-summary
+
+      - name: Run comparison benchmarks
+        if: github.event.inputs.mode == 'compare'
+        run: |
+          # Save current results
+          mv benchmark-results.json benchmark-pr.json
+
+          # Checkout base
+          git checkout ${{ github.event.inputs.comparison_ref || 'main' }}
+
+          # Reinstall in case dependencies changed
+          uv pip install -e ".[test]" --system
+
+          # Run benchmarks on base
+          pytest tests/benchmarks/ \
+            --benchmark-enable \
+            --benchmark-only \
+            --benchmark-json=benchmark-base.json \
+            --benchmark-warmup=on \
+            --benchmark-disable-gc \
+            --benchmark-min-rounds=5 \
+            -v --color=yes || true
+
+          # Checkout back
+          git checkout ${{ github.sha }}
+
+          # Rename for report
+          mv benchmark-pr.json benchmark-results.json
+
+      - name: Generate comparison report
+        if: github.event.inputs.mode == 'compare'
+        run: |
+          python .scripts/ci/benchmark_report.py \
+            benchmark-results.json \
+            --base-results=benchmark-base.json \
+            --pr-ref="${GITHUB_HEAD_REF:-$GITHUB_REF_NAME}" \
+            --base-ref="${{ github.event.inputs.comparison_ref || 'main' }}" \
+            --output=benchmark-report.md \
+            --github-summary
+
+      - name: Comment on PR
+        if: github.event_name == 'pull_request'
+        uses: actions/github-script@v7
+        with:
+          script: |
+            const fs = require('fs');
+            const report = fs.readFileSync('benchmark-report.md', 'utf8');
+
+            const { data: comments } = await github.rest.issues.listComments({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              issue_number: context.issue.number,
+            });
+
+            const botComment = comments.find(comment =>
+              comment.user.type === 'Bot' &&
+              comment.body.includes('## 📊 Benchmark Results')
+            );
+
+            if (botComment) {
+              await github.rest.issues.updateComment({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                comment_id: botComment.id,
+                body: report
+              });
+            } else {
+              await github.rest.issues.createComment({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                issue_number: context.issue.number,
+                body: report
+              });
+            }
+
+      - name: Upload benchmark results
+        uses: actions/upload-artifact@v4
+        with:
+          name: benchmark-results
+          path: |
+            benchmark-results.json
+            benchmark-base.json
+            benchmark-report.md
+          if-no-files-found: ignore
diff --git a/.gitignore b/.gitignore
index b638264c..f632b140 100644
--- a/.gitignore
+++ b/.gitignore
@@ -144,3 +144,6 @@ data
 # pixi
 .pixi
 pixi.lock
+
+# benchmarks
+benchmark*.json
diff --git a/.scripts/ci/benchmark_report.py b/.scripts/ci/benchmark_report.py
new file mode 100644
index 00000000..7477dd43
--- /dev/null
+++ b/.scripts/ci/benchmark_report.py
@@ -0,0 +1,245 @@
+#!/usr/bin/env python
+"""Generate benchmark reports from pytest-benchmark JSON output."""
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import sys
+from pathlib import Path
+
+
+def load_benchmarks(path: str | Path) -> dict:
+    """Load benchmarks from JSON file."""
+    path = Path(path)
+    if not path.exists():
+        return {}
+    with open(path) as f:
+        data = json.load(f)
+    return {b["name"]: b["stats"] for b in data.get("benchmarks", [])}
+
+
+def format_time(seconds: float) -> str:
+    """Format time in human-readable units."""
+    if seconds < 0.001:
+        return f"{seconds * 1_000_000:.2f}µs"
+    if seconds < 1:
+        return f"{seconds * 1000:.2f}ms"
+    return f"{seconds:.4f}s"
+
+
+def generate_report(
+    pr_path: str | Path,
+    base_path: str | Path | None = None,
+    pr_ref: str = "PR",
+    base_ref: str = "base",
+) -> str:
+    """
+    Generate markdown benchmark report.
+
+    Parameters
+    ----------
+    pr_path
+        Path to PR benchmark JSON file.
+    base_path
+        Path to base benchmark JSON file (optional, for comparison mode).
+    pr_ref
+        Name of PR branch/ref for display.
+    base_ref
+        Name of base branch/ref for display.
+
+    Returns
+    -------
+    Markdown formatted report string.
+    """
+    pr_benchmarks = load_benchmarks(pr_path)
+
+    if not pr_benchmarks:
+        return "❌ No benchmark results found!"
+
+    # Comparison mode
+    if base_path:
+        base_benchmarks = load_benchmarks(base_path)
+        return _generate_comparison_report(pr_benchmarks, base_benchmarks, pr_ref, base_ref)
+
+    # Simple report mode (no comparison)
+    return _generate_simple_report(pr_benchmarks, pr_ref)
+
+
+def _generate_simple_report(benchmarks: dict, ref: str) -> str:
+    """Generate a simple benchmark report without comparison."""
+    lines = [
+        "## 📊 Benchmark Results\n",
+        f"Results for `{ref}`\n",
+        "| Benchmark | Mean | Std Dev | Min | Max | Rounds |",
+        "|-----------|------|---------|-----|-----|--------|",
+    ]
+
+    for name, stats in sorted(benchmarks.items()):
+        mean = format_time(stats["mean"])
+        stddev = format_time(stats["stddev"])
+        min_time = format_time(stats["min"])
+        max_time = format_time(stats["max"])
+        rounds = stats["rounds"]
+
+        lines.append(f"| `{name}` | {mean} | ±{stddev} | {min_time} | {max_time} | {rounds} |")
+
+    lines.extend(
+        [
+            "",
+            "<details>",
+            "<summary>📈 Raw Statistics</summary>",
+            "",
+            "```",
+        ]
+    )
+
+    for name, stats in sorted(benchmarks.items()):
+        lines.append(f"\n{name}:")
+        lines.append(f"  mean: {stats['mean']:.6f}s ± {stats['stddev']:.6f}s")
+        lines.append(f"  min: {stats['min']:.6f}s")
+        lines.append(f"  max: {stats['max']:.6f}s")
+        lines.append(f"  rounds: {stats['rounds']}")
+        if "iterations" in stats:
+            lines.append(f"  iterations: {stats['iterations']}")
+
+    lines.extend(
+        [
+            "```",
+            "</details>",
+        ]
+    )
+
+    return "\n".join(lines)
+
+
+def _generate_comparison_report(
+    pr_benchmarks: dict,
+    base_benchmarks: dict,
+    pr_ref: str,
+    base_ref: str,
+) -> str:
+    """Generate a comparison benchmark report."""
+    lines = [
+        "## 📊 Benchmark Results\n",
+        f"Comparing `{pr_ref}` against `{base_ref}`\n",
+        "| Benchmark | PR (mean) | Base (mean) | Change |",
+        "|-----------|-----------|-------------|--------|",
+    ]
+
+    for name, pr_stats in sorted(pr_benchmarks.items()):
+        pr_mean = pr_stats["mean"]
+        pr_str = format_time(pr_mean)
+
+        if name in base_benchmarks:
+            base_mean = base_benchmarks[name]["mean"]
+            base_str = format_time(base_mean)
+            change = ((pr_mean - base_mean) / base_mean) * 100
+
+            if change > 10:
+                change_str = f"🔴 +{change:.1f}%"
+            elif change < -10:
+                change_str = f"🟢 {change:.1f}%"
+            else:
+                change_str = f"⚪ {change:+.1f}%"
+        else:
+            base_str = "N/A"
+            change_str = "🆕 New"
+
+        lines.append(f"| `{name}` | {pr_str} | {base_str} | {change_str} |")
+
+    # Check for removed benchmarks
+    removed = set(base_benchmarks.keys()) - set(pr_benchmarks.keys())
+    if removed:
+        lines.append("")
+        lines.append("**Removed benchmarks:** " + ", ".join(f"`{n}`" for n in sorted(removed)))
+
+    lines.extend(
+        [
+            "",
+            "<details>",
+            "<summary>📈 Detailed Statistics</summary>",
+            "",
+            "```",
+        ]
+    )
+
+    for name, stats in sorted(pr_benchmarks.items()):
+        lines.append(f"\n{name}:")
+        lines.append(f"  mean: {stats['mean']:.6f}s ± {stats['stddev']:.6f}s")
+        lines.append(f"  min: {stats['min']:.6f}s")
+        lines.append(f"  max: {stats['max']:.6f}s")
+        lines.append(f"  rounds: {stats['rounds']}")
+
+    lines.extend(
+        [
+            "```",
+            "</details>",
+            "",
+            "**Legend:** 🔴 >10% slower | 🟢 >10% faster | ⚪ within 10% | 🆕 new benchmark",
+        ]
+    )
+
+    return "\n".join(lines)
+
+
+def main() -> int:
+    """CLI entrypoint."""
+    parser = argparse.ArgumentParser(description="Generate benchmark reports")
+    parser.add_argument(
+        "pr_results",
+        help="Path to PR benchmark JSON file",
+    )
+    parser.add_argument(
+        "--base-results",
+        help="Path to base benchmark JSON file (enables comparison mode)",
+    )
+    parser.add_argument(
+        "--pr-ref",
+        default=os.environ.get("GITHUB_HEAD_REF", "PR"),
+        help="PR branch/ref name for display",
+    )
+    parser.add_argument(
+        "--base-ref",
+        default=os.environ.get("GITHUB_BASE_REF", "main"),
+        help="Base branch/ref name for display",
+    )
+    parser.add_argument(
+        "--output",
+        "-o",
+        help="Output file path (default: stdout)",
+    )
+    parser.add_argument(
+        "--github-summary",
+        action="store_true",
+        help="Also write to GITHUB_STEP_SUMMARY if available",
+    )
+
+    args = parser.parse_args()
+
+    report = generate_report(
+        pr_path=args.pr_results,
+        base_path=args.base_results,
+        pr_ref=args.pr_ref,
+        base_ref=args.base_ref,
+    )
+
+    # Output to file or stdout
+    if args.output:
+        Path(args.output).write_text(report)
+        print(f"Report written to {args.output}")
+    else:
+        print(report)
+
+    # Optionally write to GitHub step summary
+    if args.github_summary and "GITHUB_STEP_SUMMARY" in os.environ:
+        with open(os.environ["GITHUB_STEP_SUMMARY"], "a") as f:
+            f.write(report)
+            f.write("\n")
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/docs/notebooks b/docs/notebooks
index 1cbaf62a..98d97da4 160000
--- a/docs/notebooks
+++ b/docs/notebooks
@@ -1 +1 @@
-Subproject commit 1cbaf62a32f65b950552229d210b9884757ce116
+Subproject commit 98d97da49528e4229548a315a88569ebfc618942
diff --git a/pyproject.toml b/pyproject.toml
index bc36d037..de1a6ec3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -103,6 +103,7 @@ optional-dependencies.test = [
   "coverage[toml]>=7",
   "pytest>=7", # Just for VS Code
+  "pytest-benchmark>=4",
   "pytest-cov>=4",
   "pytest-mock>=3.5",
   "pytest-timeout>=2.1",
@@ -240,7 +241,13 @@ testpaths = [ "tests/" ]
 xfail_strict = true
 addopts = [
   "--ignore=tests/plotting/test_interactive.py",
+  "--ignore=tests/benchmarks",
   "--ignore=docs",
+  "--benchmark-disable",
+]
+markers = [
+  "benchmark: mark test as a benchmark test",
+  "slow: mark test as slow running",
 ]
 
 [tool.coverage.run]
@@ -306,6 +313,9 @@
 lint = "ruff check ."
 format = "ruff format ."
 pre-commit-install = "pre-commit install"
 pre-commit = "pre-commit run"
+benchmark = "pytest tests/benchmarks/ --benchmark-enable --benchmark-only -v --color=yes"
+benchmark-save = "pytest tests/benchmarks/ --benchmark-enable --benchmark-only --benchmark-autosave -v --color=yes"
+benchmark-compare = "pytest tests/benchmarks/ --benchmark-enable --benchmark-only --benchmark-compare -v --color=yes"
 
 [tool.cruft]
 skip = [
diff --git a/tests/benchmarks/__init__.py b/tests/benchmarks/__init__.py
new file mode 100644
index 00000000..585436db
--- /dev/null
+++ b/tests/benchmarks/__init__.py
@@ -0,0 +1,11 @@
+"""Squidpy benchmarks package.
+
+This package contains performance benchmarks for squidpy functions.
+Benchmarks are excluded from regular pytest runs and can be executed with:
+
+    pytest benchmarks/ --benchmark-only -v
+
+For more options, see pytest-benchmark documentation.
+"""
+
+from __future__ import annotations
diff --git a/tests/benchmarks/conftest.py b/tests/benchmarks/conftest.py
new file mode 100644
index 00000000..e5a5164b
--- /dev/null
+++ b/tests/benchmarks/conftest.py
@@ -0,0 +1,98 @@
+"""Benchmark fixtures and configuration for squidpy performance tests."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import numpy as np
+import pandas as pd
+import pytest
+from anndata import AnnData
+
+from squidpy._constants._pkg_constants import Key
+
+if TYPE_CHECKING:
+    from collections.abc import Callable
+
+
+def pytest_configure(config: pytest.Config) -> None:
+    """Register benchmark markers."""
+    config.addinivalue_line("markers", "benchmark: mark test as a benchmark test")
+
+
+@pytest.fixture(scope="session")
+def make_adata() -> Callable[[int, int], AnnData]:
+    """
+    Factory fixture to create synthetic AnnData objects for benchmarking.
+
+    Returns a function that generates AnnData with specified n_obs and n_clusters.
+    """
+
+    def _make_adata(n_obs: int, n_clusters: int = 10) -> AnnData:
+        """
+        Create a synthetic AnnData object for benchmarking.
+
+        Parameters
+        ----------
+        n_obs
+            Number of observations (cells).
+        n_clusters
+            Number of cluster categories.
+
+        Returns
+        -------
+        AnnData object with spatial coordinates and cluster labels.
+        """
+        rng = np.random.default_rng(42)
+
+        # Create random spatial coordinates
+        spatial_coords = rng.uniform(0, 1000, size=(n_obs, 2))
+
+        # Create random cluster assignments
+        cluster_labels = pd.Categorical(rng.choice([f"cluster_{i}" for i in range(n_clusters)], size=n_obs))
+
+        # Create minimal expression matrix
+        X = rng.random((n_obs, 50))
+
+        # Build AnnData
+        adata = AnnData(X=X)
+        adata.obsm[Key.obsm.spatial] = spatial_coords
+        adata.obs["cluster"] = cluster_labels
+
+        return adata
+
+    return _make_adata
+
+
+# Pre-defined dataset sizes for parameterized benchmarks
+# Adjust these values to match your benchmarking needs
+BENCHMARK_SIZES = {
+    "1k": 1_000,
+    "5k": 5_000,
+    "10k": 10_000,
+    "50k": 50_000,
+    "100k": 100_000,
+}
+
+
+@pytest.fixture(params=list(BENCHMARK_SIZES.keys()), ids=list(BENCHMARK_SIZES.keys()))
+def adata_scaling(request: pytest.FixtureRequest, make_adata: Callable[[int, int], AnnData]) -> AnnData:
+    """
+    Parameterized fixture that provides AnnData objects of varying sizes.
+
+    The fixture name makes it clear that the dataset size varies across test runs.
+    Sizes are defined in BENCHMARK_SIZES dict - modify that to change scale points.
+    """
+    size_name = request.param
+    n_obs = BENCHMARK_SIZES[size_name]
+    return make_adata(n_obs)
+
+
+# Default size for non-scaling benchmarks (uses first size in BENCHMARK_SIZES)
+DEFAULT_BENCHMARK_SIZE = next(iter(BENCHMARK_SIZES.values()))
+
+
+@pytest.fixture
+def adata_default(make_adata: Callable[[int, int], AnnData]) -> AnnData:
+    """Fixed dataset for non-scaling benchmarks. Size defined by DEFAULT_BENCHMARK_SIZE."""
+    return make_adata(DEFAULT_BENCHMARK_SIZE)
diff --git a/tests/benchmarks/test_co_occurrence.py b/tests/benchmarks/test_co_occurrence.py
new file mode 100644
index 00000000..f4265bf8
--- /dev/null
+++ b/tests/benchmarks/test_co_occurrence.py
@@ -0,0 +1,151 @@
+"""Benchmarks for squidpy.gr.co_occurrence function.
+
+Run benchmarks with:
+    pytest benchmarks/ --benchmark-only -v
+
+Compare against baseline:
+    pytest benchmarks/ --benchmark-only --benchmark-compare
+
+Save benchmark results:
+    pytest benchmarks/ --benchmark-only --benchmark-autosave
+
+Multithreading Behavior
+-----------------------
+pytest-benchmark runs each benchmark function sequentially (one at a time).
+Within each benchmark iteration, the function under test (e.g., co_occurrence)
+can use multiple threads/processes as configured by its parameters (n_jobs).
+
+This means:
+- Benchmarks do NOT run in parallel with each other
+- Each benchmark has full access to system resources
+- Functions using numba @njit(parallel=True) will use multiple threads
+- Functions using joblib/loky parallelization will spawn worker processes
+
+To benchmark single-threaded vs multi-threaded performance, create separate
+test cases with different n_jobs values, or use pytest parametrization.
+
+Note: pytest-xdist (-n auto) parallelizes test COLLECTION, not benchmark
+execution. For accurate benchmarks, avoid -n with benchmark tests.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import pytest
+
+from squidpy.gr import co_occurrence
+
+if TYPE_CHECKING:
+    from collections.abc import Callable
+
+    from anndata import AnnData
+
+
+class TestCoOccurrenceBenchmarks:
+    """Benchmark suite for co_occurrence function scaling with dataset size."""
+
+    @pytest.mark.benchmark(group="co_occurrence_scaling")
+    def test_co_occurrence_scaling(
+        self,
+        benchmark: pytest.benchmark.fixture.BenchmarkFixture,
+        adata_scaling: AnnData,
+    ) -> None:
+        """
+        Benchmark co_occurrence across different dataset sizes.
+
+        The adata_scaling fixture is parameterized with sizes defined in
+        conftest.BENCHMARK_SIZES. Modify that dict to change scale points.
+        """
+        benchmark.extra_info["n_obs"] = adata_scaling.n_obs
+        benchmark(co_occurrence, adata_scaling, cluster_key="cluster", copy=True)
+
+
+class TestCoOccurrenceIntervalBenchmarks:
+    """Benchmark suite for co_occurrence with different interval parameters."""
+
+    @pytest.mark.benchmark(group="co_occurrence_intervals")
+    @pytest.mark.parametrize("n_intervals", [10, 25, 50, 100])
+    def test_co_occurrence_intervals(
+        self,
+        benchmark: pytest.benchmark.fixture.BenchmarkFixture,
+        adata_default: AnnData,
+        n_intervals: int,
+    ) -> None:
+        """Benchmark co_occurrence with different interval counts."""
+        benchmark.extra_info["n_intervals"] = n_intervals
+        benchmark.extra_info["n_obs"] = adata_default.n_obs
+        benchmark(
+            co_occurrence,
+            adata_default,
+            cluster_key="cluster",
+            interval=n_intervals,
+            copy=True,
+        )
+
+
+class TestCoOccurrenceNumbaCompilation:
+    """Benchmark numba compilation overhead."""
+
+    @pytest.mark.benchmark(group="co_occurrence_warmup")
+    def test_co_occurrence_first_run(
+        self,
+        benchmark: pytest.benchmark.fixture.BenchmarkFixture,
+        make_adata: Callable[[int, int], AnnData],
+    ) -> None:
+        """
+        Benchmark first run to capture numba compilation overhead.
+
+        Note: This test measures cold-start performance including JIT compilation.
+        Run with --benchmark-warmup=off to capture compilation time.
+        """
+
+        # Create fresh adata each time to avoid caching effects
+        def run_co_occurrence() -> None:
+            adata = make_adata(100)
+            co_occurrence(adata, cluster_key="cluster", copy=True)
+
+        benchmark.pedantic(run_co_occurrence, warmup_rounds=0, rounds=3)
+
+
+# Parametrized benchmark for comprehensive scaling analysis
+@pytest.mark.benchmark(group="co_occurrence_comprehensive")
+@pytest.mark.parametrize(
+    "n_obs,n_clusters,n_intervals",
+    [
+        (1_000, 5, 25),
+        (1_000, 10, 50),
+        (5_000, 10, 50),
+        (10_000, 10, 50),
+        (10_000, 20, 50),
+        (50_000, 10, 50),
+    ],
+)
+def test_co_occurrence_comprehensive(
+    benchmark: pytest.benchmark.fixture.BenchmarkFixture,
+    make_adata: Callable[[int, int], AnnData],
+    n_obs: int,
+    n_clusters: int,
+    n_intervals: int,
+) -> None:
+    """
+    Comprehensive parametrized benchmark for co_occurrence.
+
+    Tests various combinations of dataset size, cluster count, and intervals
+    to understand scaling behavior across multiple dimensions.
+    """
+    adata = make_adata(n_obs, n_clusters=n_clusters)
+    benchmark.extra_info.update(
+        {
+            "n_obs": n_obs,
+            "n_clusters": n_clusters,
+            "n_intervals": n_intervals,
+        }
+    )
+    benchmark(
+        co_occurrence,
+        adata,
+        cluster_key="cluster",
+        interval=n_intervals,
+        copy=True,
+    )