Skip to content

Commit 274600d

Browse files
committed
init
1 parent 349b75a commit 274600d

File tree

6 files changed

+400
-0
lines changed

6 files changed

+400
-0
lines changed

.github/workflows/benchmark.yml

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
name: Benchmark
2+
3+
on:
4+
push:
5+
branches: [main]
6+
pull_request:
7+
branches: [main]
8+
9+
env:
10+
FORCE_COLOR: "1"
11+
12+
defaults:
13+
run:
14+
shell: bash -e {0} # -e to fail on error
15+
16+
jobs:
17+
benchmark:
18+
runs-on: ${{ matrix.os }}
19+
20+
strategy:
21+
fail-fast: false
22+
matrix:
23+
python: ["3.13"]
24+
os: [ubuntu-latest]
25+
26+
env:
27+
OS: ${{ matrix.os }}
28+
PYTHON: ${{ matrix.python }}
29+
ASV_DIR: "./benchmarks"
30+
31+
steps:
32+
- uses: actions/checkout@v4
33+
with:
34+
fetch-depth: 0
35+
36+
- name: Fetch main branch for `asv run`’s hash
37+
run: git fetch origin main:main
38+
if: ${{ github.ref_name != 'main' }}
39+
40+
- name: Set up Python ${{ matrix.python }}
41+
uses: actions/setup-python@v5
42+
with:
43+
python-version: ${{ matrix.python }}
44+
cache: 'pip'
45+
46+
- name: Cache datasets
47+
uses: actions/cache@v4
48+
with:
49+
path: |
50+
~/.cache
51+
key: benchmark-state-${{ hashFiles('benchmarks/**') }}
52+
53+
- name: Install dependencies
54+
run: pip install 'asv>=0.6.4'
55+
56+
- name: Configure ASV
57+
working-directory: ${{ env.ASV_DIR }}
58+
run: asv machine --yes
59+
60+
- name: Quick benchmark run
61+
working-directory: ${{ env.ASV_DIR }}
62+
run: asv run --dry-run --quick --show-stderr --verbose HEAD^!

benchmarks/README.md

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# Squidpy Benchmarks
2+
3+
This directory contains code for benchmarking Squidpy using [asv][].
4+
5+
The functionality is checked using the [`benchmark.yml`][] workflow.
6+
Benchmarks are run using the [benchmark bot][].
7+
8+
[asv]: https://asv.readthedocs.io/
9+
[`benchmark.yml`]: ../.github/workflows/benchmark.yml
10+
[benchmark bot]: https://github.com/apps/scverse-benchmark
11+
12+
## Data processing in benchmarks
13+
14+
Each dataset is processed so it has
15+
16+
- `.X` (containing data in C/row-major format) and `.layers['off-axis']` (containing data in FORTRAN/column-major format) with log-transformed data
17+
18+
The benchmarks are set up so the `layer` parameter indicates the layer that will be moved into `.X` before the benchmark.
19+
That way, we don’t need to add `layer=layer` everywhere.

benchmarks/asv.conf.json

Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
{
2+
// The version of the config file format. Do not change, unless
3+
// you know what you are doing.
4+
"version": 1,
5+
6+
// The name of the project being benchmarked
7+
"project": "squidpy",
8+
9+
// The project's homepage
10+
"project_url": "https://squidpy.readthedocs.io/",
11+
12+
// The URL or local path of the source code repository for the
13+
// project being benchmarked
14+
"repo": "..",
15+
16+
// The Python project's subdirectory in your repo. If missing or
17+
// the empty string, the project is assumed to be located at the root
18+
// of the repository.
19+
// "repo_subdir": "",
20+
21+
// Customizable commands for building, installing, and
22+
// uninstalling the project. See asv.conf.json documentation.
23+
//
24+
// "install_command": ["python -mpip install {wheel_file}"],
25+
// "uninstall_command": ["return-code=any python -mpip uninstall -y {project}"],
26+
"build_command": [
27+
"python -m pip install build",
28+
"python -m build --wheel -o {build_cache_dir} {build_dir}",
29+
],
30+
31+
// List of branches to benchmark. If not provided, defaults to "master"
32+
// (for git) or "default" (for mercurial).
33+
"branches": ["main"], // for git
34+
35+
// The DVCS being used. If not set, it will be automatically
36+
// determined from "repo" by looking at the protocol in the URL
37+
// (if remote), or by looking for special directories, such as
38+
// ".git" (if local).
39+
"dvcs": "git",
40+
41+
// The tool to use to create environments. May be "conda",
42+
// "virtualenv" or other value depending on the plugins in use.
43+
// If missing or the empty string, the tool will be automatically
44+
// determined by looking for tools on the PATH environment
45+
// variable.
46+
"environment_type": "conda",
47+
48+
// timeout in seconds for installing any dependencies in environment
49+
// defaults to 10 min
50+
//"install_timeout": 600,
51+
52+
// the base URL to show a commit for the project.
53+
"show_commit_url": "https://github.com/scverse/squidpy/commit/",
54+
55+
// The Pythons you'd like to test against. If not provided, defaults
56+
// to the current version of Python used to run `asv`.
57+
// "pythons": ["3.11", "3.13"],
58+
59+
// The list of conda channel names to be searched for benchmark
60+
// dependency packages in the specified order
61+
"conda_channels": ["conda-forge", "defaults"],
62+
63+
// The matrix of dependencies to test. Each key is the name of a
64+
// package (in PyPI) and the values are version numbers. An empty
65+
// list or empty string indicates to just test against the default
66+
// (latest) version. null indicates that the package is to not be
67+
// installed. If the package to be tested is only available from
68+
// PyPi, and the 'environment_type' is conda, then you can preface
69+
// the package name by 'pip+', and the package will be installed via
70+
// pip (with all the conda available packages installed first,
71+
// followed by the pip installed packages).
72+
//
73+
"matrix": {
74+
"numpy": [""],
75+
"scipy": [""],
76+
"squidpy": [""]
77+
},
78+
79+
// Combinations of libraries/python versions can be excluded/included
80+
// from the set to test. Each entry is a dictionary containing additional
81+
// key-value pairs to include/exclude.
82+
//
83+
// An exclude entry excludes entries where all values match. The
84+
// values are regexps that should match the whole string.
85+
//
86+
// An include entry adds an environment. Only the packages listed
87+
// are installed. The 'python' key is required. The exclude rules
88+
// do not apply to includes.
89+
//
90+
// In addition to package names, the following keys are available:
91+
//
92+
// - python
93+
// Python version, as in the *pythons* variable above.
94+
// - environment_type
95+
// Environment type, as above.
96+
// - sys_platform
97+
// Platform, as in sys.platform. Possible values for the common
98+
// cases: 'linux2', 'win32', 'cygwin', 'darwin'.
99+
//
100+
// "exclude": [
101+
// {"python": "3.2", "sys_platform": "win32"}, // skip py3.2 on windows
102+
// {"environment_type": "conda", "six": null}, // don't run without six on conda
103+
// ],
104+
//
105+
// "include": [
106+
// // additional env for python2.7
107+
// {"python": "2.7", "numpy": "1.8"},
108+
// // additional env if run on windows+conda
109+
// {"platform": "win32", "environment_type": "conda", "python": "2.7", "libpython": ""},
110+
// ],
111+
112+
// The directory (relative to the current directory) that benchmarks are
113+
// stored in. If not provided, defaults to "benchmarks"
114+
// "benchmark_dir": "benchmarks",
115+
116+
// The directory (relative to the current directory) to cache the Python
117+
// environments in. If not provided, defaults to "env"
118+
"env_dir": ".asv/env",
119+
120+
// The directory (relative to the current directory) that raw benchmark
121+
// results are stored in. If not provided, defaults to "results".
122+
"results_dir": ".asv/results",
123+
124+
// The directory (relative to the current directory) that the html tree
125+
// should be written to. If not provided, defaults to "html".
126+
"html_dir": ".asv/html",
127+
128+
// The number of characters to retain in the commit hashes.
129+
// "hash_length": 8,
130+
131+
// `asv` will cache results of the recent builds in each
132+
// environment, making them faster to install next time. This is
133+
// the number of builds to keep, per environment.
134+
// "build_cache_size": 2,
135+
136+
// The commits after which the regression search in `asv publish`
137+
// should start looking for regressions. Dictionary whose keys are
138+
// regexps matching to benchmark names, and values corresponding to
139+
// the commit (exclusive) after which to start looking for
140+
// regressions. The default is to start from the first commit
141+
// with results. If the commit is `null`, regression detection is
142+
// skipped for the matching benchmark.
143+
//
144+
// "regressions_first_commits": {
145+
// "some_benchmark": "352cdf", // Consider regressions only after this commit
146+
// "another_benchmark": null, // Skip regression detection altogether
147+
// },
148+
149+
// The thresholds for relative change in results, after which `asv
150+
// publish` starts reporting regressions. Dictionary of the same
151+
// form as in ``regressions_first_commits``, with values
152+
// indicating the thresholds. If multiple entries match, the
153+
// maximum is taken. If no entry matches, the default is 5%.
154+
//
155+
// "regressions_thresholds": {
156+
// "some_benchmark": 0.01, // Threshold of 1%
157+
// "another_benchmark": 0.5, // Threshold of 50%
158+
// },
159+
}

benchmarks/benchmarks/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
"""ASV benchmark suite for squidpy."""

benchmarks/benchmarks/_utils.py

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
from __future__ import annotations
2+
3+
import itertools
4+
from functools import cache
5+
from typing import TYPE_CHECKING
6+
7+
import numpy as np
8+
from asv_runner.benchmarks.mark import skip_for_params
9+
from scanpy._compat import CSCBase, CSRBase
10+
11+
import squidpy as sq
12+
13+
if TYPE_CHECKING:
    from collections.abc import Callable, Sequence
    from collections.abc import Set as AbstractSet
    from typing import Literal, Protocol, TypeVar

    from anndata import AnnData

    C = TypeVar("C", bound=Callable)

    class ParamSkipper(Protocol):
        """Signature of the decorator factory returned by ``param_skipper``."""

        def __call__(self, **skipped: AbstractSet) -> Callable[[C], C]: ...

    # Names of datasets the benchmarks can load.
    Dataset = Literal["imc"]
    # Layers that may be promoted into ``.X`` before a benchmark runs.
    KeyX = Literal[None, "off-axis"]
    # Layer holding raw counts (referenced by ``get_count_dataset``);
    # was previously missing, leaving ``KeyCount`` an undefined name.
    KeyCount = Literal["counts"]
27+
28+
29+
@cache
def _imc() -> AnnData:
    """Load the IMC dataset once per process, caching the result."""
    ds = sq.datasets.imc()
    # The benchmarks expect dense, C-contiguous (row-major) expression data.
    assert isinstance(ds.X, np.ndarray)
    assert not np.isfortran(ds.X)
    return ds
36+
37+
38+
def imc() -> AnnData:
    """Return a fresh, mutable copy of the cached IMC dataset."""
    return _imc().copy()
40+
41+
42+
def to_off_axis(x: np.ndarray | CSRBase) -> np.ndarray | CSCBase:
    """Convert *x* to its off-axis (column-major) counterpart.

    A C-ordered dense array becomes a Fortran-ordered copy; a CSR sparse
    matrix becomes CSC. Anything else raises ``TypeError``.
    """
    if isinstance(x, np.ndarray):
        assert not np.isfortran(x)
        return x.copy(order="F")
    if isinstance(x, CSRBase):
        return x.tocsc()
    raise TypeError(f"Unexpected type {type(x)}")
50+
51+
52+
def _get_dataset_raw(dataset: Dataset) -> tuple[AnnData, str | None]:
    """Load *dataset* and attach an off-axis copy of ``.X`` as a layer.

    Returns the dataset together with the name of its clustering column.
    """
    if dataset == "imc":
        adata, cluster_key = imc(), "cell type"
    else:
        msg = f"Unknown dataset {dataset}"
        raise AssertionError(msg)

    # Same data in the opposite memory order, for order-sensitivity benchmarks.
    adata.layers["off-axis"] = to_off_axis(adata.X)
    return adata, cluster_key
63+
64+
65+
def get_dataset(dataset: Dataset, *, layer: KeyX = None) -> tuple[AnnData, str | None]:
    """Return *dataset*, with *layer* (when given) promoted into ``.X``."""
    adata, batch_key = _get_dataset_raw(dataset)
    if layer is None:
        return adata, batch_key
    adata.X = adata.layers.pop(layer)
    return adata, batch_key
70+
71+
72+
def get_count_dataset(dataset: Dataset, *, layer: Literal["counts"] = "counts") -> tuple[AnnData, str | None]:
    """Return *dataset* with the raw-count layer *layer* moved into ``.X``.

    Fix: the original annotation referenced ``KeyCount``, a name never
    defined in this module; ``Literal["counts"]`` spells it out directly
    (annotations are lazy via ``from __future__ import annotations``).
    """
    adata, batch_key = _get_dataset_raw(dataset)

    # NOTE(review): _get_dataset_raw only adds an "off-axis" layer, so this
    # pop assumes the loaded dataset itself provides *layer* — TODO confirm.
    adata.X = adata.layers.pop(layer)
    # remove indicators that X was transformed
    adata.uns.pop("log1p", None)

    return adata, batch_key
80+
81+
82+
def param_skipper(param_names: Sequence[str], params: tuple[Sequence[object], ...]) -> ParamSkipper:
    """Create a decorator that will skip all combinations that contain any of the given parameters.

    Examples
    --------
    >>> param_names = ["letters", "numbers"]
    >>> params = [["a", "b"], [3, 4, 5]]
    >>> skip_when = param_skipper(param_names, params)

    >>> @skip_when(letters={"a"}, numbers={3})
    ... def func(a, b):
    ...     print(a, b)
    >>> run_as_asv_benchmark(func)
    b 4
    b 5

    """

    def skip(**skipped: AbstractSet) -> Callable[[C], C]:
        # Walk the full parameter grid and collect every combination that
        # mentions at least one value listed in *skipped*.
        combos: list[tuple[object, ...]] = []
        for values in itertools.product(*params):
            record = dict(zip(param_names, values, strict=True))
            if any(value in skipped.get(name, set()) for name, value in record.items()):
                combos.append(tuple(record.values()))
        return skip_for_params(combos)

    return skip

0 commit comments

Comments
 (0)