diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
new file mode 100644
index 000000000..1e952440b
--- /dev/null
+++ b/.github/workflows/benchmark.yml
@@ -0,0 +1,62 @@
+name: Benchmark
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+
+env:
+  FORCE_COLOR: "1"
+
+defaults:
+  run:
+    shell: bash -e {0} # -e to fail on error
+
+jobs:
+  benchmark:
+    runs-on: ${{ matrix.os }}
+
+    strategy:
+      fail-fast: false
+      matrix:
+        python: ["3.13"]
+        os: [ubuntu-latest]
+
+    env:
+      OS: ${{ matrix.os }}
+      PYTHON: ${{ matrix.python }}
+      ASV_DIR: "./benchmarks"
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Fetch main branch for `asv run`’s hash
+        run: git fetch origin main:main
+        if: ${{ github.ref_name != 'main' }}
+
+      - name: Set up Python ${{ matrix.python }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python }}
+          cache: 'pip'
+
+      - name: Cache datasets
+        uses: actions/cache@v4
+        with:
+          path: |
+            ~/.cache
+          key: benchmark-state-${{ hashFiles('benchmarks/**') }}
+
+      - name: Install dependencies
+        run: pip install 'asv>=0.6.4'
+
+      - name: Configure ASV
+        working-directory: ${{ env.ASV_DIR }}
+        run: asv machine --yes
+
+      - name: Quick benchmark run
+        working-directory: ${{ env.ASV_DIR }}
+        run: asv run --dry-run --quick --show-stderr --verbose HEAD^!
diff --git a/benchmarks/.asv/results/benchmarks.json b/benchmarks/.asv/results/benchmarks.json
new file mode 100644
index 000000000..708d1c7d3
--- /dev/null
+++ b/benchmarks/.asv/results/benchmarks.json
@@ -0,0 +1,27 @@
+{
+    "co_occurence.peakmem_co_occurrence": {
+        "code": "def peakmem_co_occurrence():\n    sq.gr.co_occurrence(adata, cluster_key=\"cell type\")\n\ndef setup():\n    global adata  # noqa: PLW0603\n    adata = sq.datasets.imc()",
+        "name": "co_occurence.peakmem_co_occurrence",
+        "param_names": [],
+        "params": [],
+        "type": "peakmemory",
+        "unit": "bytes",
+        "version": "8e588751fbcc15d56cfe73dbb3844752d6cac545c868bca1ce1045f0a54e301d"
+    },
+    "co_occurence.time_co_occurrence": {
+        "code": "def time_co_occurrence():\n    sq.gr.co_occurrence(adata, cluster_key=\"cell type\")\n\ndef setup():\n    global adata  # noqa: PLW0603\n    adata = sq.datasets.imc()",
+        "min_run_count": 2,
+        "name": "co_occurence.time_co_occurrence",
+        "number": 0,
+        "param_names": [],
+        "params": [],
+        "repeat": 0,
+        "rounds": 2,
+        "sample_time": 0.01,
+        "type": "time",
+        "unit": "seconds",
+        "version": "e764c1b999e315fffac8cac6708ede350ff254bd0b05a9be580a43ea3689d614",
+        "warmup_time": -1
+    },
+    "version": 2
+}
\ No newline at end of file
diff --git a/benchmarks/.asv/results/mac/machine.json b/benchmarks/.asv/results/mac/machine.json
new file mode 100644
index 000000000..bcb50937c
--- /dev/null
+++ b/benchmarks/.asv/results/mac/machine.json
@@ -0,0 +1,9 @@
+{
+    "arch": "arm64",
+    "cpu": "Apple M1",
+    "machine": "mac",
+    "num_cpu": "8",
+    "os": "Darwin 23.1.0",
+    "ram": "17179869184",
+    "version": 1
+}
\ No newline at end of file
diff --git a/benchmarks/README.md b/benchmarks/README.md
new file mode 100644
index 000000000..8a0bf5270
--- /dev/null
+++ b/benchmarks/README.md
@@ -0,0 +1,22 @@
+# Squidpy Benchmarks
+
+This directory contains code for benchmarking Squidpy using [asv][].
+
+The functionality is checked using the [`benchmark.yml`][] workflow.
+Benchmarks are run using the [benchmark bot][].
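+
+To run the suite locally, a minimal sketch of the same steps that [`benchmark.yml`][] runs in CI (assumes `asv>=0.6.4` is installed and that you are inside this directory):
+
+```console
+asv machine --yes
+asv run --dry-run --quick --show-stderr HEAD^!
+```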
+
+[asv]: https://asv.readthedocs.io/
+[`benchmark.yml`]: ../.github/workflows/benchmark.yml
+[benchmark bot]: https://github.com/apps/scverse-benchmark # TODO
+
+## Data processing in benchmarks
+
+# TODO
+Each dataset is processed so it has
+
+- `.layers['counts']` (containing data in C/row-major format) and `.layers['counts-off-axis']` (containing data in FORTRAN/column-major format)
+- `.X` and `.layers['off-axis']` with log-transformed data (formats like above)
+- a `.var['mt']` boolean column indicating mitochondrial genes
+
+The benchmarks are set up so the `layer` parameter indicates the layer that will be moved into `.X` before the benchmark.
+That way, we don’t need to add `layer=layer` everywhere.
diff --git a/benchmarks/asv.conf.json b/benchmarks/asv.conf.json
new file mode 100644
index 000000000..c48b03c81
--- /dev/null
+++ b/benchmarks/asv.conf.json
@@ -0,0 +1,172 @@
+{
+    // The version of the config file format. Do not change, unless
+    // you know what you are doing.
+    "version": 1,
+
+    // The name of the project being benchmarked
+    "project": "squidpy",
+
+    // The project's homepage
+    "project_url": "https://squidpy.readthedocs.io/",
+
+    // The URL or local path of the source code repository for the
+    // project being benchmarked
+    "repo": "..",
+
+    // The Python project's subdirectory in your repo. If missing or
+    // the empty string, the project is assumed to be located at the root
+    // of the repository.
+    // "repo_subdir": "",
+
+    // Customizable commands for building, installing, and
+    // uninstalling the project. See asv.conf.json documentation.
+    //
+    // "install_command": ["python -mpip install {wheel_file}"],
+    // "uninstall_command": ["return-code=any python -mpip uninstall -y {project}"],
+    "build_command": [
+        "python -m pip install build",
+        "python -m build --wheel -o {build_cache_dir} {build_dir}",
+    ],
+
+    // List of branches to benchmark. If not provided, defaults to "master"
+    // (for git) or "default" (for mercurial).
+    "branches": ["main"], // for git
+
+    // The DVCS being used. If not set, it will be automatically
+    // determined from "repo" by looking at the protocol in the URL
+    // (if remote), or by looking for special directories, such as
+    // ".git" (if local).
+    "dvcs": "git",
+
+    // The tool to use to create environments. May be "conda",
+    // "virtualenv" or other value depending on the plugins in use.
+    // If missing or the empty string, the tool will be automatically
+    // determined by looking for tools on the PATH environment
+    // variable.
+    "environment_type": "conda",
+
+    // timeout in seconds for installing any dependencies in environment
+    // defaults to 10 min
+    //"install_timeout": 600,
+
+    // the base URL to show a commit for the project.
+    "show_commit_url": "https://github.com/scverse/squidpy/commit/",
+
+    // The Pythons you'd like to test against. If not provided, defaults
+    // to the current version of Python used to run `asv`.
+    // "pythons": ["3.11", "3.13"],
+
+    // The list of conda channel names to be searched for benchmark
+    // dependency packages in the specified order
+    "conda_channels": ["conda-forge", "defaults"],
+
+    // The matrix of dependencies to test. Each key is the name of a
+    // package (in PyPI) and the values are version numbers. An empty
+    // list or empty string indicates to just test against the default
+    // (latest) version. null indicates that the package is to not be
+    // installed. If the package to be tested is only available from
+    // PyPI, and the 'environment_type' is conda, then you can preface
+    // the package name by 'pip+', and the package will be installed via
+    // pip (with all the conda available packages installed first,
+    // followed by the pip installed packages).
+    //
+    "matrix": {
+        "numpy": [""],
+        // "scipy": ["1.2", ""],
+        "scipy": [""],
+        "h5py": [""],
+        "natsort": [""],
+        "pandas": [""],
+        "memory_profiler": [""],
+        // "zarr": ["2.18.4"],
+        "pytest": [""],
+        // "scanpy": [""],
+        "squidpy": [""],
+        "python-igraph": [""],
+        // "psutil": [""]
+        "pooch": [""],
+        "scikit-image": [""],
+        // "scikit-misc": [""],
+    },
+
+    // Combinations of libraries/python versions can be excluded/included
+    // from the set to test. Each entry is a dictionary containing additional
+    // key-value pairs to include/exclude.
+    //
+    // An exclude entry excludes entries where all values match. The
+    // values are regexps that should match the whole string.
+    //
+    // An include entry adds an environment. Only the packages listed
+    // are installed. The 'python' key is required. The exclude rules
+    // do not apply to includes.
+    //
+    // In addition to package names, the following keys are available:
+    //
+    // - python
+    //     Python version, as in the *pythons* variable above.
+    // - environment_type
+    //     Environment type, as above.
+    // - sys_platform
+    //     Platform, as in sys.platform. Possible values for the common
+    //     cases: 'linux2', 'win32', 'cygwin', 'darwin'.
+    //
+    // "exclude": [
+    //     {"python": "3.2", "sys_platform": "win32"}, // skip py3.2 on windows
+    //     {"environment_type": "conda", "six": null}, // don't run without six on conda
+    // ],
+    //
+    // "include": [
+    //     // additional env for python2.7
+    //     {"python": "2.7", "numpy": "1.8"},
+    //     // additional env if run on windows+conda
+    //     {"platform": "win32", "environment_type": "conda", "python": "2.7", "libpython": ""},
+    // ],
+
+    // The directory (relative to the current directory) that benchmarks are
+    // stored in. If not provided, defaults to "benchmarks"
+    // "benchmark_dir": "benchmarks",
+
+    // The directory (relative to the current directory) to cache the Python
+    // environments in. If not provided, defaults to "env"
+    "env_dir": ".asv/env",
+
+    // The directory (relative to the current directory) that raw benchmark
+    // results are stored in. If not provided, defaults to "results".
+    "results_dir": ".asv/results",
+
+    // The directory (relative to the current directory) that the html tree
+    // should be written to. If not provided, defaults to "html".
+    "html_dir": ".asv/html",
+
+    // The number of characters to retain in the commit hashes.
+    // "hash_length": 8,
+
+    // `asv` will cache results of the recent builds in each
+    // environment, making them faster to install next time. This is
+    // the number of builds to keep, per environment.
+    // "build_cache_size": 2,
+
+    // The commits after which the regression search in `asv publish`
+    // should start looking for regressions. Dictionary whose keys are
+    // regexps matching to benchmark names, and values corresponding to
+    // the commit (exclusive) after which to start looking for
+    // regressions. The default is to start from the first commit
+    // with results. If the commit is `null`, regression detection is
+    // skipped for the matching benchmark.
+ // + // "regressions_first_commits": { + // "some_benchmark": "352cdf", // Consider regressions only after this commit + // "another_benchmark": null, // Skip regression detection altogether + // }, + + // The thresholds for relative change in results, after which `asv + // publish` starts reporting regressions. Dictionary of the same + // form as in ``regressions_first_commits``, with values + // indicating the thresholds. If multiple entries match, the + // maximum is taken. If no entry matches, the default is 5%. + // + // "regressions_thresholds": { + // "some_benchmark": 0.01, // Threshold of 1% + // "another_benchmark": 0.5, // Threshold of 50% + // }, +} diff --git a/benchmarks/benchmarks/__init__.py b/benchmarks/benchmarks/__init__.py new file mode 100644 index 000000000..255d98fe4 --- /dev/null +++ b/benchmarks/benchmarks/__init__.py @@ -0,0 +1 @@ +"""ASV benchmark suite for sqidpy.""" diff --git a/benchmarks/benchmarks/co_occurence.py b/benchmarks/benchmarks/co_occurence.py new file mode 100644 index 000000000..16263725b --- /dev/null +++ b/benchmarks/benchmarks/co_occurence.py @@ -0,0 +1,32 @@ +"""Benchmark tool operations in Squidpy. + +API documentation: . +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import squidpy as sq + + +if TYPE_CHECKING: + from anndata import AnnData + +# setup variables + +adata: AnnData + + +def setup(): + global adata # noqa: PLW0603 + adata = sq.datasets.imc() + + +def time_co_occurrence(): + sq.gr.co_occurrence(adata, cluster_key="cell type") + + +def peakmem_co_occurrence(): + sq.gr.co_occurrence(adata, cluster_key="cell type") + diff --git a/src/squidpy/gr/_ppatterns.py b/src/squidpy/gr/_ppatterns.py index 187f1156a..6e3230be0 100644 --- a/src/squidpy/gr/_ppatterns.py +++ b/src/squidpy/gr/_ppatterns.py @@ -3,6 +3,7 @@ from __future__ import annotations from collections.abc import Iterable, Sequence +from importlib.util import find_spec from itertools import chain from typing import TYPE_CHECKING, Any, Literal @@ -10,7 +11,7 @@ import numpy as np import pandas as pd from anndata import AnnData -from numba import njit +from numba import njit, prange from numpy.random import default_rng from scanpy import logging as logg from scanpy.get import _get_obs_rep @@ -266,85 +267,120 @@ def _score_helper( return score_perms -@njit( - ft[:, :, :](tt(it[:], 2), ft[:, :], it[:], ft[:], bl), - parallel=False, - fastmath=True, -) +@njit(parallel=True, fastmath=True, cache=True) def _occur_count( - clust: tuple[NDArrayA, NDArrayA], - pw_dist: NDArrayA, - labs_unique: NDArrayA, - interval: NDArrayA, - same_split: bool, + spatial_x: NDArrayA, spatial_y: NDArrayA, thresholds: NDArrayA, label_idx: NDArrayA, n: int, k: int, l_val: int ) -> NDArrayA: - num = labs_unique.shape[0] - out = np.zeros((num, num, interval.shape[0] - 1), dtype=ft) - - for idx in range(interval.shape[0] - 1): - co_occur = np.zeros((num, num), dtype=ft) - probs_con = np.zeros((num, num), dtype=ft) - - thres_max = interval[idx + 1] - clust_x, clust_y = clust - - # Modified to compute co-occurrence probability ratio over increasing radii sizes as opposed to discrete interval bins - # Need pw_dist > 0 to avoid counting a cell with itself as co-occurrence - idx_x, idx_y = np.nonzero((pw_dist <= thres_max) & (pw_dist > 0)) - x = clust_x[idx_x] - y = clust_y[idx_y] - # Treat computing co-occurrence using the same split and different splits differently - # Pairwise distance matrix for between the same split is symmetric and therefore only needs to be counted once - 
-        for i, j in zip(x, y):  # noqa: B905 # cannot use strict=False because of numba
-            co_occur[i, j] += 1
-            if not same_split:
-                co_occur[j, i] += 1
-
-        # Prevent divison by zero errors when we have low cell counts/small intervals
-        probs_matrix = co_occur / np.sum(co_occur) if np.sum(co_occur) != 0 else np.zeros((num, num), dtype=ft)
-        probs = np.sum(probs_matrix, axis=0)
-
-        for c in labs_unique:
-            probs_conditional = (
-                co_occur[c] / np.sum(co_occur[c]) if np.sum(co_occur[c]) != 0 else np.zeros(num, dtype=ft)
-            )
-            probs_con[c, :] = np.zeros(num, dtype=ft)
-            for i in range(num):
-                if probs[i] == 0:
-                    probs_con[c, i] = 0
-                else:
-                    probs_con[c, i] = probs_conditional[i] / probs[i]
+    # Allocate a 2D array to store a flat local result per point.
+    k2 = k * k
+    local_results = np.zeros((n, l_val * k2), dtype=np.int32)
 
-        out[:, :, idx] = probs_con
+    for i in prange(n):
+        local_counts: NDArrayA = np.zeros(l_val * k2, dtype=np.int32)
 
-    return out
+        for j in range(n):
+            if i == j:
+                continue
+            dx = spatial_x[i] - spatial_x[j]
+            dy = spatial_y[i] - spatial_y[j]
+            d2 = dx * dx + dy * dy
+            pair = label_idx[i] * k + label_idx[j]  # flat (label_i, label_j) index; constant across the r loop
+            base = pair * l_val  # first slot for that pair in local_counts
 
-def _co_occurrence_helper(
-    idx_splits: Iterable[tuple[int, int]],
-    spatial_splits: Sequence[NDArrayA],
-    labs_splits: Sequence[NDArrayA],
-    labs_unique: NDArrayA,
-    interval: NDArrayA,
-    queue: SigQueue | None = None,
-) -> pd.DataFrame:
-    out_lst = []
-    for t in idx_splits:
-        idx_x, idx_y = t
-        labs_x = labs_splits[idx_x]
-        labs_y = labs_splits[idx_y]
-        dist = pairwise_distances(spatial_splits[idx_x], spatial_splits[idx_y])
+            for r in range(l_val):
+                if d2 <= thresholds[r]:
+                    local_counts[base + r] += 1
 
-        out = _occur_count((labs_x, labs_y), dist, labs_unique, interval, idx_x == idx_y)
-        out_lst.append(out)
+        local_results[i] = local_counts
 
-        if queue is not None:
-            queue.put(Signal.UPDATE)
+    # Reduce the per-point partial counts and reshape to (k, k, l_val).
+    result_flat = local_results.sum(axis=0)
+    result: NDArrayA = result_flat.reshape(k, k, l_val).copy()
+
+    return result
 
-    if queue is not None:
-        queue.put(Signal.FINISH)
-    return out_lst
+
+@njit(parallel=True, fastmath=True, cache=True)
+def _co_occurrence_helper(v_x: NDArrayA, v_y: NDArrayA, v_radium: NDArrayA, labs: NDArrayA) -> NDArrayA:
+    """
+    Fast co-occurrence probability computation using the new numba-accelerated counting.
+
+    Parameters
+    ----------
+    v_x : np.ndarray, float64
+        x-coordinates.
+    v_y : np.ndarray, float64
+        y-coordinates.
+    v_radium : np.ndarray, float64
+        Distance thresholds (in ascending order).
+    labs : np.ndarray
+        Cluster labels (as integers).
+
+    Returns
+    -------
+    occ_prob : np.ndarray
+        A 3D array of shape (k, k, len(v_radium) - 1) containing the co-occurrence probabilities.
+    """
+    n = len(v_x)
+    labs_unique = np.unique(labs)
+    k = len(labs_unique)
+    # l_val is the number of bins; here we assume the thresholds come from v_radium[1:].
+    l_val = len(v_radium) - 1
+    # Compute squared thresholds from the interval (skip the first value).
+    thresholds = (v_radium[1:]) ** 2
+
+    # Compute co-occurrence counts.
+    counts = _occur_count(v_x, v_y, thresholds, labs, n, k, l_val)
+
+    # Compute co-occurrence probabilities for each threshold bin.
+    occ_prob = np.empty((k, k, l_val), dtype=np.float64)
+    for r in prange(l_val):
+        co_occur = counts[:, :, r].astype(np.float64)
+
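+        # The explicit loops below reproduce the previous vectorized normalization
+        # (probs_matrix = co_occur / total, probs = its column sums, then the
+        # conditional ratio P(i | c) / P(i)); writing them out keeps numba from
+        # allocating temporary arrays inside the parallel loop.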
+        # Compute the total count for this threshold.
+        total = 0.0
+        for i in range(k):
+            for j in range(k):
+                total += co_occur[i, j]
+
+        # Compute the normalized probability matrix.
+        probs_matrix = np.zeros((k, k), dtype=np.float64)
+        if total != 0.0:
+            for i in range(k):
+                for j in range(k):
+                    probs_matrix[i, j] = co_occur[i, j] / total
+
+        probs = np.zeros(k, dtype=np.float32)
+        for j in range(k):
+            s = 0.0
+            for i in range(k):
+                s += probs_matrix[i, j]
+            probs[j] = s
+
+        # Compute conditional probabilities.
+        probs_con = np.zeros((k, k), dtype=np.float32)
+        for c in range(k):
+            row_sum = 0.0
+            for j in range(k):
+                row_sum += co_occur[c, j]
+            for i in range(k):
+                cond = 0.0
+                if row_sum != 0.0:
+                    cond = co_occur[c, i] / row_sum
+                if probs[i] == 0.0:
+                    probs_con[c, i] = 0.0
+                else:
+                    probs_con[c, i] = cond / probs[i]
+
+        # Transpose to match (k, k, interval).
+        for i in range(k):
+            for c in range(k):
+                occ_prob[i, c, r] = probs_con[c, i]
+
+    return occ_prob
 
 
 @d.dedent
@@ -387,6 +423,7 @@ def co_occurrence(
         - :attr:`anndata.AnnData.uns` ``['{cluster_key}_co_occurrence']['interval']`` - the distance thresholds
           computed at ``interval``.
     """
+
     if isinstance(adata, SpatialData):
         adata = adata.table
     _assert_categorical_obs(adata, key=cluster_key)
@@ -394,11 +431,8 @@
 
     spatial = adata.obsm[spatial_key].astype(fp)
     original_clust = adata.obs[cluster_key]
-
-    # annotate cluster idx
     clust_map = {v: i for i, v in enumerate(original_clust.cat.categories.values)}
     labs = np.array([clust_map[c] for c in original_clust], dtype=ip)
-    labs_unique = np.array(list(clust_map.values()), dtype=ip)
 
     # create intervals thresholds
     if isinstance(interval, int):
@@ -409,57 +443,21 @@
     if len(interval) <= 1:
         raise ValueError(f"Expected interval to be of length `>= 2`, found `{len(interval)}`.")
 
-    n_obs = spatial.shape[0]
-    if n_splits is None:
-        size_arr = (n_obs**2 * spatial.itemsize) / 1024 / 1024  # calc expected mem usage
-        n_splits = 1
-        if size_arr > 2000:
-            while (n_obs / n_splits) > 2048:
-                n_splits += 1
-            logg.warning(
-                f"`n_splits` was automatically set to `{n_splits}` to "
-                f"prevent `{n_obs}x{n_obs}` distance matrix from being created"
-            )
-    n_splits = int(max(min(n_splits, n_obs), 1))
-
-    # split array and labels
-    spatial_splits = tuple(s for s in np.array_split(spatial, n_splits, axis=0) if len(s))
-    labs_splits = tuple(s for s in np.array_split(labs, n_splits, axis=0) if len(s))
-    # create idx array including unique combinations and self-comparison
-    x, y = np.triu_indices_from(np.empty((n_splits, n_splits)))
-    idx_splits = list(zip(x, y, strict=False))
+    spatial_x = spatial[:, 0]
+    spatial_y = spatial[:, 1]
 
-    n_jobs = _get_n_cores(n_jobs)
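+    # NOTE: unlike the previous implementation, no `n_obs x n_obs` pairwise
+    # distance matrix is materialized; `_occur_count` evaluates squared distances
+    # per point pair, so no `n_splits`-based chunking is needed.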
 
     start = logg.info(
-        f"Calculating co-occurrence probabilities for `{len(interval)}` intervals "
-        f"`{len(idx_splits)}` split combinations using `{n_jobs}` core(s)"
-    )
-
-    out_lst = parallelize(
-        _co_occurrence_helper,
-        collection=idx_splits,
-        extractor=chain.from_iterable,
-        n_jobs=n_jobs,
-        backend=backend,
-        show_progress_bar=show_progress_bar,
-    )(
-        spatial_splits=spatial_splits,
-        labs_splits=labs_splits,
-        labs_unique=labs_unique,
-        interval=interval,
+        f"Calculating co-occurrence probabilities for `{len(interval)}` intervals"
     )
-    out = list(out_lst)[0] if len(idx_splits) == 1 else sum(list(out_lst)) / len(idx_splits)
+
+    # Compute co-occurrence probabilities using the fast numba routine.
+    out = _co_occurrence_helper(spatial_x, spatial_y, interval, labs)
 
     if copy:
         logg.info("Finish", time=start)
         return out, interval
 
     _save_data(
-        adata,
-        attr="uns",
-        key=Key.uns.co_occurrence(cluster_key),
-        data={"occ": out, "interval": interval},
-        time=start,
+        adata, attr="uns", key=Key.uns.co_occurrence(cluster_key), data={"occ": out, "interval": interval}, time=start
     )
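For reference, a minimal sketch (hypothetical usage, not part of the patch) that exercises the new code path with the same dataset and cluster key as the ASV benchmark above:

```python
import squidpy as sq

# Same dataset and cluster key as benchmarks/benchmarks/co_occurence.py.
adata = sq.datasets.imc()

# With copy=True, co_occurrence returns (occ, interval) instead of writing
# to adata.uns["cell type_co_occurrence"].
occ, interval = sq.gr.co_occurrence(adata, cluster_key="cell type", copy=True)

# occ has shape (k, k, len(interval) - 1) for k cluster categories.
print(occ.shape, interval.shape)
```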