From 20239967971d0cac8f55738ce49cd5377bf0f73b Mon Sep 17 00:00:00 2001 From: Wenjie SUN Date: Tue, 18 Mar 2025 17:42:20 +0100 Subject: [PATCH 01/18] perf implement rust co-occurrence statistics --- pyproject.toml | 3 + src/squidpy/gr/_ppatterns.py | 201 ++++++++++++++++++++++++++-------- tests/graph/test_ppatterns.py | 31 ++++++ 3 files changed, 189 insertions(+), 46 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 1143cab94..c97919602 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -102,6 +102,9 @@ docs = [ "myst-nb>=0.17.1", "sphinx_copybutton>=0.5.0", ] +co_occurrence = [ + "ststat-rs", +] [project.urls] Homepage = "https://github.com/scverse/squidpy" diff --git a/src/squidpy/gr/_ppatterns.py b/src/squidpy/gr/_ppatterns.py index a61fed47e..ad72d384a 100644 --- a/src/squidpy/gr/_ppatterns.py +++ b/src/squidpy/gr/_ppatterns.py @@ -2,6 +2,8 @@ from __future__ import annotations +from importlib.util import find_spec + from collections.abc import Iterable, Sequence from itertools import chain from typing import TYPE_CHECKING, Any, Literal @@ -325,6 +327,73 @@ def _occur_count( return out +def _co_occurrence_origin ( + spatial: NDArrayA, + original_clust: pd.Series, + interval: NDArrayA, + copy: bool = False, + n_splits: int | None = None, + n_jobs: int | None = None, + backend: str = "loky", + show_progress_bar: bool = True +) -> NDArrayA: + + # annotate cluster idx + clust_map = {v: i for i, v in enumerate(original_clust.cat.categories.values)} + labs = np.array([clust_map[c] for c in original_clust], dtype=ip) + labs_unique = np.array(list(clust_map.values()), dtype=ip) + + # create intervals thresholds + if isinstance(interval, int): + thresh_min, thresh_max = _find_min_max(spatial) + interval = np.linspace(thresh_min, thresh_max, num=interval, dtype=fp) + else: + interval = np.array(sorted(interval), dtype=fp, copy=True) + if len(interval) <= 1: + raise ValueError(f"Expected interval to be of length `>= 2`, found `{len(interval)}`.") + + n_obs = spatial.shape[0] + if n_splits is None: + size_arr = (n_obs**2 * spatial.itemsize) / 1024 / 1024 # calc expected mem usage + n_splits = 1 + if size_arr > 2000: + while (n_obs / n_splits) > 2048: + n_splits += 1 + logg.warning( + f"`n_splits` was automatically set to `{n_splits}` to " + f"prevent `{n_obs}x{n_obs}` distance matrix from being created" + ) + n_splits = int(max(min(n_splits, n_obs), 1)) + + # split array and labels + spatial_splits = tuple(s for s in np.array_split(spatial, n_splits, axis=0) if len(s)) + labs_splits = tuple(s for s in np.array_split(labs, n_splits, axis=0) if len(s)) + # create idx array including unique combinations and self-comparison + x, y = np.triu_indices_from(np.empty((n_splits, n_splits))) + idx_splits = list(zip(x, y, strict=False)) + + n_jobs = _get_n_cores(n_jobs) + start = logg.info( + f"Calculating co-occurrence probabilities for `{len(interval)}` intervals " + f"`{len(idx_splits)}` split combinations using `{n_jobs}` core(s)" + ) + + out_lst = parallelize( + _co_occurrence_helper, + collection=idx_splits, + extractor=chain.from_iterable, + n_jobs=n_jobs, + backend=backend, + show_progress_bar=show_progress_bar, + )( + spatial_splits=spatial_splits, + labs_splits=labs_splits, + labs_unique=labs_unique, + interval=interval, + ) + out = list(out_lst)[0] if len(idx_splits) == 1 else sum(list(out_lst)) / len(idx_splits) + + return out def _co_occurrence_helper( idx_splits: Iterable[tuple[int, int]], @@ -352,6 +421,55 @@ def _co_occurrence_helper( return out_lst +def _co_occurrence_rs( + 
spatial: NDArrayA, + original_clust: pd.Series, + interval: NDArrayA +) -> NDArrayA | None: + + from ststat_rs import co_occur_count + + clust = original_clust.cat.codes.astype(np.int32) + + co_occur_3d = co_occur_count( + spatial[:, 0], + spatial[:, 1], + interval, clust + ) + + ## for each dimension + # for i in range(co_occur.shape[2]): + num = co_occur_3d.shape[0] + labs_unique = range(co_occur_3d.shape[0]) + out = np.zeros((num, num, interval.shape[0] - 1), dtype=fp) + + interval_seq = np.arange(interval.shape[0] - 2, -1, -1) + interval_seq.shape[0] + + + for i_interval in interval_seq: + # print(i_interval) + co_occur = co_occur_3d[:, :, i_interval] + + probs_matrix = co_occur / np.sum(co_occur) if np.sum(co_occur) != 0 else np.zeros((num, num), dtype=fp) + probs = np.sum(probs_matrix, axis=0) + probs_con = np.zeros((num, num), dtype=fp) + + for c in labs_unique: + probs_conditional = ( + co_occur[c] / np.sum(co_occur[c]) if np.sum(co_occur[c]) != 0 else np.zeros(num, dtype=fp) + ) + probs_con[c, :] = np.zeros(num, dtype=fp) + for i in range(num): + if probs[i] == 0: + probs_con[c, i] = 0 + else: + probs_con[c, i] = probs_conditional[i] / probs[i] + + # print(interval_seq.shape[0] - 1 - i_interval) + out[:, :, interval_seq.shape[0] - 1 - i_interval] = probs_con + + return out @d.dedent def co_occurrence( @@ -364,6 +482,7 @@ def co_occurrence( n_jobs: int | None = None, backend: str = "loky", show_progress_bar: bool = True, + use_rust: bool = False ) -> tuple[NDArrayA, NDArrayA] | None: """ Compute co-occurrence probability of clusters. @@ -381,6 +500,9 @@ def co_occurrence( Number of splits in which to divide the spatial coordinates in :attr:`anndata.AnnData.obsm` ``['{spatial_key}']``. %(parallelize)s + use_rust + Whether to use the rust implementation of the function. If ``True``, the rust implementation will be used. + If ``False``, the python implementation will. Returns ------- @@ -393,6 +515,12 @@ def co_occurrence( - :attr:`anndata.AnnData.uns` ``['{cluster_key}_co_occurrence']['interval']`` - the distance thresholds computed at ``interval``. """ + + + if use_rust and not find_spec("ststat_rs"): + ## issue an error and ask user to install ststat_rs + raise ImportError("ststat_rs not found. 
Please install ststat_rs to use the rust version of co-occurrence function: `pip install ststat-rs`") + if isinstance(adata, SpatialData): adata = adata.table _assert_categorical_obs(adata, key=cluster_key) @@ -401,11 +529,6 @@ def co_occurrence( spatial = adata.obsm[spatial_key].astype(fp) original_clust = adata.obs[cluster_key] - # annotate cluster idx - clust_map = {v: i for i, v in enumerate(original_clust.cat.categories.values)} - labs = np.array([clust_map[c] for c in original_clust], dtype=ip) - labs_unique = np.array(list(clust_map.values()), dtype=ip) - # create intervals thresholds if isinstance(interval, int): thresh_min, thresh_max = _find_min_max(spatial) @@ -415,46 +538,32 @@ def co_occurrence( if len(interval) <= 1: raise ValueError(f"Expected interval to be of length `>= 2`, found `{len(interval)}`.") - n_obs = spatial.shape[0] - if n_splits is None: - size_arr = (n_obs**2 * spatial.itemsize) / 1024 / 1024 # calc expected mem usage - n_splits = 1 - if size_arr > 2000: - while (n_obs / n_splits) > 2048: - n_splits += 1 - logg.warning( - f"`n_splits` was automatically set to `{n_splits}` to " - f"prevent `{n_obs}x{n_obs}` distance matrix from being created" - ) - n_splits = int(max(min(n_splits, n_obs), 1)) - - # split array and labels - spatial_splits = tuple(s for s in np.array_split(spatial, n_splits, axis=0) if len(s)) - labs_splits = tuple(s for s in np.array_split(labs, n_splits, axis=0) if len(s)) - # create idx array including unique combinations and self-comparison - x, y = np.triu_indices_from(np.empty((n_splits, n_splits))) - idx_splits = list(zip(x, y, strict=False)) - - n_jobs = _get_n_cores(n_jobs) - start = logg.info( - f"Calculating co-occurrence probabilities for `{len(interval)}` intervals " - f"`{len(idx_splits)}` split combinations using `{n_jobs}` core(s)" - ) - - out_lst = parallelize( - _co_occurrence_helper, - collection=idx_splits, - extractor=chain.from_iterable, - n_jobs=n_jobs, - backend=backend, - show_progress_bar=show_progress_bar, - )( - spatial_splits=spatial_splits, - labs_splits=labs_splits, - labs_unique=labs_unique, - interval=interval, - ) - out = list(out_lst)[0] if len(idx_splits) == 1 else sum(list(out_lst)) / len(idx_splits) + if use_rust: + out = _co_occurrence_rs( + spatial, + original_clust, + interval + ) + start = logg.info( + f"Calculating co-occurrence probabilities for `{len(interval)}` intervals" + ) + else: + # logg.info("Using python implementation of co-occurrence function. For faster computation, consider using the rust implementation by setting `use_rust=True`") + print("Using python implementation of co-occurrence function. 
For faster computation, consider using the rust implementation by setting `use_rust=True`") + + out = _co_occurrence_origin( + spatial, + original_clust, + interval, + copy=copy, + n_splits=n_splits, + n_jobs=n_jobs, + backend=backend, + show_progress_bar=show_progress_bar + ) + start = logg.info( + f"Calculating co-occurrence probabilities for `{len(interval)}` intervals using `{n_jobs}` core(s) and `{n_splits}` splits" + ) if copy: logg.info("Finish", time=start) @@ -465,7 +574,7 @@ def co_occurrence( attr="uns", key=Key.uns.co_occurrence(cluster_key), data={"occ": out, "interval": interval}, - time=start, + time=start ) diff --git a/tests/graph/test_ppatterns.py b/tests/graph/test_ppatterns.py index 6752b1548..91502ba2f 100644 --- a/tests/graph/test_ppatterns.py +++ b/tests/graph/test_ppatterns.py @@ -130,6 +130,24 @@ def test_co_occurrence(adata: AnnData): assert arr.shape[2] == 49 assert arr.shape[1] == arr.shape[0] == adata.obs["leiden"].unique().shape[0] +def test_co_occurrence_rs(adata: AnnData): + """ + check co_occurrence score and shape (rust implementation) + """ + co_occurrence(adata, cluster_key="leiden", use_rust=True) + + # assert occurrence in adata.uns + assert "leiden_co_occurrence" in adata.uns.keys() + assert "occ" in adata.uns["leiden_co_occurrence"].keys() + assert "interval" in adata.uns["leiden_co_occurrence"].keys() + + # assert shapes + arr = adata.uns["leiden_co_occurrence"]["occ"] + assert arr.ndim == 3 + assert arr.shape[2] == 49 + assert arr.shape[1] == arr.shape[0] == adata.obs["leiden"].unique().shape[0] + + # @pytest.mark.parametrize(("ys", "xs"), [(10, 10), (None, None), (10, 20)]) @pytest.mark.parametrize(("n_jobs", "n_splits"), [(1, 2), (2, 2)]) @@ -155,6 +173,19 @@ def test_co_occurrence_explicit_interval(adata: AnnData, size: int): assert interval is not interval_1 np.testing.assert_allclose(interval, interval_1) # allclose because in the func, we use f32 +@pytest.mark.parametrize("size", [1, 3]) +def test_co_occurrence_rs_explicit_interval(adata: AnnData, size: int): + minn, maxx = _find_min_max(adata.obsm[Key.obsm.spatial]) + interval = np.linspace(minn, maxx, size) + if size == 1: + with pytest.raises(ValueError, match=r"Expected interval to be of length"): + _ = co_occurrence(adata, cluster_key="leiden", copy=True, interval=interval, use_rust=True) + else: + _, interval_1 = co_occurrence_rs(adata, cluster_key="leiden", copy=True, interval=interval, use_rust=True) + + assert interval is not interval_1 + np.testing.assert_allclose(interval, interval_1) # allclose because in the func, we use f32 + def test_use_raw(dummy_adata: AnnData): var_names = [str(i) for i in range(10)] From e51f5467ac867613c8103bdab06b71c9c17c402a Mon Sep 17 00:00:00 2001 From: Wenjie SUN Date: Tue, 18 Mar 2025 22:11:44 +0100 Subject: [PATCH 02/18] misc: change rust-py deps --- pyproject.toml | 2 +- src/squidpy/gr/_ppatterns.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index c97919602..6183a4577 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -103,7 +103,7 @@ docs = [ "sphinx_copybutton>=0.5.0", ] co_occurrence = [ - "ststat-rs", + "scstat-rs", ] [project.urls] diff --git a/src/squidpy/gr/_ppatterns.py b/src/squidpy/gr/_ppatterns.py index ad72d384a..5104cce52 100644 --- a/src/squidpy/gr/_ppatterns.py +++ b/src/squidpy/gr/_ppatterns.py @@ -427,7 +427,7 @@ def _co_occurrence_rs( interval: NDArrayA ) -> NDArrayA | None: - from ststat_rs import co_occur_count + from scstat_rs import co_occur_count clust = 
original_clust.cat.codes.astype(np.int32) From a5b82265371c47f6f8293803f1b8cb986c168c01 Mon Sep 17 00:00:00 2001 From: Wenjie SUN Date: Tue, 18 Mar 2025 22:35:34 +0100 Subject: [PATCH 03/18] doc: improve the documentation --- src/squidpy/gr/_ppatterns.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/src/squidpy/gr/_ppatterns.py b/src/squidpy/gr/_ppatterns.py index 5104cce52..601367c6f 100644 --- a/src/squidpy/gr/_ppatterns.py +++ b/src/squidpy/gr/_ppatterns.py @@ -274,6 +274,7 @@ def _score_helper( return score_perms +## The _occur_count() function is needed for the _co_occurrence_origin implementation @njit( ft[:, :, :](tt(it[:], 2), ft[:, :], it[:], ft[:], bl), parallel=False, @@ -395,6 +396,8 @@ def _co_occurrence_origin ( return out + +## The _co_occurrence_helper() function is needed for the _co_occurrence_origin implementation def _co_occurrence_helper( idx_splits: Iterable[tuple[int, int]], spatial_splits: Sequence[NDArrayA], @@ -501,8 +504,11 @@ def co_occurrence( :attr:`anndata.AnnData.obsm` ``['{spatial_key}']``. %(parallelize)s use_rust - Whether to use the rust implementation of the function. If ``True``, the rust implementation will be used. - If ``False``, the python implementation will. + Whether to use the rust implementation of the co-occurrence probability. + If ``True``, the rust implementation will be used. + If ``False``, the python implementation will be used. + For rust implementation, the `scstat-rs` package is required. + You can install it via `pip install scstat-rs`. Returns ------- @@ -517,9 +523,9 @@ def co_occurrence( """ - if use_rust and not find_spec("ststat_rs"): - ## issue an error and ask user to install ststat_rs - raise ImportError("ststat_rs not found. Please install ststat_rs to use the rust version of co-occurrence function: `pip install ststat-rs`") + if use_rust and not find_spec("scstat_rs"): + ## issue an error and ask user to install scstat_rs + raise ImportError("scstat_rs not found. 
Please install scstat_rs to use the rust version of co-occurrence function: `pip install scstat-rs`") if isinstance(adata, SpatialData): adata = adata.table From f7ff2936c9126d2b3f9464351189b62da4975496 Mon Sep 17 00:00:00 2001 From: Daniele Lucarelli <108922919+MDLDan@users.noreply.github.com> Date: Wed, 19 Mar 2025 09:13:02 +0100 Subject: [PATCH 04/18] add python re-implementation --- src/squidpy/gr/_ppatterns.py | 301 +++++++++++++++++------------------ 1 file changed, 144 insertions(+), 157 deletions(-) diff --git a/src/squidpy/gr/_ppatterns.py b/src/squidpy/gr/_ppatterns.py index 601367c6f..3ebdd54de 100644 --- a/src/squidpy/gr/_ppatterns.py +++ b/src/squidpy/gr/_ppatterns.py @@ -12,7 +12,7 @@ import numpy as np import pandas as pd from anndata import AnnData -from numba import njit +from numba import njit, prange from numpy.random import default_rng from scanpy import logging as logg from scanpy.get import _get_obs_rep @@ -274,155 +274,147 @@ def _score_helper( return score_perms -## The _occur_count() function is needed for the _co_occurrence_origin implementation -@njit( - ft[:, :, :](tt(it[:], 2), ft[:, :], it[:], ft[:], bl), - parallel=False, - fastmath=True, -) +@njit(parallel=True, fastmath=True) def _occur_count( - clust: tuple[NDArrayA, NDArrayA], - pw_dist: NDArrayA, - labs_unique: NDArrayA, - interval: NDArrayA, - same_split: bool, -) -> NDArrayA: - num = labs_unique.shape[0] - out = np.zeros((num, num, interval.shape[0] - 1), dtype=ft) - - for idx in range(interval.shape[0] - 1): - co_occur = np.zeros((num, num), dtype=ft) - probs_con = np.zeros((num, num), dtype=ft) - - thres_max = interval[idx + 1] - clust_x, clust_y = clust - - # Modified to compute co-occurrence probability ratio over increasing radii sizes as opposed to discrete interval bins - # Need pw_dist > 0 to avoid counting a cell with itself as co-occurrence - idx_x, idx_y = np.nonzero((pw_dist <= thres_max) & (pw_dist > 0)) - x = clust_x[idx_x] - y = clust_y[idx_y] - # Treat computing co-occurrence using the same split and different splits differently - # Pairwise distance matrix for between the same split is symmetric and therefore only needs to be counted once - for i, j in zip(x, y): # noqa: B905 # cannot use strict=False because of numba - co_occur[i, j] += 1 - if not same_split: - co_occur[j, i] += 1 - - # Prevent divison by zero errors when we have low cell counts/small intervals - probs_matrix = co_occur / np.sum(co_occur) if np.sum(co_occur) != 0 else np.zeros((num, num), dtype=ft) - probs = np.sum(probs_matrix, axis=0) - - for c in labs_unique: - probs_conditional = ( - co_occur[c] / np.sum(co_occur[c]) if np.sum(co_occur[c]) != 0 else np.zeros(num, dtype=ft) - ) - probs_con[c, :] = np.zeros(num, dtype=ft) - for i in range(num): - if probs[i] == 0: - probs_con[c, i] = 0 - else: - probs_con[c, i] = probs_conditional[i] / probs[i] - - out[:, :, idx] = probs_con - - return out - -def _co_occurrence_origin ( - spatial: NDArrayA, - original_clust: pd.Series, - interval: NDArrayA, - copy: bool = False, - n_splits: int | None = None, - n_jobs: int | None = None, - backend: str = "loky", - show_progress_bar: bool = True -) -> NDArrayA: - - # annotate cluster idx - clust_map = {v: i for i, v in enumerate(original_clust.cat.categories.values)} - labs = np.array([clust_map[c] for c in original_clust], dtype=ip) - labs_unique = np.array(list(clust_map.values()), dtype=ip) - - # create intervals thresholds - if isinstance(interval, int): - thresh_min, thresh_max = _find_min_max(spatial) - interval = 
np.linspace(thresh_min, thresh_max, num=interval, dtype=fp) - else: - interval = np.array(sorted(interval), dtype=fp, copy=True) - if len(interval) <= 1: - raise ValueError(f"Expected interval to be of length `>= 2`, found `{len(interval)}`.") + spatial_x: NDArrayA, + spatial_y:NDArrayA, + thresholds: NDArrayA, + label_idx: NDArrayA, + n: int, + k: int, + l_val: int +)-> NDArrayA: + + # Allocate a 2D array to store a flat local result per point. + local_results = np.zeros((n, l_val * k * k), dtype=np.int32) + for i in prange(n): + local_counts = np.zeros(l_val * k * k, dtype=np.int32) + for j in range(n): + if i == j: + continue + dx = spatial_x[i] - spatial_x[j] + dy = spatial_y[i] - spatial_y[j] + dist_sq = dx * dx + dy * dy + for r in range(l_val): + thresh = thresholds[r] + if dist_sq <= thresh: + index = r * (k * k) + label_idx[i] * k + label_idx[j] + local_counts[index] += 1 + #else: + # break # If this threshold fails, smaller ones will also fail. + for m in range(l_val * k * k): + local_results[i, m] = local_counts[m] + + # Reduce over all points: sum the local counts. + result_flat = np.zeros(l_val * k * k, dtype=np.int32) + for m in range(l_val * k * k): + s = 0 + for i in range(n): + s += local_results[i, m] + result_flat[m] = s + + # Reshape to a 3D array with shape (interval, k, k) + counts = np.empty((l_val, k, k), dtype=np.int32) + for r in range(l_val): + for i in range(k): + for j in range(k): + counts[r, i, j] = result_flat[r * (k * k) + i * k + j] + + # Rearrange axes to match the original convention: (k, k, interval) + result = np.empty((k, k, l_val), dtype=np.int32) + for i in range(k): + for j in range(k): + for r in range(l_val): + result[i, j, r] = counts[r, i, j] + return result - n_obs = spatial.shape[0] - if n_splits is None: - size_arr = (n_obs**2 * spatial.itemsize) / 1024 / 1024 # calc expected mem usage - n_splits = 1 - if size_arr > 2000: - while (n_obs / n_splits) > 2048: - n_splits += 1 - logg.warning( - f"`n_splits` was automatically set to `{n_splits}` to " - f"prevent `{n_obs}x{n_obs}` distance matrix from being created" - ) - n_splits = int(max(min(n_splits, n_obs), 1)) - - # split array and labels - spatial_splits = tuple(s for s in np.array_split(spatial, n_splits, axis=0) if len(s)) - labs_splits = tuple(s for s in np.array_split(labs, n_splits, axis=0) if len(s)) - # create idx array including unique combinations and self-comparison - x, y = np.triu_indices_from(np.empty((n_splits, n_splits))) - idx_splits = list(zip(x, y, strict=False)) - - n_jobs = _get_n_cores(n_jobs) - start = logg.info( - f"Calculating co-occurrence probabilities for `{len(interval)}` intervals " - f"`{len(idx_splits)}` split combinations using `{n_jobs}` core(s)" - ) - - out_lst = parallelize( - _co_occurrence_helper, - collection=idx_splits, - extractor=chain.from_iterable, - n_jobs=n_jobs, - backend=backend, - show_progress_bar=show_progress_bar, - )( - spatial_splits=spatial_splits, - labs_splits=labs_splits, - labs_unique=labs_unique, - interval=interval, - ) - out = list(out_lst)[0] if len(idx_splits) == 1 else sum(list(out_lst)) / len(idx_splits) - return out -## The _co_occurrence_helper() function is needed for the _co_occurrence_origin implementation def _co_occurrence_helper( - idx_splits: Iterable[tuple[int, int]], - spatial_splits: Sequence[NDArrayA], - labs_splits: Sequence[NDArrayA], - labs_unique: NDArrayA, - interval: NDArrayA, - queue: SigQueue | None = None, -) -> pd.DataFrame: - out_lst = [] - for t in idx_splits: - idx_x, idx_y = t - labs_x = 
labs_splits[idx_x] - labs_y = labs_splits[idx_y] - dist = pairwise_distances(spatial_splits[idx_x], spatial_splits[idx_y]) - - out = _occur_count((labs_x, labs_y), dist, labs_unique, interval, idx_x == idx_y) - out_lst.append(out) - - if queue is not None: - queue.put(Signal.UPDATE) - - if queue is not None: - queue.put(Signal.FINISH) - - return out_lst + v_x: NDArrayA, + v_y: NDArrayA, + v_radium: NDArrayA, + labs: NDArrayA +)-> NDArrayA: + """ + Fast co-occurrence probability computation using the new numba-accelerated counting. + + Parameters + ---------- + v_x : np.ndarray, float64 + x–coordinates. + v_y : np.ndarray, float64 + y–coordinates. + v_radium : np.ndarray, float64 + Distance thresholds (in ascending order). + labs : np.ndarray + Cluster labels (as integers). + + Returns + ------- + occ_prob : np.ndarray + A 3D array of shape (k, k, len(v_radium)-1) containing the co-occurrence probabilities. + labs_unique : np.ndarray + Array of unique labels. + """ + n = len(v_x) + labs_unique = np.unique(labs) + k = len(labs_unique) + # l_val is the number of bins; here we assume the thresholds come from v_radium[1:]. + l_val = len(v_radium) - 1 + # Compute squared thresholds from the interval (skip the first value) + thresholds = (v_radium[1:]) ** 2 + + # Compute cco-occurence ounts. + counts = _occur_count(v_x, v_y, thresholds, labs, n, k, l_val) + + # Compute co-occurrence probabilities for each threshold bin. + occ_prob = np.empty((k, k, l_val), dtype=np.float32) + for r in range(l_val): + co_occur = counts[:, :, r].astype(np.float32) + + # Compute the total count for this threshold. + total = 0.0 + for i in range(k): + for j in range(k): + total += co_occur[i, j] + + # Compute the normalized probability matrix. + probs_matrix = np.zeros((k, k), dtype=np.float32) + if total != 0.0: + for i in range(k): + for j in range(k): + probs_matrix[i, j] = co_occur[i, j] / total + + probs = np.zeros(k, dtype=np.float32) + for j in range(k): + s = 0.0 + for i in range(k): + s += probs_matrix[i, j] + probs[j] = s + + # Compute conditional probabilities. + probs_con = np.zeros((k, k), dtype=np.float32) + for c in range(k): + row_sum = 0.0 + for j in range(k): + row_sum += co_occur[c, j] + for i in range(k): + cond = 0.0 + if row_sum != 0.0: + cond = co_occur[c, i] / row_sum + if probs[i] == 0.0: + probs_con[c, i] = 0.0 + else: + probs_con[c, i] = cond / probs[i] + + # Transpose to match (k, k, interval). + for i in range(k): + for c in range(k): + occ_prob[i, c, r] = probs_con[c, i] + + return occ_prob def _co_occurrence_rs( spatial: NDArrayA, @@ -451,7 +443,6 @@ def _co_occurrence_rs( for i_interval in interval_seq: - # print(i_interval) co_occur = co_occur_3d[:, :, i_interval] probs_matrix = co_occur / np.sum(co_occur) if np.sum(co_occur) != 0 else np.zeros((num, num), dtype=fp) @@ -472,7 +463,7 @@ def _co_occurrence_rs( # print(interval_seq.shape[0] - 1 - i_interval) out[:, :, interval_seq.shape[0] - 1 - i_interval] = probs_con - return out + return out, labs_unique @d.dedent def co_occurrence( @@ -534,7 +525,9 @@ def co_occurrence( spatial = adata.obsm[spatial_key].astype(fp) original_clust = adata.obs[cluster_key] - + clust_map = {v: i for i, v in enumerate(original_clust.cat.categories.values)} + labs = np.array([clust_map[c] for c in original_clust], dtype=ip) + # create intervals thresholds if isinstance(interval, int): thresh_min, thresh_max = _find_min_max(spatial) @@ -556,17 +549,11 @@ def co_occurrence( else: # logg.info("Using python implementation of co-occurrence function. 
For faster computation, consider using the rust implementation by setting `use_rust=True`") print("Using python implementation of co-occurrence function. For faster computation, consider using the rust implementation by setting `use_rust=True`") + spatial_x = spatial[:, 0] + spatial_y = spatial[:, 1] - out = _co_occurrence_origin( - spatial, - original_clust, - interval, - copy=copy, - n_splits=n_splits, - n_jobs=n_jobs, - backend=backend, - show_progress_bar=show_progress_bar - ) + # Compute co-occurrence probabilities using the fast numba routine. + out = _co_occurrence_helper(spatial_x, spatial_y, interval, labs) start = logg.info( f"Calculating co-occurrence probabilities for `{len(interval)}` intervals using `{n_jobs}` core(s) and `{n_splits}` splits" ) From 9af42521903c82021f9f253b23083b2def4efd82 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 19 Mar 2025 09:50:26 +0000 Subject: [PATCH 05/18] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/squidpy/gr/_ppatterns.py | 92 ++++++++++++----------------------- tests/graph/test_ppatterns.py | 3 +- 2 files changed, 33 insertions(+), 62 deletions(-) diff --git a/src/squidpy/gr/_ppatterns.py b/src/squidpy/gr/_ppatterns.py index 3ebdd54de..e48d0671b 100644 --- a/src/squidpy/gr/_ppatterns.py +++ b/src/squidpy/gr/_ppatterns.py @@ -2,9 +2,8 @@ from __future__ import annotations -from importlib.util import find_spec - from collections.abc import Iterable, Sequence +from importlib.util import find_spec from itertools import chain from typing import TYPE_CHECKING, Any, Literal @@ -276,15 +275,8 @@ def _score_helper( @njit(parallel=True, fastmath=True) def _occur_count( - spatial_x: NDArrayA, - spatial_y:NDArrayA, - thresholds: NDArrayA, - label_idx: NDArrayA, - n: int, - k: int, - l_val: int -)-> NDArrayA: - + spatial_x: NDArrayA, spatial_y: NDArrayA, thresholds: NDArrayA, label_idx: NDArrayA, n: int, k: int, l_val: int +) -> NDArrayA: # Allocate a 2D array to store a flat local result per point. local_results = np.zeros((n, l_val * k * k), dtype=np.int32) for i in prange(n): @@ -300,11 +292,11 @@ def _occur_count( if dist_sq <= thresh: index = r * (k * k) + label_idx[i] * k + label_idx[j] local_counts[index] += 1 - #else: + # else: # break # If this threshold fails, smaller ones will also fail. for m in range(l_val * k * k): local_results[i, m] = local_counts[m] - + # Reduce over all points: sum the local counts. result_flat = np.zeros(l_val * k * k, dtype=np.int32) for m in range(l_val * k * k): @@ -319,7 +311,7 @@ def _occur_count( for i in range(k): for j in range(k): counts[r, i, j] = result_flat[r * (k * k) + i * k + j] - + # Rearrange axes to match the original convention: (k, k, interval) result = np.empty((k, k, l_val), dtype=np.int32) for i in range(k): @@ -329,17 +321,10 @@ def _occur_count( return result - - -def _co_occurrence_helper( - v_x: NDArrayA, - v_y: NDArrayA, - v_radium: NDArrayA, - labs: NDArrayA -)-> NDArrayA: +def _co_occurrence_helper(v_x: NDArrayA, v_y: NDArrayA, v_radium: NDArrayA, labs: NDArrayA) -> NDArrayA: """ Fast co-occurrence probability computation using the new numba-accelerated counting. - + Parameters ---------- v_x : np.ndarray, float64 @@ -350,7 +335,7 @@ def _co_occurrence_helper( Distance thresholds (in ascending order). labs : np.ndarray Cluster labels (as integers). 
- + Returns ------- occ_prob : np.ndarray @@ -365,35 +350,35 @@ def _co_occurrence_helper( l_val = len(v_radium) - 1 # Compute squared thresholds from the interval (skip the first value) thresholds = (v_radium[1:]) ** 2 - + # Compute cco-occurence ounts. counts = _occur_count(v_x, v_y, thresholds, labs, n, k, l_val) - + # Compute co-occurrence probabilities for each threshold bin. occ_prob = np.empty((k, k, l_val), dtype=np.float32) for r in range(l_val): co_occur = counts[:, :, r].astype(np.float32) - + # Compute the total count for this threshold. total = 0.0 for i in range(k): for j in range(k): total += co_occur[i, j] - + # Compute the normalized probability matrix. probs_matrix = np.zeros((k, k), dtype=np.float32) if total != 0.0: for i in range(k): for j in range(k): probs_matrix[i, j] = co_occur[i, j] / total - + probs = np.zeros(k, dtype=np.float32) for j in range(k): s = 0.0 for i in range(k): s += probs_matrix[i, j] probs[j] = s - + # Compute conditional probabilities. probs_con = np.zeros((k, k), dtype=np.float32) for c in range(k): @@ -408,29 +393,21 @@ def _co_occurrence_helper( probs_con[c, i] = 0.0 else: probs_con[c, i] = cond / probs[i] - + # Transpose to match (k, k, interval). for i in range(k): for c in range(k): occ_prob[i, c, r] = probs_con[c, i] - + return occ_prob -def _co_occurrence_rs( - spatial: NDArrayA, - original_clust: pd.Series, - interval: NDArrayA -) -> NDArrayA | None: +def _co_occurrence_rs(spatial: NDArrayA, original_clust: pd.Series, interval: NDArrayA) -> NDArrayA | None: from scstat_rs import co_occur_count clust = original_clust.cat.codes.astype(np.int32) - co_occur_3d = co_occur_count( - spatial[:, 0], - spatial[:, 1], - interval, clust - ) + co_occur_3d = co_occur_count(spatial[:, 0], spatial[:, 1], interval, clust) ## for each dimension # for i in range(co_occur.shape[2]): @@ -441,7 +418,6 @@ def _co_occurrence_rs( interval_seq = np.arange(interval.shape[0] - 2, -1, -1) interval_seq.shape[0] - for i_interval in interval_seq: co_occur = co_occur_3d[:, :, i_interval] @@ -465,6 +441,7 @@ def _co_occurrence_rs( return out, labs_unique + @d.dedent def co_occurrence( adata: AnnData | SpatialData, @@ -476,7 +453,7 @@ def co_occurrence( n_jobs: int | None = None, backend: str = "loky", show_progress_bar: bool = True, - use_rust: bool = False + use_rust: bool = False, ) -> tuple[NDArrayA, NDArrayA] | None: """ Compute co-occurrence probability of clusters. @@ -513,10 +490,11 @@ def co_occurrence( computed at ``interval``. """ - if use_rust and not find_spec("scstat_rs"): ## issue an error and ask user to install scstat_rs - raise ImportError("scstat_rs not found. Please install scstat_rs to use the rust version of co-occurrence function: `pip install scstat-rs`") + raise ImportError( + "scstat_rs not found. 
Please install scstat_rs to use the rust version of co-occurrence function: `pip install scstat-rs`" + ) if isinstance(adata, SpatialData): adata = adata.table @@ -527,7 +505,7 @@ def co_occurrence( original_clust = adata.obs[cluster_key] clust_map = {v: i for i, v in enumerate(original_clust.cat.categories.values)} labs = np.array([clust_map[c] for c in original_clust], dtype=ip) - + # create intervals thresholds if isinstance(interval, int): thresh_min, thresh_max = _find_min_max(spatial) @@ -538,17 +516,13 @@ def co_occurrence( raise ValueError(f"Expected interval to be of length `>= 2`, found `{len(interval)}`.") if use_rust: - out = _co_occurrence_rs( - spatial, - original_clust, - interval - ) - start = logg.info( - f"Calculating co-occurrence probabilities for `{len(interval)}` intervals" - ) + out = _co_occurrence_rs(spatial, original_clust, interval) + start = logg.info(f"Calculating co-occurrence probabilities for `{len(interval)}` intervals") else: # logg.info("Using python implementation of co-occurrence function. For faster computation, consider using the rust implementation by setting `use_rust=True`") - print("Using python implementation of co-occurrence function. For faster computation, consider using the rust implementation by setting `use_rust=True`") + print( + "Using python implementation of co-occurrence function. For faster computation, consider using the rust implementation by setting `use_rust=True`" + ) spatial_x = spatial[:, 0] spatial_y = spatial[:, 1] @@ -563,11 +537,7 @@ def co_occurrence( return out, interval _save_data( - adata, - attr="uns", - key=Key.uns.co_occurrence(cluster_key), - data={"occ": out, "interval": interval}, - time=start + adata, attr="uns", key=Key.uns.co_occurrence(cluster_key), data={"occ": out, "interval": interval}, time=start ) diff --git a/tests/graph/test_ppatterns.py b/tests/graph/test_ppatterns.py index 91502ba2f..997dcaf4f 100644 --- a/tests/graph/test_ppatterns.py +++ b/tests/graph/test_ppatterns.py @@ -130,6 +130,7 @@ def test_co_occurrence(adata: AnnData): assert arr.shape[2] == 49 assert arr.shape[1] == arr.shape[0] == adata.obs["leiden"].unique().shape[0] + def test_co_occurrence_rs(adata: AnnData): """ check co_occurrence score and shape (rust implementation) @@ -148,7 +149,6 @@ def test_co_occurrence_rs(adata: AnnData): assert arr.shape[1] == arr.shape[0] == adata.obs["leiden"].unique().shape[0] - # @pytest.mark.parametrize(("ys", "xs"), [(10, 10), (None, None), (10, 20)]) @pytest.mark.parametrize(("n_jobs", "n_splits"), [(1, 2), (2, 2)]) def test_co_occurrence_reproducibility(adata: AnnData, n_jobs: int, n_splits: int): @@ -173,6 +173,7 @@ def test_co_occurrence_explicit_interval(adata: AnnData, size: int): assert interval is not interval_1 np.testing.assert_allclose(interval, interval_1) # allclose because in the func, we use f32 + @pytest.mark.parametrize("size", [1, 3]) def test_co_occurrence_rs_explicit_interval(adata: AnnData, size: int): minn, maxx = _find_min_max(adata.obsm[Key.obsm.spatial]) From 9457767553cd58919d656031667561ccb533c102 Mon Sep 17 00:00:00 2001 From: Wenjie SUN Date: Wed, 19 Mar 2025 11:01:33 +0100 Subject: [PATCH 06/18] Clean the tests and dependencies --- pyproject.toml | 3 - src/squidpy/gr/_ppatterns.py | 136 ++++++++-------------------------- tests/graph/test_ppatterns.py | 31 -------- 3 files changed, 31 insertions(+), 139 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 6183a4577..1143cab94 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -102,9 +102,6 @@ docs = [ 
"myst-nb>=0.17.1", "sphinx_copybutton>=0.5.0", ] -co_occurrence = [ - "scstat-rs", -] [project.urls] Homepage = "https://github.com/scverse/squidpy" diff --git a/src/squidpy/gr/_ppatterns.py b/src/squidpy/gr/_ppatterns.py index 3ebdd54de..962fa0ca9 100644 --- a/src/squidpy/gr/_ppatterns.py +++ b/src/squidpy/gr/_ppatterns.py @@ -2,8 +2,6 @@ from __future__ import annotations -from importlib.util import find_spec - from collections.abc import Iterable, Sequence from itertools import chain from typing import TYPE_CHECKING, Any, Literal @@ -276,12 +274,12 @@ def _score_helper( @njit(parallel=True, fastmath=True) def _occur_count( - spatial_x: NDArrayA, - spatial_y:NDArrayA, - thresholds: NDArrayA, - label_idx: NDArrayA, - n: int, - k: int, + spatial_x: NDArrayA, + spatial_y:NDArrayA, + thresholds: NDArrayA, + label_idx: NDArrayA, + n: int, + k: int, l_val: int )-> NDArrayA: @@ -304,7 +302,7 @@ def _occur_count( # break # If this threshold fails, smaller ones will also fail. for m in range(l_val * k * k): local_results[i, m] = local_counts[m] - + # Reduce over all points: sum the local counts. result_flat = np.zeros(l_val * k * k, dtype=np.int32) for m in range(l_val * k * k): @@ -319,7 +317,7 @@ def _occur_count( for i in range(k): for j in range(k): counts[r, i, j] = result_flat[r * (k * k) + i * k + j] - + # Rearrange axes to match the original convention: (k, k, interval) result = np.empty((k, k, l_val), dtype=np.int32) for i in range(k): @@ -332,14 +330,14 @@ def _occur_count( def _co_occurrence_helper( - v_x: NDArrayA, - v_y: NDArrayA, - v_radium: NDArrayA, + v_x: NDArrayA, + v_y: NDArrayA, + v_radium: NDArrayA, labs: NDArrayA )-> NDArrayA: """ Fast co-occurrence probability computation using the new numba-accelerated counting. - + Parameters ---------- v_x : np.ndarray, float64 @@ -350,7 +348,7 @@ def _co_occurrence_helper( Distance thresholds (in ascending order). labs : np.ndarray Cluster labels (as integers). - + Returns ------- occ_prob : np.ndarray @@ -365,35 +363,35 @@ def _co_occurrence_helper( l_val = len(v_radium) - 1 # Compute squared thresholds from the interval (skip the first value) thresholds = (v_radium[1:]) ** 2 - + # Compute cco-occurence ounts. counts = _occur_count(v_x, v_y, thresholds, labs, n, k, l_val) - + # Compute co-occurrence probabilities for each threshold bin. occ_prob = np.empty((k, k, l_val), dtype=np.float32) for r in range(l_val): co_occur = counts[:, :, r].astype(np.float32) - + # Compute the total count for this threshold. total = 0.0 for i in range(k): for j in range(k): total += co_occur[i, j] - + # Compute the normalized probability matrix. probs_matrix = np.zeros((k, k), dtype=np.float32) if total != 0.0: for i in range(k): for j in range(k): probs_matrix[i, j] = co_occur[i, j] / total - + probs = np.zeros(k, dtype=np.float32) for j in range(k): s = 0.0 for i in range(k): s += probs_matrix[i, j] probs[j] = s - + # Compute conditional probabilities. probs_con = np.zeros((k, k), dtype=np.float32) for c in range(k): @@ -408,62 +406,13 @@ def _co_occurrence_helper( probs_con[c, i] = 0.0 else: probs_con[c, i] = cond / probs[i] - + # Transpose to match (k, k, interval). 
for i in range(k): for c in range(k): occ_prob[i, c, r] = probs_con[c, i] - - return occ_prob -def _co_occurrence_rs( - spatial: NDArrayA, - original_clust: pd.Series, - interval: NDArrayA -) -> NDArrayA | None: - - from scstat_rs import co_occur_count - - clust = original_clust.cat.codes.astype(np.int32) - - co_occur_3d = co_occur_count( - spatial[:, 0], - spatial[:, 1], - interval, clust - ) - - ## for each dimension - # for i in range(co_occur.shape[2]): - num = co_occur_3d.shape[0] - labs_unique = range(co_occur_3d.shape[0]) - out = np.zeros((num, num, interval.shape[0] - 1), dtype=fp) - - interval_seq = np.arange(interval.shape[0] - 2, -1, -1) - interval_seq.shape[0] - - - for i_interval in interval_seq: - co_occur = co_occur_3d[:, :, i_interval] - - probs_matrix = co_occur / np.sum(co_occur) if np.sum(co_occur) != 0 else np.zeros((num, num), dtype=fp) - probs = np.sum(probs_matrix, axis=0) - probs_con = np.zeros((num, num), dtype=fp) - - for c in labs_unique: - probs_conditional = ( - co_occur[c] / np.sum(co_occur[c]) if np.sum(co_occur[c]) != 0 else np.zeros(num, dtype=fp) - ) - probs_con[c, :] = np.zeros(num, dtype=fp) - for i in range(num): - if probs[i] == 0: - probs_con[c, i] = 0 - else: - probs_con[c, i] = probs_conditional[i] / probs[i] - - # print(interval_seq.shape[0] - 1 - i_interval) - out[:, :, interval_seq.shape[0] - 1 - i_interval] = probs_con - - return out, labs_unique + return occ_prob @d.dedent def co_occurrence( @@ -475,8 +424,7 @@ def co_occurrence( n_splits: int | None = None, n_jobs: int | None = None, backend: str = "loky", - show_progress_bar: bool = True, - use_rust: bool = False + show_progress_bar: bool = True ) -> tuple[NDArrayA, NDArrayA] | None: """ Compute co-occurrence probability of clusters. @@ -494,12 +442,6 @@ def co_occurrence( Number of splits in which to divide the spatial coordinates in :attr:`anndata.AnnData.obsm` ``['{spatial_key}']``. %(parallelize)s - use_rust - Whether to use the rust implementation of the co-occurrence probability. - If ``True``, the rust implementation will be used. - If ``False``, the python implementation will be used. - For rust implementation, the `scstat-rs` package is required. - You can install it via `pip install scstat-rs`. Returns ------- @@ -514,10 +456,6 @@ def co_occurrence( """ - if use_rust and not find_spec("scstat_rs"): - ## issue an error and ask user to install scstat_rs - raise ImportError("scstat_rs not found. Please install scstat_rs to use the rust version of co-occurrence function: `pip install scstat-rs`") - if isinstance(adata, SpatialData): adata = adata.table _assert_categorical_obs(adata, key=cluster_key) @@ -527,7 +465,7 @@ def co_occurrence( original_clust = adata.obs[cluster_key] clust_map = {v: i for i, v in enumerate(original_clust.cat.categories.values)} labs = np.array([clust_map[c] for c in original_clust], dtype=ip) - + # create intervals thresholds if isinstance(interval, int): thresh_min, thresh_max = _find_min_max(spatial) @@ -537,26 +475,14 @@ def co_occurrence( if len(interval) <= 1: raise ValueError(f"Expected interval to be of length `>= 2`, found `{len(interval)}`.") - if use_rust: - out = _co_occurrence_rs( - spatial, - original_clust, - interval - ) - start = logg.info( - f"Calculating co-occurrence probabilities for `{len(interval)}` intervals" - ) - else: - # logg.info("Using python implementation of co-occurrence function. 
For faster computation, consider using the rust implementation by setting `use_rust=True`") - print("Using python implementation of co-occurrence function. For faster computation, consider using the rust implementation by setting `use_rust=True`") - spatial_x = spatial[:, 0] - spatial_y = spatial[:, 1] - - # Compute co-occurrence probabilities using the fast numba routine. - out = _co_occurrence_helper(spatial_x, spatial_y, interval, labs) - start = logg.info( - f"Calculating co-occurrence probabilities for `{len(interval)}` intervals using `{n_jobs}` core(s) and `{n_splits}` splits" - ) + spatial_x = spatial[:, 0] + spatial_y = spatial[:, 1] + + # Compute co-occurrence probabilities using the fast numba routine. + out = _co_occurrence_helper(spatial_x, spatial_y, interval, labs) + start = logg.info( + f"Calculating co-occurrence probabilities for `{len(interval)}` intervals using `{n_jobs}` core(s) and `{n_splits}` splits" + ) if copy: logg.info("Finish", time=start) diff --git a/tests/graph/test_ppatterns.py b/tests/graph/test_ppatterns.py index 91502ba2f..6752b1548 100644 --- a/tests/graph/test_ppatterns.py +++ b/tests/graph/test_ppatterns.py @@ -130,24 +130,6 @@ def test_co_occurrence(adata: AnnData): assert arr.shape[2] == 49 assert arr.shape[1] == arr.shape[0] == adata.obs["leiden"].unique().shape[0] -def test_co_occurrence_rs(adata: AnnData): - """ - check co_occurrence score and shape (rust implementation) - """ - co_occurrence(adata, cluster_key="leiden", use_rust=True) - - # assert occurrence in adata.uns - assert "leiden_co_occurrence" in adata.uns.keys() - assert "occ" in adata.uns["leiden_co_occurrence"].keys() - assert "interval" in adata.uns["leiden_co_occurrence"].keys() - - # assert shapes - arr = adata.uns["leiden_co_occurrence"]["occ"] - assert arr.ndim == 3 - assert arr.shape[2] == 49 - assert arr.shape[1] == arr.shape[0] == adata.obs["leiden"].unique().shape[0] - - # @pytest.mark.parametrize(("ys", "xs"), [(10, 10), (None, None), (10, 20)]) @pytest.mark.parametrize(("n_jobs", "n_splits"), [(1, 2), (2, 2)]) @@ -173,19 +155,6 @@ def test_co_occurrence_explicit_interval(adata: AnnData, size: int): assert interval is not interval_1 np.testing.assert_allclose(interval, interval_1) # allclose because in the func, we use f32 -@pytest.mark.parametrize("size", [1, 3]) -def test_co_occurrence_rs_explicit_interval(adata: AnnData, size: int): - minn, maxx = _find_min_max(adata.obsm[Key.obsm.spatial]) - interval = np.linspace(minn, maxx, size) - if size == 1: - with pytest.raises(ValueError, match=r"Expected interval to be of length"): - _ = co_occurrence(adata, cluster_key="leiden", copy=True, interval=interval, use_rust=True) - else: - _, interval_1 = co_occurrence_rs(adata, cluster_key="leiden", copy=True, interval=interval, use_rust=True) - - assert interval is not interval_1 - np.testing.assert_allclose(interval, interval_1) # allclose because in the func, we use f32 - def test_use_raw(dummy_adata: AnnData): var_names = [str(i) for i in range(10)] From 92f3da5a633020dbb81218452530590a96d383c1 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 19 Mar 2025 15:03:00 +0000 Subject: [PATCH 07/18] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/squidpy/gr/_ppatterns.py | 20 +++++--------------- tests/graph/test_ppatterns.py | 1 + 2 files changed, 6 insertions(+), 15 deletions(-) diff --git a/src/squidpy/gr/_ppatterns.py b/src/squidpy/gr/_ppatterns.py index 
36cb4e4ec..e6c992d58 100644 --- a/src/squidpy/gr/_ppatterns.py +++ b/src/squidpy/gr/_ppatterns.py @@ -275,14 +275,8 @@ def _score_helper( @njit(parallel=True, fastmath=True) def _occur_count( - spatial_x: NDArrayA, - spatial_y:NDArrayA, - thresholds: NDArrayA, - label_idx: NDArrayA, - n: int, - k: int, - l_val: int -)-> NDArrayA: + spatial_x: NDArrayA, spatial_y: NDArrayA, thresholds: NDArrayA, label_idx: NDArrayA, n: int, k: int, l_val: int +) -> NDArrayA: # Allocate a 2D array to store a flat local result per point. local_results = np.zeros((n, l_val * k * k), dtype=np.int32) for i in prange(n): @@ -327,12 +321,7 @@ def _occur_count( return result -def _co_occurrence_helper( - v_x: NDArrayA, - v_y: NDArrayA, - v_radium: NDArrayA, - labs: NDArrayA -)-> NDArrayA: +def _co_occurrence_helper(v_x: NDArrayA, v_y: NDArrayA, v_radium: NDArrayA, labs: NDArrayA) -> NDArrayA: """ Fast co-occurrence probability computation using the new numba-accelerated counting. @@ -412,6 +401,7 @@ def _co_occurrence_helper( return occ_prob + @d.dedent def co_occurrence( adata: AnnData | SpatialData, @@ -422,7 +412,7 @@ def co_occurrence( n_splits: int | None = None, n_jobs: int | None = None, backend: str = "loky", - show_progress_bar: bool = True + show_progress_bar: bool = True, ) -> tuple[NDArrayA, NDArrayA] | None: """ Compute co-occurrence probability of clusters. diff --git a/tests/graph/test_ppatterns.py b/tests/graph/test_ppatterns.py index c0b6b45fe..6752b1548 100644 --- a/tests/graph/test_ppatterns.py +++ b/tests/graph/test_ppatterns.py @@ -155,6 +155,7 @@ def test_co_occurrence_explicit_interval(adata: AnnData, size: int): assert interval is not interval_1 np.testing.assert_allclose(interval, interval_1) # allclose because in the func, we use f32 + def test_use_raw(dummy_adata: AnnData): var_names = [str(i) for i in range(10)] raw = dummy_adata[:, dummy_adata.var_names[: len(var_names)]].copy() From 52a5fae7e8ae040b1828a2733e25f26858172eea Mon Sep 17 00:00:00 2001 From: Wenjie SUN Date: Thu, 12 Jun 2025 14:44:51 +0200 Subject: [PATCH 08/18] Optimize memory access pattern & cache kernel --- src/squidpy/gr/_ppatterns.py | 62 ++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 35 deletions(-) diff --git a/src/squidpy/gr/_ppatterns.py b/src/squidpy/gr/_ppatterns.py index 5ada2a694..9a18bbaf5 100644 --- a/src/squidpy/gr/_ppatterns.py +++ b/src/squidpy/gr/_ppatterns.py @@ -266,54 +266,46 @@ def _score_helper( return score_perms - @njit(parallel=True, fastmath=True) def _occur_count( - spatial_x: NDArrayA, spatial_y: NDArrayA, thresholds: NDArrayA, label_idx: NDArrayA, n: int, k: int, l_val: int + spatial_x: NDArrayA, + spatial_y: NDArrayA, + thresholds: NDArrayA, + label_idx: NDArrayA, + n: int, + k: int, + l_val: int ) -> NDArrayA: # Allocate a 2D array to store a flat local result per point. - local_results = np.zeros((n, l_val * k * k), dtype=np.int32) + k2 = k * k + local_results = np.zeros((n, l_val * k2), dtype=np.int32) + for i in prange(n): - local_counts = np.zeros(l_val * k * k, dtype=np.int32) + local_counts = np.zeros(l_val * k2, dtype=np.int32) + for j in range(n): if i == j: continue dx = spatial_x[i] - spatial_x[j] dy = spatial_y[i] - spatial_y[j] - dist_sq = dx * dx + dy * dy - for r in range(l_val): - thresh = thresholds[r] - if dist_sq <= thresh: - index = r * (k * k) + label_idx[i] * k + label_idx[j] - local_counts[index] += 1 - # else: - # break # If this threshold fails, smaller ones will also fail. 
- for m in range(l_val * k * k): - local_results[i, m] = local_counts[m] - - # Reduce over all points: sum the local counts. - result_flat = np.zeros(l_val * k * k, dtype=np.int32) - for m in range(l_val * k * k): - s = 0 - for i in range(n): - s += local_results[i, m] - result_flat[m] = s - - # Reshape to a 3D array with shape (interval, k, k) - counts = np.empty((l_val, k, k), dtype=np.int32) - for r in range(l_val): - for i in range(k): - for j in range(k): - counts[r, i, j] = result_flat[r * (k * k) + i * k + j] + d2 = dx * dx + dy * dy + + pair = label_idx[i] * k + label_idx[j] # fixed in r–loop + base = pair * l_val # first cell for that pair - # Rearrange axes to match the original convention: (k, k, interval) - result = np.empty((k, k, l_val), dtype=np.int32) - for i in range(k): - for j in range(k): for r in range(l_val): - result[i, j, r] = counts[r, i, j] - return result + if d2 <= thresholds[r]: + local_counts[base + r] += 1 + local_results[i, :] = local_counts + # for m in range(l_val * k * k): + # local_results[i, m] = local_counts[m] + + # reduction and reshape stay the same + result_flat = local_results.sum(axis=0) + result = result_flat.reshape(k, k, l_val) + + return result def _co_occurrence_helper(v_x: NDArrayA, v_y: NDArrayA, v_radium: NDArrayA, labs: NDArrayA) -> NDArrayA: """ From cee646ae2d72f8d7f685058c836f7afded835789 Mon Sep 17 00:00:00 2001 From: Wenjie SUN Date: Thu, 12 Jun 2025 15:06:41 +0200 Subject: [PATCH 09/18] jit the outer function and parallelize --- src/squidpy/gr/_ppatterns.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/squidpy/gr/_ppatterns.py b/src/squidpy/gr/_ppatterns.py index 9a18bbaf5..cdc882bd8 100644 --- a/src/squidpy/gr/_ppatterns.py +++ b/src/squidpy/gr/_ppatterns.py @@ -266,7 +266,7 @@ def _score_helper( return score_perms -@njit(parallel=True, fastmath=True) +@njit(parallel=True, fastmath=True, cache=True) def _occur_count( spatial_x: NDArrayA, spatial_y: NDArrayA, @@ -296,10 +296,8 @@ def _occur_count( for r in range(l_val): if d2 <= thresholds[r]: local_counts[base + r] += 1 - local_results[i, :] = local_counts - # for m in range(l_val * k * k): - # local_results[i, m] = local_counts[m] + local_results[i, :] = local_counts # reduction and reshape stay the same result_flat = local_results.sum(axis=0) @@ -307,7 +305,12 @@ def _occur_count( return result -def _co_occurrence_helper(v_x: NDArrayA, v_y: NDArrayA, v_radium: NDArrayA, labs: NDArrayA) -> NDArrayA: +@njit(parallel=True, fastmath=True, cache=True) +def _co_occurrence_helper( + v_x: NDArrayA, + v_y: NDArrayA, + v_radium: NDArrayA, + labs: NDArrayA) -> NDArrayA: """ Fast co-occurrence probability computation using the new numba-accelerated counting. @@ -342,7 +345,7 @@ def _co_occurrence_helper(v_x: NDArrayA, v_y: NDArrayA, v_radium: NDArrayA, labs # Compute co-occurrence probabilities for each threshold bin. occ_prob = np.empty((k, k, l_val), dtype=np.float32) - for r in range(l_val): + for r in prange(l_val): co_occur = counts[:, :, r].astype(np.float32) # Compute the total count for this threshold. 
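For reference (illustrative only, not part of any commit in this series): the flat-index layout used by the optimized `_occur_count` kernel above — `base = (label_idx[i] * k + label_idx[j]) * l_val`, offset by the threshold index `r` — amounts to counting, for every ordered label pair and every squared threshold, the pairs `i != j` within that distance. A dense NumPy sketch of the same quantity, useful only as a cross-check on small inputs (the function name and arguments are assumed, not patch code):

```python
import numpy as np


def occur_count_reference(x, y, thresholds_sq, labels, k):
    """O(n^2) reference: counts[a, b, r] = number of ordered pairs (i, j), i != j,
    whose labels are (a, b) and whose squared distance is <= thresholds_sq[r]."""
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    labels = np.asarray(labels)
    d2 = (x[:, None] - x[None, :]) ** 2 + (y[:, None] - y[None, :]) ** 2
    np.fill_diagonal(d2, np.inf)  # the kernel skips i == j
    counts = np.zeros((k, k, len(thresholds_sq)), dtype=np.int64)
    for r, thr in enumerate(thresholds_sq):
        within = d2 <= thr
        for a in range(k):
            for b in range(k):
                counts[a, b, r] = np.count_nonzero(within[np.ix_(labels == a, labels == b)])
    return counts
```

Because pairs are ordered, each unordered cross-label pair contributes once to `[a, b]` and once to `[b, a]`, matching the kernel's convention.
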
From 05dd724cb166137e525db3fa5a26ae7d4463a4be Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 12 Jun 2025 13:08:08 +0000 Subject: [PATCH 10/18] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/squidpy/gr/_ppatterns.py | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/src/squidpy/gr/_ppatterns.py b/src/squidpy/gr/_ppatterns.py index cdc882bd8..873b757b6 100644 --- a/src/squidpy/gr/_ppatterns.py +++ b/src/squidpy/gr/_ppatterns.py @@ -266,15 +266,10 @@ def _score_helper( return score_perms + @njit(parallel=True, fastmath=True, cache=True) def _occur_count( - spatial_x: NDArrayA, - spatial_y: NDArrayA, - thresholds: NDArrayA, - label_idx: NDArrayA, - n: int, - k: int, - l_val: int + spatial_x: NDArrayA, spatial_y: NDArrayA, thresholds: NDArrayA, label_idx: NDArrayA, n: int, k: int, l_val: int ) -> NDArrayA: # Allocate a 2D array to store a flat local result per point. k2 = k * k @@ -290,8 +285,8 @@ def _occur_count( dy = spatial_y[i] - spatial_y[j] d2 = dx * dx + dy * dy - pair = label_idx[i] * k + label_idx[j] # fixed in r–loop - base = pair * l_val # first cell for that pair + pair = label_idx[i] * k + label_idx[j] # fixed in r–loop + base = pair * l_val # first cell for that pair for r in range(l_val): if d2 <= thresholds[r]: @@ -305,12 +300,9 @@ def _occur_count( return result + @njit(parallel=True, fastmath=True, cache=True) -def _co_occurrence_helper( - v_x: NDArrayA, - v_y: NDArrayA, - v_radium: NDArrayA, - labs: NDArrayA) -> NDArrayA: +def _co_occurrence_helper(v_x: NDArrayA, v_y: NDArrayA, v_radium: NDArrayA, labs: NDArrayA) -> NDArrayA: """ Fast co-occurrence probability computation using the new numba-accelerated counting. 
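For reference (illustrative only, not part of the series): for a single threshold bin, the nested loops in `_co_occurrence_helper` compute the conditional ratio `P(label i | neighbour of label c) / P(label i)` from the raw counts. A vectorized sketch of that normalization (the function name is assumed), which can serve as a sanity check against one bin of the kernel output:

```python
import numpy as np


def cond_prob_reference(co_occur):
    """Single-bin equivalent of the probs_con loops: out[i, c] = P(i | c) / P(i)."""
    co_occur = co_occur.astype(np.float64)
    total = co_occur.sum()
    probs = co_occur.sum(axis=0) / total if total else np.zeros(co_occur.shape[0])
    row_sums = co_occur.sum(axis=1, keepdims=True)
    cond = np.divide(co_occur, row_sums, out=np.zeros_like(co_occur), where=row_sums != 0)
    out = np.divide(cond, probs, out=np.zeros_like(cond), where=probs != 0)
    return out.T  # transposed to the (i, c) layout written into occ_prob
```
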
From c490d45d57ccea292c7b5848f3958a9994f4cdc7 Mon Sep 17 00:00:00 2001 From: Wenjie SUN Date: Thu, 12 Jun 2025 15:16:56 +0200 Subject: [PATCH 11/18] Fix Mypy checking Typing error --- src/squidpy/gr/_ppatterns.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/squidpy/gr/_ppatterns.py b/src/squidpy/gr/_ppatterns.py index cdc882bd8..cbcf700a3 100644 --- a/src/squidpy/gr/_ppatterns.py +++ b/src/squidpy/gr/_ppatterns.py @@ -5,7 +5,8 @@ from collections.abc import Iterable, Sequence from importlib.util import find_spec from itertools import chain -from typing import TYPE_CHECKING, Any, Literal +from typing import TYPE_CHECKING, Any, Literal, cast +from numpy.typing import NDArray import numba.types as nt import numpy as np @@ -303,7 +304,7 @@ def _occur_count( result_flat = local_results.sum(axis=0) result = result_flat.reshape(k, k, l_val) - return result + return cast(NDArray[np.int32], result) @njit(parallel=True, fastmath=True, cache=True) def _co_occurrence_helper( From d0884ca02df45b1df92db4357b21c827097119bb Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 12 Jun 2025 13:19:46 +0000 Subject: [PATCH 12/18] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/squidpy/gr/_ppatterns.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/squidpy/gr/_ppatterns.py b/src/squidpy/gr/_ppatterns.py index 5d2993e54..244471352 100644 --- a/src/squidpy/gr/_ppatterns.py +++ b/src/squidpy/gr/_ppatterns.py @@ -6,7 +6,6 @@ from importlib.util import find_spec from itertools import chain from typing import TYPE_CHECKING, Any, Literal, cast -from numpy.typing import NDArray import numba.types as nt import numpy as np @@ -14,6 +13,7 @@ from anndata import AnnData from numba import njit, prange from numpy.random import default_rng +from numpy.typing import NDArray from scanpy import logging as logg from scanpy.get import _get_obs_rep from scanpy.metrics._gearys_c import _gearys_c From 6e5d1df6e584328f05c6db8cf25ceea6e1b01397 Mon Sep 17 00:00:00 2001 From: Wenjie SUN Date: Thu, 12 Jun 2025 15:47:53 +0200 Subject: [PATCH 13/18] Disable the cast typing in jit --- src/squidpy/gr/_ppatterns.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/squidpy/gr/_ppatterns.py b/src/squidpy/gr/_ppatterns.py index 5d2993e54..3db050c50 100644 --- a/src/squidpy/gr/_ppatterns.py +++ b/src/squidpy/gr/_ppatterns.py @@ -5,8 +5,7 @@ from collections.abc import Iterable, Sequence from importlib.util import find_spec from itertools import chain -from typing import TYPE_CHECKING, Any, Literal, cast -from numpy.typing import NDArray +from typing import TYPE_CHECKING, Any, Literal import numba.types as nt import numpy as np @@ -297,9 +296,7 @@ def _occur_count( # reduction and reshape stay the same result_flat = local_results.sum(axis=0) - result = result_flat.reshape(k, k, l_val) - - return cast(NDArray[np.int32], result) + return result_flat.reshape(k, k, l_val).copy() @njit(parallel=True, fastmath=True, cache=True) @@ -337,9 +334,9 @@ def _co_occurrence_helper(v_x: NDArrayA, v_y: NDArrayA, v_radium: NDArrayA, labs counts = _occur_count(v_x, v_y, thresholds, labs, n, k, l_val) # Compute co-occurrence probabilities for each threshold bin. 
- occ_prob = np.empty((k, k, l_val), dtype=np.float32) + occ_prob = np.empty((k, k, l_val), dtype=np.float64) for r in prange(l_val): - co_occur = counts[:, :, r].astype(np.float32) + co_occur = counts[:, :, r].astype(np.float64) # Compute the total count for this threshold. total = 0.0 @@ -348,7 +345,7 @@ def _co_occurrence_helper(v_x: NDArrayA, v_y: NDArrayA, v_radium: NDArrayA, labs total += co_occur[i, j] # Compute the normalized probability matrix. - probs_matrix = np.zeros((k, k), dtype=np.float32) + probs_matrix = np.zeros((k, k), dtype=np.float64) if total != 0.0: for i in range(k): for j in range(k): From a62128b3d2d84731ae07ddf76eb401a2d42bf4a5 Mon Sep 17 00:00:00 2001 From: Wenjie SUN Date: Thu, 12 Jun 2025 16:20:02 +0200 Subject: [PATCH 14/18] Try fix typing check error by mypy --- src/squidpy/gr/_ppatterns.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/squidpy/gr/_ppatterns.py b/src/squidpy/gr/_ppatterns.py index 3db050c50..19a3e78fb 100644 --- a/src/squidpy/gr/_ppatterns.py +++ b/src/squidpy/gr/_ppatterns.py @@ -296,7 +296,9 @@ def _occur_count( # reduction and reshape stay the same result_flat = local_results.sum(axis=0) - return result_flat.reshape(k, k, l_val).copy() + result = result_flat.reshape(k, k, l_val).astype(np.int32) + + return result @njit(parallel=True, fastmath=True, cache=True) From 0aab85671732ce0f1ddb89b11942122bafb8037a Mon Sep 17 00:00:00 2001 From: Wenjie SUN Date: Sat, 14 Jun 2025 20:58:20 +0200 Subject: [PATCH 15/18] Try: fix typing check error by mypy --- src/squidpy/gr/_ppatterns.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/squidpy/gr/_ppatterns.py b/src/squidpy/gr/_ppatterns.py index 19a3e78fb..17b1b5578 100644 --- a/src/squidpy/gr/_ppatterns.py +++ b/src/squidpy/gr/_ppatterns.py @@ -276,7 +276,7 @@ def _occur_count( local_results = np.zeros((n, l_val * k2), dtype=np.int32) for i in prange(n): - local_counts = np.zeros(l_val * k2, dtype=np.int32) + local_counts: NDArrayA = np.zeros(l_val * k2, dtype=np.int32) for j in range(n): if i == j: @@ -292,11 +292,11 @@ def _occur_count( if d2 <= thresholds[r]: local_counts[base + r] += 1 - local_results[i, :] = local_counts + local_results[i] = local_counts # reduction and reshape stay the same result_flat = local_results.sum(axis=0) - result = result_flat.reshape(k, k, l_val).astype(np.int32) + result = result_flat.reshape(k, k, l_val).copy() return result From 9ac269392846ecc043c81085dda9160ba16d162d Mon Sep 17 00:00:00 2001 From: Wenjie SUN Date: Sat, 14 Jun 2025 21:02:28 +0200 Subject: [PATCH 16/18] Try: fix typing check error by mypy --- src/squidpy/gr/_ppatterns.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/squidpy/gr/_ppatterns.py b/src/squidpy/gr/_ppatterns.py index 17b1b5578..6e3230be0 100644 --- a/src/squidpy/gr/_ppatterns.py +++ b/src/squidpy/gr/_ppatterns.py @@ -296,7 +296,7 @@ def _occur_count( # reduction and reshape stay the same result_flat = local_results.sum(axis=0) - result = result_flat.reshape(k, k, l_val).copy() + result: NDArrayA = result_flat.reshape(k, k, l_val).copy() return result From 482d0d8a67f0609a50bcacc838ce34990984a57b Mon Sep 17 00:00:00 2001 From: selmanozleyen Date: Thu, 10 Jul 2025 15:07:10 +0200 Subject: [PATCH 17/18] init for benchmark --- .github/workflows/benchmark.yml | 62 +++++++ benchmarks/.asv/results/benchmarks.json | 27 +++ benchmarks/.asv/results/mac/machine.json | 9 + benchmarks/README.md | 21 +++ benchmarks/asv.conf.json | 171 ++++++++++++++++++ 
benchmarks/benchmarks/__init__.py | 1 + benchmarks/benchmarks/_utils.py | 216 +++++++++++++++++++++++ benchmarks/benchmarks/co_occurence.py | 34 ++++ 8 files changed, 541 insertions(+) create mode 100644 .github/workflows/benchmark.yml create mode 100644 benchmarks/.asv/results/benchmarks.json create mode 100644 benchmarks/.asv/results/mac/machine.json create mode 100644 benchmarks/README.md create mode 100644 benchmarks/asv.conf.json create mode 100644 benchmarks/benchmarks/__init__.py create mode 100644 benchmarks/benchmarks/_utils.py create mode 100644 benchmarks/benchmarks/co_occurence.py diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml new file mode 100644 index 000000000..1e952440b --- /dev/null +++ b/.github/workflows/benchmark.yml @@ -0,0 +1,62 @@ +name: Benchmark + +on: + push: + branches: [main] + pull_request: + branches: [main] + +env: + FORCE_COLOR: "1" + +defaults: + run: + shell: bash -e {0} # -e to fail on error + +jobs: + benchmark: + runs-on: ${{ matrix.os }} + + strategy: + fail-fast: false + matrix: + python: ["3.13"] + os: [ubuntu-latest] + + env: + OS: ${{ matrix.os }} + PYTHON: ${{ matrix.python }} + ASV_DIR: "./benchmarks" + + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Fetch main branch for `asv run`’s hash + run: git fetch origin main:main + if: ${{ github.ref_name != 'main' }} + + - name: Set up Python ${{ matrix.python }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python }} + cache: 'pip' + + - name: Cache datasets + uses: actions/cache@v4 + with: + path: | + ~/.cache + key: benchmark-state-${{ hashFiles('benchmarks/**') }} + + - name: Install dependencies + run: pip install 'asv>=0.6.4' + + - name: Configure ASV + working-directory: ${{ env.ASV_DIR }} + run: asv machine --yes + + - name: Quick benchmark run + working-directory: ${{ env.ASV_DIR }} + run: asv run --dry-run --quick --show-stderr --verbose HEAD^! 
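For contributors, the quick smoke test performed by this workflow can be reproduced locally (assuming `asv>=0.6.4` is installed) with roughly the following commands from the repository root:

    cd benchmarks
    asv machine --yes
    asv run --dry-run --quick --show-stderr --verbose HEAD^!

The `--quick` flag runs each benchmark function only once and `--dry-run` skips saving results, so this only verifies that the suite imports and executes; it does not produce publishable timings.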
diff --git a/benchmarks/.asv/results/benchmarks.json b/benchmarks/.asv/results/benchmarks.json new file mode 100644 index 000000000..708d1c7d3 --- /dev/null +++ b/benchmarks/.asv/results/benchmarks.json @@ -0,0 +1,27 @@ +{ + "co_occurence.peakmem_co_occurrence": { + "code": "def peakmem_co_occurrence():\n sq.gr.co_occurrence(adata, cluster_key=\"cell type\")\n\ndef setup():\n global adata # noqa: PLW0603\n adata = sq.datasets.imc()", + "name": "co_occurence.peakmem_co_occurrence", + "param_names": [], + "params": [], + "type": "peakmemory", + "unit": "bytes", + "version": "8e588751fbcc15d56cfe73dbb3844752d6cac545c868bca1ce1045f0a54e301d" + }, + "co_occurence.time_co_occurrence": { + "code": "def time_co_occurrence():\n sq.gr.co_occurrence(adata, cluster_key=\"cell type\")\n\ndef setup():\n global adata # noqa: PLW0603\n adata = sq.datasets.imc()", + "min_run_count": 2, + "name": "co_occurence.time_co_occurrence", + "number": 0, + "param_names": [], + "params": [], + "repeat": 0, + "rounds": 2, + "sample_time": 0.01, + "type": "time", + "unit": "seconds", + "version": "e764c1b999e315fffac8cac6708ede350ff254bd0b05a9be580a43ea3689d614", + "warmup_time": -1 + }, + "version": 2 +} \ No newline at end of file diff --git a/benchmarks/.asv/results/mac/machine.json b/benchmarks/.asv/results/mac/machine.json new file mode 100644 index 000000000..bcb50937c --- /dev/null +++ b/benchmarks/.asv/results/mac/machine.json @@ -0,0 +1,9 @@ +{ + "arch": "arm64", + "cpu": "Apple M1", + "machine": "mac", + "num_cpu": "8", + "os": "Darwin 23.1.0", + "ram": "17179869184", + "version": 1 +} \ No newline at end of file diff --git a/benchmarks/README.md b/benchmarks/README.md new file mode 100644 index 000000000..3546e79ea --- /dev/null +++ b/benchmarks/README.md @@ -0,0 +1,21 @@ +# Scanpy Benchmarks + +This directory contains code for benchmarking Scanpy using [asv][]. + +The functionality is checked using the [`benchmark.yml`][] workflow. +Benchmarks are run using the [benchmark bot][]. + +[asv]: https://asv.readthedocs.io/ +[`benchmark.yml`]: ../.github/workflows/benchmark.yml +[benchmark bot]: https://github.com/apps/scverse-benchmark + +## Data processing in benchmarks + +Each dataset is processed so it has + +- `.layers['counts']` (containing data in C/row-major format) and `.layers['counts-off-axis']` (containing data in FORTRAN/column-major format) +- `.X` and `.layers['off-axis']` with log-transformed data (formats like above) +- a `.var['mt']` boolean column indicating mitochondrial genes + +The benchmarks are set up so the `layer` parameter indicates the layer that will be moved into `.X` before the benchmark. +That way, we don’t need to add `layer=layer` everywhere. diff --git a/benchmarks/asv.conf.json b/benchmarks/asv.conf.json new file mode 100644 index 000000000..907a6292e --- /dev/null +++ b/benchmarks/asv.conf.json @@ -0,0 +1,171 @@ +{ + // The version of the config file format. Do not change, unless + // you know what you are doing. + "version": 1, + + // The name of the project being benchmarked + "project": "squidpy", + + // The project's homepage + "project_url": "https://squidpy.readthedocs.io/", + + // The URL or local path of the source code repository for the + // project being benchmarked + "repo": "..", + + // The Python project's subdirectory in your repo. If missing or + // the empty string, the project is assumed to be located at the root + // of the repository. + // "repo_subdir": "", + + // Customizable commands for building, installing, and + // uninstalling the project. 
See asv.conf.json documentation. + // + // "install_command": ["python -mpip install {wheel_file}"], + // "uninstall_command": ["return-code=any python -mpip uninstall -y {project}"], + "build_command": [ + "python -m pip install build", + "python -m build --wheel -o {build_cache_dir} {build_dir}", + ], + + // List of branches to benchmark. If not provided, defaults to "master" + // (for git) or "default" (for mercurial). + "branches": ["main"], // for git + + // The DVCS being used. If not set, it will be automatically + // determined from "repo" by looking at the protocol in the URL + // (if remote), or by looking for special directories, such as + // ".git" (if local). + "dvcs": "git", + + // The tool to use to create environments. May be "conda", + // "virtualenv" or other value depending on the plugins in use. + // If missing or the empty string, the tool will be automatically + // determined by looking for tools on the PATH environment + // variable. + "environment_type": "conda", + + // timeout in seconds for installing any dependencies in environment + // defaults to 10 min + //"install_timeout": 600, + + // the base URL to show a commit for the project. + "show_commit_url": "https://github.com/scverse/squidpy/commit/", + + // The Pythons you'd like to test against. If not provided, defaults + // to the current version of Python used to run `asv`. + // "pythons": ["3.11", "3.13"], + + // The list of conda channel names to be searched for benchmark + // dependency packages in the specified order + "conda_channels": ["conda-forge", "defaults"], + + // The matrix of dependencies to test. Each key is the name of a + // package (in PyPI) and the values are version numbers. An empty + // list or empty string indicates to just test against the default + // (latest) version. null indicates that the package is to not be + // installed. If the package to be tested is only available from + // PyPi, and the 'environment_type' is conda, then you can preface + // the package name by 'pip+', and the package will be installed via + // pip (with all the conda available packages installed first, + // followed by the pip installed packages). + // + "matrix": { + "numpy": [""], + // "scipy": ["1.2", ""], + "scipy": [""], + "h5py": [""], + "natsort": [""], + "pandas": [""], + "memory_profiler": [""], + "zarr": ["2.18.4"], + "pytest": [""], + "scanpy": [""], + "python-igraph": [""], + // "psutil": [""] + "pooch": [""], + "scikit-image": [""], + // "scikit-misc": [""], + }, + + // Combinations of libraries/python versions can be excluded/included + // from the set to test. Each entry is a dictionary containing additional + // key-value pairs to include/exclude. + // + // An exclude entry excludes entries where all values match. The + // values are regexps that should match the whole string. + // + // An include entry adds an environment. Only the packages listed + // are installed. The 'python' key is required. The exclude rules + // do not apply to includes. + // + // In addition to package names, the following keys are available: + // + // - python + // Python version, as in the *pythons* variable above. + // - environment_type + // Environment type, as above. + // - sys_platform + // Platform, as in sys.platform. Possible values for the common + // cases: 'linux2', 'win32', 'cygwin', 'darwin'. 
+ // + // "exclude": [ + // {"python": "3.2", "sys_platform": "win32"}, // skip py3.2 on windows + // {"environment_type": "conda", "six": null}, // don't run without six on conda + // ], + // + // "include": [ + // // additional env for python2.7 + // {"python": "2.7", "numpy": "1.8"}, + // // additional env if run on windows+conda + // {"platform": "win32", "environment_type": "conda", "python": "2.7", "libpython": ""}, + // ], + + // The directory (relative to the current directory) that benchmarks are + // stored in. If not provided, defaults to "benchmarks" + // "benchmark_dir": "benchmarks", + + // The directory (relative to the current directory) to cache the Python + // environments in. If not provided, defaults to "env" + "env_dir": ".asv/env", + + // The directory (relative to the current directory) that raw benchmark + // results are stored in. If not provided, defaults to "results". + "results_dir": ".asv/results", + + // The directory (relative to the current directory) that the html tree + // should be written to. If not provided, defaults to "html". + "html_dir": ".asv/html", + + // The number of characters to retain in the commit hashes. + // "hash_length": 8, + + // `asv` will cache results of the recent builds in each + // environment, making them faster to install next time. This is + // the number of builds to keep, per environment. + // "build_cache_size": 2, + + // The commits after which the regression search in `asv publish` + // should start looking for regressions. Dictionary whose keys are + // regexps matching to benchmark names, and values corresponding to + // the commit (exclusive) after which to start looking for + // regressions. The default is to start from the first commit + // with results. If the commit is `null`, regression detection is + // skipped for the matching benchmark. + // + // "regressions_first_commits": { + // "some_benchmark": "352cdf", // Consider regressions only after this commit + // "another_benchmark": null, // Skip regression detection altogether + // }, + + // The thresholds for relative change in results, after which `asv + // publish` starts reporting regressions. Dictionary of the same + // form as in ``regressions_first_commits``, with values + // indicating the thresholds. If multiple entries match, the + // maximum is taken. If no entry matches, the default is 5%. 
+ // + // "regressions_thresholds": { + // "some_benchmark": 0.01, // Threshold of 1% + // "another_benchmark": 0.5, // Threshold of 50% + // }, +} diff --git a/benchmarks/benchmarks/__init__.py b/benchmarks/benchmarks/__init__.py new file mode 100644 index 000000000..255d98fe4 --- /dev/null +++ b/benchmarks/benchmarks/__init__.py @@ -0,0 +1 @@ +"""ASV benchmark suite for sqidpy.""" diff --git a/benchmarks/benchmarks/_utils.py b/benchmarks/benchmarks/_utils.py new file mode 100644 index 000000000..e27e7e70d --- /dev/null +++ b/benchmarks/benchmarks/_utils.py @@ -0,0 +1,216 @@ +from __future__ import annotations + +import itertools +import warnings +from functools import cache +from typing import TYPE_CHECKING + +import numpy as np +import pooch +from anndata import concat +from asv_runner.benchmarks.mark import skip_for_params +import squidpy as sq + +import scanpy as sc +from scanpy._compat import CSRBase + +if TYPE_CHECKING: + from collections.abc import Callable, Sequence + from collections.abc import Set as AbstractSet + from typing import Literal, Protocol, TypeVar + + from anndata import AnnData + + from scanpy._compat import CSCBase + + C = TypeVar("C", bound=Callable) + + class ParamSkipper(Protocol): + def __call__(self, **skipped: AbstractSet) -> Callable[[C], C]: ... + + Dataset = Literal["pbmc68k_reduced", "pbmc3k", "bmmc", "lung93k"] + KeyX = Literal[None, "off-axis"] + KeyCount = Literal["counts", "counts-off-axis"] + + + + +@cache +def _pbmc68k_reduced() -> AnnData: + """A small datasets with a dense `.X`.""" # noqa: D401 + adata = sc.datasets.pbmc68k_reduced() + assert isinstance(adata.X, np.ndarray) + assert not np.isfortran(adata.X) + + # raw has the same number of genes, so we can use it for counts + # it doesn’t actually contain counts for some reason, but close enough + assert isinstance(adata.raw.X, CSRBase) + adata.layers["counts"] = adata.raw.X.toarray(order="C") + mapper = dict( + percent_mito="pct_counts_mt", + n_counts="total_counts", + ) + adata.obs.rename(columns=mapper, inplace=True) + return adata + + +def pbmc68k_reduced() -> AnnData: + return _pbmc68k_reduced().copy() + + +@cache +def _pbmc3k() -> AnnData: + adata = sc.datasets.pbmc3k() + assert isinstance(adata.X, CSRBase) + adata.layers["counts"] = adata.X.astype(np.int32, copy=True) + sc.pp.log1p(adata) + return adata + + +def pbmc3k() -> AnnData: + return _pbmc3k().copy() + + +@cache +def _bmmc(n_obs: int = 4000) -> AnnData: + registry = pooch.create( + path=pooch.os_cache("pooch"), + base_url="doi:10.6084/m9.figshare.22716739.v1/", + ) + registry.load_registry_from_doi() + samples = {smp: f"{smp}_filtered_feature_bc_matrix.h5" for smp in ("s1d1", "s1d3")} + adatas = {} + + for sample_id, filename in samples.items(): + path = registry.fetch(filename) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", r"Variable names are not unique") + sample_adata = sc.read_10x_h5(path) + sample_adata.var_names_make_unique() + sc.pp.subsample(sample_adata, n_obs=n_obs // len(samples)) + adatas[sample_id] = sample_adata + + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", r"Observation names are not unique") + adata = concat(adatas, label="sample") + adata.obs_names_make_unique() + + assert isinstance(adata.X, CSRBase) + adata.layers["counts"] = adata.X.astype(np.int32, copy=True) + sc.pp.log1p(adata) + adata.obs["n_counts"] = adata.layers["counts"].sum(axis=1).A1 + return adata + + +def bmmc(n_obs: int = 400) -> AnnData: + return _bmmc(n_obs).copy() + + +@cache +def _lung93k() -> 
AnnData: + path = pooch.retrieve( + url="https://figshare.com/ndownloader/files/45788454", + known_hash="md5:4f28af5ff226052443e7e0b39f3f9212", + ) + adata = sc.read_h5ad(path) + assert isinstance(adata.X, CSRBase) + adata.layers["counts"] = adata.X.astype(np.int32, copy=True) + sc.pp.log1p(adata) + return adata + + +def lung93k() -> AnnData: + return _lung93k().copy() + + +def to_off_axis(x: np.ndarray | CSRBase) -> np.ndarray | CSCBase: + if isinstance(x, CSRBase): + return x.tocsc() + if isinstance(x, np.ndarray): + assert not np.isfortran(x) + return x.copy(order="F") + msg = f"Unexpected type {type(x)}" + raise TypeError(msg) + + +def _get_dataset_raw(dataset: Dataset) -> tuple[AnnData, str | None]: + match dataset: + case "pbmc68k_reduced": + adata, batch_key = pbmc68k_reduced(), None + case "pbmc3k": + adata, batch_key = pbmc3k(), None # can’t use this with batches + case "bmmc": + # TODO: allow specifying bigger variant + adata, batch_key = bmmc(400), "sample" + case "lung93k": + adata, batch_key = lung93k(), "PatientNumber" + case _: + msg = f"Unknown dataset {dataset}" + raise AssertionError(msg) + + # add off-axis layers + adata.layers["off-axis"] = to_off_axis(adata.X) + adata.layers["counts-off-axis"] = to_off_axis(adata.layers["counts"]) + + # add mitochondrial gene and pre-compute qc metrics + adata.var["mt"] = adata.var_names.str.startswith("MT-") + assert adata.var["mt"].sum() > 0, "no MT genes in dataset" + sc.pp.calculate_qc_metrics( + adata, qc_vars=["mt"], percent_top=None, log1p=False, inplace=True + ) + + return adata, batch_key + + +def get_dataset(dataset: Dataset, *, layer: KeyX = None) -> tuple[AnnData, str | None]: + adata, batch_key = _get_dataset_raw(dataset) + if layer is not None: + adata.X = adata.layers.pop(layer) + return adata, batch_key + + +def get_count_dataset( + dataset: Dataset, *, layer: KeyCount = "counts" +) -> tuple[AnnData, str | None]: + adata, batch_key = _get_dataset_raw(dataset) + + adata.X = adata.layers.pop(layer) + # remove indicators that X was transformed + adata.uns.pop("log1p", None) + + return adata, batch_key + + +def param_skipper( + param_names: Sequence[str], params: tuple[Sequence[object], ...] +) -> ParamSkipper: + """Create a decorator that will skip all combinations that contain any of the given parameters. + + Examples + -------- + >>> param_names = ["letters", "numbers"] + >>> params = [["a", "b"], [3, 4, 5]] + >>> skip_when = param_skipper(param_names, params) + + >>> @skip_when(letters={"a"}, numbers={3}) + ... def func(a, b): + ... print(a, b) + >>> run_as_asv_benchmark(func) + b 4 + b 5 + + """ + + def skip(**skipped: AbstractSet) -> Callable[[C], C]: + skipped_combs = [ + tuple(record.values()) + for record in ( + dict(zip(param_names, vals, strict=True)) + for vals in itertools.product(*params) + ) + if any(v in skipped.get(n, set()) for n, v in record.items()) + ] + # print(skipped_combs, file=sys.stderr) + return skip_for_params(skipped_combs) + + return skip diff --git a/benchmarks/benchmarks/co_occurence.py b/benchmarks/benchmarks/co_occurence.py new file mode 100644 index 000000000..dd62144ff --- /dev/null +++ b/benchmarks/benchmarks/co_occurence.py @@ -0,0 +1,34 @@ +"""Benchmark tool operations in Squidpy. + +API documentation: . 
+""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import scanpy as sc +import squidpy as sq + +from ._utils import pbmc68k_reduced + +if TYPE_CHECKING: + from anndata import AnnData + +# setup variables + +adata: AnnData + + +def setup(): + global adata # noqa: PLW0603 + adata = sq.datasets.imc() + + +def time_co_occurrence(): + sq.gr.co_occurrence(adata, cluster_key="cell type") + + +def peakmem_co_occurrence(): + sq.gr.co_occurrence(adata, cluster_key="cell type") + From 18a767cd160d8580bdac990878bcd15ced724902 Mon Sep 17 00:00:00 2001 From: selmanozleyen Date: Thu, 10 Jul 2025 15:14:07 +0200 Subject: [PATCH 18/18] add the array order and stuff later. Also sparse vs dense later maybe --- benchmarks/README.md | 7 +- benchmarks/asv.conf.json | 5 +- benchmarks/benchmarks/_utils.py | 216 -------------------------- benchmarks/benchmarks/co_occurence.py | 2 - 4 files changed, 7 insertions(+), 223 deletions(-) delete mode 100644 benchmarks/benchmarks/_utils.py diff --git a/benchmarks/README.md b/benchmarks/README.md index 3546e79ea..8a0bf5270 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -1,16 +1,17 @@ -# Scanpy Benchmarks +# Squidpy Benchmarks -This directory contains code for benchmarking Scanpy using [asv][]. +This directory contains code for benchmarking Squidpy using [asv][]. The functionality is checked using the [`benchmark.yml`][] workflow. Benchmarks are run using the [benchmark bot][]. [asv]: https://asv.readthedocs.io/ [`benchmark.yml`]: ../.github/workflows/benchmark.yml -[benchmark bot]: https://github.com/apps/scverse-benchmark +[benchmark bot]: https://github.com/apps/scverse-benchmark # TODO ## Data processing in benchmarks +# TODO Each dataset is processed so it has - `.layers['counts']` (containing data in C/row-major format) and `.layers['counts-off-axis']` (containing data in FORTRAN/column-major format) diff --git a/benchmarks/asv.conf.json b/benchmarks/asv.conf.json index 907a6292e..c48b03c81 100644 --- a/benchmarks/asv.conf.json +++ b/benchmarks/asv.conf.json @@ -78,9 +78,10 @@ "natsort": [""], "pandas": [""], "memory_profiler": [""], - "zarr": ["2.18.4"], + // "zarr": ["2.18.4"], "pytest": [""], - "scanpy": [""], + // "scanpy": [""], + "squidpy": [""], "python-igraph": [""], // "psutil": [""] "pooch": [""], diff --git a/benchmarks/benchmarks/_utils.py b/benchmarks/benchmarks/_utils.py deleted file mode 100644 index e27e7e70d..000000000 --- a/benchmarks/benchmarks/_utils.py +++ /dev/null @@ -1,216 +0,0 @@ -from __future__ import annotations - -import itertools -import warnings -from functools import cache -from typing import TYPE_CHECKING - -import numpy as np -import pooch -from anndata import concat -from asv_runner.benchmarks.mark import skip_for_params -import squidpy as sq - -import scanpy as sc -from scanpy._compat import CSRBase - -if TYPE_CHECKING: - from collections.abc import Callable, Sequence - from collections.abc import Set as AbstractSet - from typing import Literal, Protocol, TypeVar - - from anndata import AnnData - - from scanpy._compat import CSCBase - - C = TypeVar("C", bound=Callable) - - class ParamSkipper(Protocol): - def __call__(self, **skipped: AbstractSet) -> Callable[[C], C]: ... 
- - Dataset = Literal["pbmc68k_reduced", "pbmc3k", "bmmc", "lung93k"] - KeyX = Literal[None, "off-axis"] - KeyCount = Literal["counts", "counts-off-axis"] - - - - -@cache -def _pbmc68k_reduced() -> AnnData: - """A small datasets with a dense `.X`.""" # noqa: D401 - adata = sc.datasets.pbmc68k_reduced() - assert isinstance(adata.X, np.ndarray) - assert not np.isfortran(adata.X) - - # raw has the same number of genes, so we can use it for counts - # it doesn’t actually contain counts for some reason, but close enough - assert isinstance(adata.raw.X, CSRBase) - adata.layers["counts"] = adata.raw.X.toarray(order="C") - mapper = dict( - percent_mito="pct_counts_mt", - n_counts="total_counts", - ) - adata.obs.rename(columns=mapper, inplace=True) - return adata - - -def pbmc68k_reduced() -> AnnData: - return _pbmc68k_reduced().copy() - - -@cache -def _pbmc3k() -> AnnData: - adata = sc.datasets.pbmc3k() - assert isinstance(adata.X, CSRBase) - adata.layers["counts"] = adata.X.astype(np.int32, copy=True) - sc.pp.log1p(adata) - return adata - - -def pbmc3k() -> AnnData: - return _pbmc3k().copy() - - -@cache -def _bmmc(n_obs: int = 4000) -> AnnData: - registry = pooch.create( - path=pooch.os_cache("pooch"), - base_url="doi:10.6084/m9.figshare.22716739.v1/", - ) - registry.load_registry_from_doi() - samples = {smp: f"{smp}_filtered_feature_bc_matrix.h5" for smp in ("s1d1", "s1d3")} - adatas = {} - - for sample_id, filename in samples.items(): - path = registry.fetch(filename) - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", r"Variable names are not unique") - sample_adata = sc.read_10x_h5(path) - sample_adata.var_names_make_unique() - sc.pp.subsample(sample_adata, n_obs=n_obs // len(samples)) - adatas[sample_id] = sample_adata - - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", r"Observation names are not unique") - adata = concat(adatas, label="sample") - adata.obs_names_make_unique() - - assert isinstance(adata.X, CSRBase) - adata.layers["counts"] = adata.X.astype(np.int32, copy=True) - sc.pp.log1p(adata) - adata.obs["n_counts"] = adata.layers["counts"].sum(axis=1).A1 - return adata - - -def bmmc(n_obs: int = 400) -> AnnData: - return _bmmc(n_obs).copy() - - -@cache -def _lung93k() -> AnnData: - path = pooch.retrieve( - url="https://figshare.com/ndownloader/files/45788454", - known_hash="md5:4f28af5ff226052443e7e0b39f3f9212", - ) - adata = sc.read_h5ad(path) - assert isinstance(adata.X, CSRBase) - adata.layers["counts"] = adata.X.astype(np.int32, copy=True) - sc.pp.log1p(adata) - return adata - - -def lung93k() -> AnnData: - return _lung93k().copy() - - -def to_off_axis(x: np.ndarray | CSRBase) -> np.ndarray | CSCBase: - if isinstance(x, CSRBase): - return x.tocsc() - if isinstance(x, np.ndarray): - assert not np.isfortran(x) - return x.copy(order="F") - msg = f"Unexpected type {type(x)}" - raise TypeError(msg) - - -def _get_dataset_raw(dataset: Dataset) -> tuple[AnnData, str | None]: - match dataset: - case "pbmc68k_reduced": - adata, batch_key = pbmc68k_reduced(), None - case "pbmc3k": - adata, batch_key = pbmc3k(), None # can’t use this with batches - case "bmmc": - # TODO: allow specifying bigger variant - adata, batch_key = bmmc(400), "sample" - case "lung93k": - adata, batch_key = lung93k(), "PatientNumber" - case _: - msg = f"Unknown dataset {dataset}" - raise AssertionError(msg) - - # add off-axis layers - adata.layers["off-axis"] = to_off_axis(adata.X) - adata.layers["counts-off-axis"] = to_off_axis(adata.layers["counts"]) - - # add mitochondrial gene 
and pre-compute qc metrics - adata.var["mt"] = adata.var_names.str.startswith("MT-") - assert adata.var["mt"].sum() > 0, "no MT genes in dataset" - sc.pp.calculate_qc_metrics( - adata, qc_vars=["mt"], percent_top=None, log1p=False, inplace=True - ) - - return adata, batch_key - - -def get_dataset(dataset: Dataset, *, layer: KeyX = None) -> tuple[AnnData, str | None]: - adata, batch_key = _get_dataset_raw(dataset) - if layer is not None: - adata.X = adata.layers.pop(layer) - return adata, batch_key - - -def get_count_dataset( - dataset: Dataset, *, layer: KeyCount = "counts" -) -> tuple[AnnData, str | None]: - adata, batch_key = _get_dataset_raw(dataset) - - adata.X = adata.layers.pop(layer) - # remove indicators that X was transformed - adata.uns.pop("log1p", None) - - return adata, batch_key - - -def param_skipper( - param_names: Sequence[str], params: tuple[Sequence[object], ...] -) -> ParamSkipper: - """Create a decorator that will skip all combinations that contain any of the given parameters. - - Examples - -------- - >>> param_names = ["letters", "numbers"] - >>> params = [["a", "b"], [3, 4, 5]] - >>> skip_when = param_skipper(param_names, params) - - >>> @skip_when(letters={"a"}, numbers={3}) - ... def func(a, b): - ... print(a, b) - >>> run_as_asv_benchmark(func) - b 4 - b 5 - - """ - - def skip(**skipped: AbstractSet) -> Callable[[C], C]: - skipped_combs = [ - tuple(record.values()) - for record in ( - dict(zip(param_names, vals, strict=True)) - for vals in itertools.product(*params) - ) - if any(v in skipped.get(n, set()) for n, v in record.items()) - ] - # print(skipped_combs, file=sys.stderr) - return skip_for_params(skipped_combs) - - return skip diff --git a/benchmarks/benchmarks/co_occurence.py b/benchmarks/benchmarks/co_occurence.py index dd62144ff..16263725b 100644 --- a/benchmarks/benchmarks/co_occurence.py +++ b/benchmarks/benchmarks/co_occurence.py @@ -7,10 +7,8 @@ from typing import TYPE_CHECKING -import scanpy as sc import squidpy as sq -from ._utils import pbmc68k_reduced if TYPE_CHECKING: from anndata import AnnData
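The "add the array order and stuff later" note in the last commit message leaves dataset and parameter sweeps for a follow-up. Should that follow-up materialise, ASV's `param_names`/`params` convention (already visible in the recorded benchmarks.json above) would be one way to express it. A minimal sketch, assuming the existing `n_splits` argument of `squidpy.gr.co_occurrence` as an illustrative axis to sweep; this is not part of the patch series:

    from __future__ import annotations

    import squidpy as sq


    class CoOccurrenceSuite:
        """Parametrized co-occurrence benchmarks; ASV runs each value in `params` as a separate case."""

        param_names = ["n_splits"]
        params = [[1, 2, 4]]

        def setup(self, n_splits: int) -> None:
            # Same dataset and cluster key as the module-level benchmarks above.
            self.adata = sq.datasets.imc()

        def time_co_occurrence(self, n_splits: int) -> None:
            sq.gr.co_occurrence(self.adata, cluster_key="cell type", n_splits=n_splits)

        def peakmem_co_occurrence(self, n_splits: int) -> None:
            sq.gr.co_occurrence(self.adata, cluster_key="cell type", n_splits=n_splits)

ASV passes each parameter value to `setup` and to the timed methods, so the class-based form needs no module-level globals.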