Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions src/squidpy/_docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,15 @@ def decorator2(obj: Any) -> Any:
_seed = """\
seed
Random seed for reproducibility."""
_seed_versionchanged = """\
.. versionchanged:: 1.8.4
Every permutation now draws from an independent seed spawned from a
:class:`numpy.random.SeedSequence`, following `NumPy's guidance on parallel random
number generation <https://numpy.org/doc/stable/reference/random/parallel.html>`_.
Consequently the permutation-based results no longer depend on ``n_jobs`` / ``backend``, but
results obtained with a given ``seed`` differ from those produced by squidpy < 1.8.4.
See `#1232 <https://github.com/scverse/squidpy/issues/1232>`_ and
`#1233 <https://github.com/scverse/squidpy/issues/1233>`_."""
_n_perms = """\
n_perms
Number of permutations for the permutation test."""
Expand Down Expand Up @@ -413,6 +422,7 @@ def decorator2(obj: Any) -> Any:
copy_cont=_copy_cont,
numba_parallel=_numba_parallel,
seed=_seed,
seed_versionchanged=_seed_versionchanged,
n_perms=_n_perms,
img_layer=_img_layer,
feature_name=_feature_name,
Expand Down
7 changes: 7 additions & 0 deletions src/squidpy/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,13 @@ def wrapper(*args: Any, **kwargs: Any) -> Any:
return wrapper


def _spawn_seeds(seed: int | None, n: int) -> list[int]:
# uint32 integer seeds, derived from independent SeedSequence children (avoids the correlation
# of sequential seeds). uint32 is required to stay reproducible with numba/legacy RandomState,
# which only accept uint32 integer seeds.
return [int(s.generate_state(1)[0]) for s in np.random.SeedSequence(seed).spawn(n)]


def thread_map(
fn: Callable[..., Any],
items: Sequence[Any],
Expand Down
33 changes: 23 additions & 10 deletions src/squidpy/gr/_ligrec.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,14 @@
from squidpy._constants._constants import ComplexPolicy, CorrAxis
from squidpy._constants._pkg_constants import Key
from squidpy._docs import d, inject_docs
from squidpy._utils import NDArrayA, Signal, SigQueue, _get_n_cores, parallelize
from squidpy._utils import (
NDArrayA,
Signal,
SigQueue,
_get_n_cores,
_spawn_seeds,
parallelize,
)
from squidpy._validators import assert_positive, check_tuple_needles
from squidpy.gr._utils import (
_assert_categorical_obs,
Expand Down Expand Up @@ -332,6 +339,8 @@ def test(
"""
Perform the permutation test as described in :cite:`cellphonedb`.

%(seed_versionchanged)s

Parameters
----------
%(cluster_key)s
Expand Down Expand Up @@ -753,6 +762,7 @@ def extractor(res: Sequence[TempResult]) -> TempResult:
# (n_cells, n_genes)
data = np.array(data[data.columns.difference(["clusters"])].values, dtype=np.float64, order="C")
# all 3 should be C contiguous
seeds = _spawn_seeds(seed, n_perms)
return parallelize( # type: ignore[no-any-return]
_analysis_helper,
np.arange(n_perms, dtype=np.int32).tolist(),
Expand All @@ -767,7 +777,7 @@ def extractor(res: Sequence[TempResult]) -> TempResult:
interactions,
interaction_clusters=interaction_clusters,
clustering=clustering,
seed=seed,
seeds=seeds,
numba_parallel=numba_parallel,
)

Expand All @@ -780,7 +790,7 @@ def _analysis_helper(
interactions: NDArrayA,
interaction_clusters: NDArrayA,
clustering: NDArrayA,
seed: int | None = None,
seeds: Sequence[int],
numba_parallel: bool | None = None,
queue: SigQueue | None = None,
) -> TempResult:
Expand All @@ -790,7 +800,7 @@ def _analysis_helper(
Parameters
----------
perms
Permutation indices. Only used to set the ``seed``.
Permutation indices. Used to index ``seeds`` and to decide which worker returns the means.
data
Array of shape `(n_cells, n_genes)`.
mean
Expand All @@ -804,8 +814,8 @@ def _analysis_helper(
Array of shape `(n_interaction_clusters, 2)`.
clustering
Array of shape `(n_cells,)` containing the original clustering.
seed
Random seed for :class:`numpy.random.RandomState`.
seeds
One ``uint32`` integer seed per permutation, indexed by the values in ``perms``.
numba_parallel
Whether to use :func:`numba.prange` or not. If `None`, it's determined automatically.
queue
Expand All @@ -820,9 +830,8 @@ def _analysis_helper(
- `'pvalues'` - array of shape `(n_interactions, n_interaction_clusters)` containing `np.sum(T0 > T)`
where `T0` is the test statistic under null hypothesis and `T` is the true test statistic.
"""
rs = np.random.RandomState(None if seed is None else perms[0] + seed)

clustering = clustering.copy()
# used as a read-only base; each permutation shuffles its own copy (see the loop below)
clustering_base = clustering.copy()
n_cls = mean.shape[1]
return_means = np.min(perms) == 0

Expand All @@ -848,7 +857,11 @@ def _analysis_helper(
res_means = None
test = _test

for _ in perms:
for p in perms:
# shuffle from the same base with a per-permutation seed, so each permutation is
# independent of the others and of how the permutations are split across jobs
rs = np.random.RandomState(seeds[p])
clustering = clustering_base.copy()
rs.shuffle(clustering)
error = test(interactions, interaction_clusters, data, clustering, mean, mask, res=res)
if error:
Expand Down
31 changes: 22 additions & 9 deletions src/squidpy/gr/_nhood.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,14 @@
from squidpy._constants._constants import Centrality
from squidpy._constants._pkg_constants import Key
from squidpy._docs import d, inject_docs
from squidpy._utils import NDArrayA, Signal, SigQueue, _get_n_cores, parallelize
from squidpy._utils import (
NDArrayA,
Signal,
SigQueue,
_get_n_cores,
_spawn_seeds,
parallelize,
)
from squidpy._validators import assert_positive
from squidpy.gr._utils import (
_assert_categorical_obs,
Expand Down Expand Up @@ -153,6 +160,8 @@ def nhood_enrichment(
"""
Compute neighborhood enrichment by permutation test.

%(seed_versionchanged)s

Parameters
----------
%(adata)s
Expand Down Expand Up @@ -200,6 +209,7 @@ def nhood_enrichment(

n_jobs = _get_n_cores(n_jobs)
start = logg.info(f"Calculating neighborhood enrichment using `{n_jobs}` core(s)")
seeds = _spawn_seeds(seed, n_perms)

perms = parallelize(
_nhood_enrichment_helper,
Expand All @@ -215,7 +225,7 @@ def nhood_enrichment(
int_clust=int_clust,
libraries=libraries,
n_cls=n_cls,
seed=seed,
seeds=seeds,
)
zscore = (count - perms.mean(axis=0)) / perms.std(axis=0)

Expand Down Expand Up @@ -444,19 +454,22 @@ def _nhood_enrichment_helper(
int_clust: NDArrayA,
libraries: pd.Series[CategoricalDtype] | None,
n_cls: int,
seed: int | None = None,
seeds: Sequence[int],
queue: SigQueue | None = None,
) -> NDArrayA:
perms = np.empty((len(ixs), n_cls, n_cls), dtype=np.float64)
int_clust = int_clust.copy() # threading
rs = np.random.RandomState(seed=None if seed is None else seed + ixs[0])
int_clust = int_clust.copy() # threading; used as a read-only base for each permutation

for i in range(len(ixs)):
for i, ix in enumerate(ixs):
# shuffle from the same base with a per-permutation seed, so each permutation is
# independent of the others and of how the permutations are split across jobs
rs = np.random.RandomState(seeds[ix])
if libraries is not None:
int_clust = _shuffle_group(int_clust, libraries, rs)
shuffled = _shuffle_group(int_clust, libraries, rs)
else:
rs.shuffle(int_clust)
perms[i, ...] = callback(indices, indptr, int_clust)
shuffled = int_clust.copy()
rs.shuffle(shuffled)
perms[i, ...] = callback(indices, indptr, shuffled)

if queue is not None:
queue.put(Signal.UPDATE)
Expand Down
26 changes: 17 additions & 9 deletions src/squidpy/gr/_ppatterns.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
import pandas as pd
from anndata import AnnData
from numba import njit, prange
from numpy.random import default_rng
from scanpy import logging as logg
from scanpy.metrics import gearys_c, morans_i
from scipy import stats
Expand All @@ -23,7 +22,15 @@
from squidpy._constants._constants import SpatialAutocorr
from squidpy._constants._pkg_constants import Key
from squidpy._docs import d, inject_docs
from squidpy._utils import NDArrayA, Signal, SigQueue, _get_n_cores, deprecated_params, parallelize
from squidpy._utils import (
NDArrayA,
Signal,
SigQueue,
_get_n_cores,
_spawn_seeds,
deprecated_params,
parallelize,
)
from squidpy._validators import assert_key_in_adata, assert_positive
from squidpy.gr._utils import (
_assert_categorical_obs,
Expand Down Expand Up @@ -78,6 +85,8 @@ def spatial_autocorr(
(``'pval_sim'``, ``'pval_z_sim'``) are unaffected.
See `#1183 <https://github.com/scverse/squidpy/issues/1183>`_.

%(seed_versionchanged)s

Parameters
----------
%(adata)s
Expand Down Expand Up @@ -211,16 +220,16 @@ def extract_obsm(adata: AnnData, ixs: int | Sequence[int] | None) -> tuple[NDArr
if n_perms is not None:
assert_positive(n_perms, name="n_perms")
perms = list(np.arange(n_perms))
seeds = _spawn_seeds(seed, n_perms)

score_perms = parallelize(
_score_helper,
collection=perms,
extractor=np.concatenate,
use_ixs=True,
n_jobs=n_jobs,
backend=backend,
show_progress_bar=show_progress_bar,
)(mode=mode, g=g, vals=vals, seed=seed)
)(mode=mode, g=g, vals=vals, seeds=seeds)
else:
score_perms = None

Expand All @@ -247,20 +256,19 @@ def extract_obsm(adata: AnnData, ixs: int | Sequence[int] | None) -> tuple[NDArr


def _score_helper(
ix: int,
perms: Sequence[int],
mode: SpatialAutocorr,
g: spmatrix,
vals: NDArrayA,
seed: int | None = None,
seeds: Sequence[int],
queue: SigQueue | None = None,
) -> pd.DataFrame:
score_perms = np.empty((len(perms), vals.shape[0]))
rng = default_rng(None if seed is None else ix + seed)
func = morans_i if mode == SpatialAutocorr.MORAN else gearys_c

for i in range(len(perms)):
idx_shuffle = rng.permutation(g.shape[0])
for i, p in enumerate(perms):
rs = np.random.RandomState(seeds[p])
idx_shuffle = rs.permutation(g.shape[0])
score_perms[i, :] = func(g[idx_shuffle, :], vals)

if queue is not None:
Expand Down
32 changes: 23 additions & 9 deletions src/squidpy/gr/_ripley.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import numpy as np
import pandas as pd
from anndata import AnnData
from numpy.random import default_rng
from scanpy import logging as logg
from scipy.spatial import ConvexHull, Delaunay
from scipy.spatial.distance import pdist
Expand All @@ -18,7 +17,7 @@
from squidpy._constants._constants import RipleyStat
from squidpy._constants._pkg_constants import Key
from squidpy._docs import d, inject_docs
from squidpy._utils import NDArrayA
from squidpy._utils import NDArrayA, _spawn_seeds
from squidpy.gr._utils import _assert_categorical_obs, _assert_spatial_basis, _save_data, extract_adata_if_sdata

__all__ = ["ripley"]
Expand Down Expand Up @@ -48,6 +47,15 @@ def ripley(
According to the `'mode'` argument, it calculates one of the following Ripley's statistics:
`{rp.F.s!r}`, `{rp.G.s!r}` or `{rp.L.s!r}` statistics.

.. versionchanged:: 1.8.4
Every simulation now draws from an independent seed spawned from a
:class:`numpy.random.SeedSequence`, following `NumPy's guidance on parallel random
number generation <https://numpy.org/doc/stable/reference/random/parallel.html>`_.
Previously every simulation reused the same ``seed`` and therefore produced an identical
point pattern; the simulated statistics (and hence the p-values) are now genuinely
independent across simulations, but results obtained with a given ``seed`` differ from
those produced by squidpy < 1.8.4.

`{rp.F.s!r}`, `{rp.G.s!r}` are defined as:

.. math::
Expand Down Expand Up @@ -133,11 +141,12 @@ def ripley(
start = logg.info(
f"Calculating Ripley's {mode} statistic for `{le.classes_.shape[0]}` clusters and `{n_simulations}` simulations"
)
obs_seed, *sim_seeds = _spawn_seeds(seed, n_simulations + 1)

for i in np.arange(np.max(cluster_idx) + 1):
coord_c = coordinates[cluster_idx == i, :]
if mode == RipleyStat.F:
random = _ppp(hull, n_simulations=1, n_observations=n_observations, seed=seed)
random = _ppp(hull, n_simulations=1, n_observations=n_observations, seed=obs_seed)
tree_c = NearestNeighbors(metric=metric, n_neighbors=n_neigh).fit(coord_c)
distances, _ = tree_c.kneighbors(random, n_neighbors=n_neigh)
bins, obs_stats = _f_g_function(distances.squeeze(), support)
Expand All @@ -156,7 +165,7 @@ def ripley(
pvalues = np.ones((le.classes_.shape[0], len(bins)))

for i in range(n_simulations):
random_i = _ppp(hull, n_simulations=1, n_observations=n_observations, seed=seed)
random_i = _ppp(hull, n_simulations=1, n_observations=n_observations, seed=sim_seeds[i])
if mode == RipleyStat.F:
tree_i = NearestNeighbors(metric=metric, n_neighbors=n_neigh).fit(random_i)
distances_i, _ = tree_i.kneighbors(random, n_neighbors=1)
Expand Down Expand Up @@ -216,7 +225,12 @@ def _l_function(distances: NDArrayA, support: NDArrayA, n: int, area: float) ->
return support, l_estimate


def _ppp(hull: ConvexHull, n_simulations: int, n_observations: int, seed: int | None = None) -> NDArrayA:
def _ppp(
hull: ConvexHull,
n_simulations: int,
n_observations: int,
seed: int,
) -> NDArrayA:
"""
Simulate Poisson Point Process on a polygon.

Expand All @@ -229,13 +243,13 @@ def _ppp(hull: ConvexHull, n_simulations: int, n_observations: int, seed: int |
n_observations
Number of observations to sample from each simulation.
seed
Random seed.
``uint32`` integer seed for the legacy :class:`numpy.random.RandomState`.

Returns
-------
An Array with shape ``(n_simulation, n_observations, 2)``.
"""
rng = default_rng(None if seed is None else seed)
rs = np.random.RandomState(seed)
vxs = hull.points[hull.vertices]
deln = Delaunay(vxs)

Expand All @@ -246,8 +260,8 @@ def _ppp(hull: ConvexHull, n_simulations: int, n_observations: int, seed: int |
i_obs = 0
while i_obs < n_observations:
x, y = (
rng.uniform(bbox[0], bbox[2]),
rng.uniform(bbox[1], bbox[3]),
rs.uniform(bbox[0], bbox[2]),
rs.uniform(bbox[1], bbox[3]),
)
if deln.find_simplex((x, y)) >= 0:
result[i_sim, i_obs] = (x, y)
Expand Down
Binary file modified tests/_data/ligrec_no_numba.pickle
Binary file not shown.
9 changes: 9 additions & 0 deletions tests/graph/test_ligrec.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,15 @@ def test_reproducibility_cores(self, adata: AnnData, interactions: Interactions_
assert not np.allclose(r3["pvalues"], r1["pvalues"])
assert not np.allclose(r3["pvalues"], r2["pvalues"])

def test_n_jobs_invariance(self, adata: AnnData, interactions: Interactions_t):
"""The number of workers must not change the result (one seed is spawned per permutation)."""
kw = {"interactions": interactions, "n_perms": 25, "copy": True, "show_progress_bar": False, "seed": 42}
res_serial = ligrec(adata, _CK, n_jobs=1, **kw)
res_parallel = ligrec(adata, _CK, n_jobs=2, **kw)

np.testing.assert_allclose(res_serial["means"], res_parallel["means"])
np.testing.assert_allclose(res_serial["pvalues"], res_parallel["pvalues"])

def test_reproducibility_numba_parallel_off(self, adata: AnnData, interactions: Interactions_t):
t1 = time()
r1 = ligrec(
Expand Down
Loading
Loading