diff --git a/src/squidpy/_docs.py b/src/squidpy/_docs.py index 3915f658c..3c9020ed3 100644 --- a/src/squidpy/_docs.py +++ b/src/squidpy/_docs.py @@ -49,6 +49,15 @@ def decorator2(obj: Any) -> Any: _seed = """\ seed Random seed for reproducibility.""" +_seed_versionchanged = """\ +.. versionchanged:: 1.8.4 + Every permutation now draws from an independent seed spawned from a + :class:`numpy.random.SeedSequence`, following `NumPy's guidance on parallel random + number generation `_. + Consequently the permutation-based results no longer depend on ``n_jobs`` / ``backend``, but + results obtained with a given ``seed`` differ from those produced by squidpy < 1.8.4. + See `#1232 `_ and + `#1233 `_.""" _n_perms = """\ n_perms Number of permutations for the permutation test.""" @@ -413,6 +422,7 @@ def decorator2(obj: Any) -> Any: copy_cont=_copy_cont, numba_parallel=_numba_parallel, seed=_seed, + seed_versionchanged=_seed_versionchanged, n_perms=_n_perms, img_layer=_img_layer, feature_name=_feature_name, diff --git a/src/squidpy/_utils.py b/src/squidpy/_utils.py index 17e6a7a83..977784965 100644 --- a/src/squidpy/_utils.py +++ b/src/squidpy/_utils.py @@ -243,6 +243,13 @@ def wrapper(*args: Any, **kwargs: Any) -> Any: return wrapper +def _spawn_seeds(seed: int | None, n: int) -> list[int]: + # uint32 integer seeds, derived from independent SeedSequence children (avoids the correlation + # of sequential seeds). uint32 is required to stay reproducible with numba/legacy RandomState, + # which only accept uint32 integer seeds. + return [int(s.generate_state(1)[0]) for s in np.random.SeedSequence(seed).spawn(n)] + + def thread_map( fn: Callable[..., Any], items: Sequence[Any], diff --git a/src/squidpy/gr/_ligrec.py b/src/squidpy/gr/_ligrec.py index 1aecc85b0..81d2ddd3b 100644 --- a/src/squidpy/gr/_ligrec.py +++ b/src/squidpy/gr/_ligrec.py @@ -20,7 +20,14 @@ from squidpy._constants._constants import ComplexPolicy, CorrAxis from squidpy._constants._pkg_constants import Key from squidpy._docs import d, inject_docs -from squidpy._utils import NDArrayA, Signal, SigQueue, _get_n_cores, parallelize +from squidpy._utils import ( + NDArrayA, + Signal, + SigQueue, + _get_n_cores, + _spawn_seeds, + parallelize, +) from squidpy._validators import assert_positive, check_tuple_needles from squidpy.gr._utils import ( _assert_categorical_obs, @@ -332,6 +339,8 @@ def test( """ Perform the permutation test as described in :cite:`cellphonedb`. + %(seed_versionchanged)s + Parameters ---------- %(cluster_key)s @@ -753,6 +762,7 @@ def extractor(res: Sequence[TempResult]) -> TempResult: # (n_cells, n_genes) data = np.array(data[data.columns.difference(["clusters"])].values, dtype=np.float64, order="C") # all 3 should be C contiguous + seeds = _spawn_seeds(seed, n_perms) return parallelize( # type: ignore[no-any-return] _analysis_helper, np.arange(n_perms, dtype=np.int32).tolist(), @@ -767,7 +777,7 @@ def extractor(res: Sequence[TempResult]) -> TempResult: interactions, interaction_clusters=interaction_clusters, clustering=clustering, - seed=seed, + seeds=seeds, numba_parallel=numba_parallel, ) @@ -780,7 +790,7 @@ def _analysis_helper( interactions: NDArrayA, interaction_clusters: NDArrayA, clustering: NDArrayA, - seed: int | None = None, + seeds: Sequence[int], numba_parallel: bool | None = None, queue: SigQueue | None = None, ) -> TempResult: @@ -790,7 +800,7 @@ def _analysis_helper( Parameters ---------- perms - Permutation indices. Only used to set the ``seed``. + Permutation indices. Used to index ``seeds`` and to decide which worker returns the means. data Array of shape `(n_cells, n_genes)`. mean @@ -804,8 +814,8 @@ def _analysis_helper( Array of shape `(n_interaction_clusters, 2)`. clustering Array of shape `(n_cells,)` containing the original clustering. - seed - Random seed for :class:`numpy.random.RandomState`. + seeds + One ``uint32`` integer seed per permutation, indexed by the values in ``perms``. numba_parallel Whether to use :func:`numba.prange` or not. If `None`, it's determined automatically. queue @@ -820,9 +830,8 @@ def _analysis_helper( - `'pvalues'` - array of shape `(n_interactions, n_interaction_clusters)` containing `np.sum(T0 > T)` where `T0` is the test statistic under null hypothesis and `T` is the true test statistic. """ - rs = np.random.RandomState(None if seed is None else perms[0] + seed) - - clustering = clustering.copy() + # used as a read-only base; each permutation shuffles its own copy (see the loop below) + clustering_base = clustering.copy() n_cls = mean.shape[1] return_means = np.min(perms) == 0 @@ -848,7 +857,11 @@ def _analysis_helper( res_means = None test = _test - for _ in perms: + for p in perms: + # shuffle from the same base with a per-permutation seed, so each permutation is + # independent of the others and of how the permutations are split across jobs + rs = np.random.RandomState(seeds[p]) + clustering = clustering_base.copy() rs.shuffle(clustering) error = test(interactions, interaction_clusters, data, clustering, mean, mask, res=res) if error: diff --git a/src/squidpy/gr/_nhood.py b/src/squidpy/gr/_nhood.py index c679f42dd..17aa5d2a1 100644 --- a/src/squidpy/gr/_nhood.py +++ b/src/squidpy/gr/_nhood.py @@ -20,7 +20,14 @@ from squidpy._constants._constants import Centrality from squidpy._constants._pkg_constants import Key from squidpy._docs import d, inject_docs -from squidpy._utils import NDArrayA, Signal, SigQueue, _get_n_cores, parallelize +from squidpy._utils import ( + NDArrayA, + Signal, + SigQueue, + _get_n_cores, + _spawn_seeds, + parallelize, +) from squidpy._validators import assert_positive from squidpy.gr._utils import ( _assert_categorical_obs, @@ -153,6 +160,8 @@ def nhood_enrichment( """ Compute neighborhood enrichment by permutation test. + %(seed_versionchanged)s + Parameters ---------- %(adata)s @@ -200,6 +209,7 @@ def nhood_enrichment( n_jobs = _get_n_cores(n_jobs) start = logg.info(f"Calculating neighborhood enrichment using `{n_jobs}` core(s)") + seeds = _spawn_seeds(seed, n_perms) perms = parallelize( _nhood_enrichment_helper, @@ -215,7 +225,7 @@ def nhood_enrichment( int_clust=int_clust, libraries=libraries, n_cls=n_cls, - seed=seed, + seeds=seeds, ) zscore = (count - perms.mean(axis=0)) / perms.std(axis=0) @@ -444,19 +454,22 @@ def _nhood_enrichment_helper( int_clust: NDArrayA, libraries: pd.Series[CategoricalDtype] | None, n_cls: int, - seed: int | None = None, + seeds: Sequence[int], queue: SigQueue | None = None, ) -> NDArrayA: perms = np.empty((len(ixs), n_cls, n_cls), dtype=np.float64) - int_clust = int_clust.copy() # threading - rs = np.random.RandomState(seed=None if seed is None else seed + ixs[0]) + int_clust = int_clust.copy() # threading; used as a read-only base for each permutation - for i in range(len(ixs)): + for i, ix in enumerate(ixs): + # shuffle from the same base with a per-permutation seed, so each permutation is + # independent of the others and of how the permutations are split across jobs + rs = np.random.RandomState(seeds[ix]) if libraries is not None: - int_clust = _shuffle_group(int_clust, libraries, rs) + shuffled = _shuffle_group(int_clust, libraries, rs) else: - rs.shuffle(int_clust) - perms[i, ...] = callback(indices, indptr, int_clust) + shuffled = int_clust.copy() + rs.shuffle(shuffled) + perms[i, ...] = callback(indices, indptr, shuffled) if queue is not None: queue.put(Signal.UPDATE) diff --git a/src/squidpy/gr/_ppatterns.py b/src/squidpy/gr/_ppatterns.py index 7b786cc39..8e17845e8 100644 --- a/src/squidpy/gr/_ppatterns.py +++ b/src/squidpy/gr/_ppatterns.py @@ -10,7 +10,6 @@ import pandas as pd from anndata import AnnData from numba import njit, prange -from numpy.random import default_rng from scanpy import logging as logg from scanpy.metrics import gearys_c, morans_i from scipy import stats @@ -23,7 +22,15 @@ from squidpy._constants._constants import SpatialAutocorr from squidpy._constants._pkg_constants import Key from squidpy._docs import d, inject_docs -from squidpy._utils import NDArrayA, Signal, SigQueue, _get_n_cores, deprecated_params, parallelize +from squidpy._utils import ( + NDArrayA, + Signal, + SigQueue, + _get_n_cores, + _spawn_seeds, + deprecated_params, + parallelize, +) from squidpy._validators import assert_key_in_adata, assert_positive from squidpy.gr._utils import ( _assert_categorical_obs, @@ -78,6 +85,8 @@ def spatial_autocorr( (``'pval_sim'``, ``'pval_z_sim'``) are unaffected. See `#1183 `_. + %(seed_versionchanged)s + Parameters ---------- %(adata)s @@ -211,16 +220,16 @@ def extract_obsm(adata: AnnData, ixs: int | Sequence[int] | None) -> tuple[NDArr if n_perms is not None: assert_positive(n_perms, name="n_perms") perms = list(np.arange(n_perms)) + seeds = _spawn_seeds(seed, n_perms) score_perms = parallelize( _score_helper, collection=perms, extractor=np.concatenate, - use_ixs=True, n_jobs=n_jobs, backend=backend, show_progress_bar=show_progress_bar, - )(mode=mode, g=g, vals=vals, seed=seed) + )(mode=mode, g=g, vals=vals, seeds=seeds) else: score_perms = None @@ -247,20 +256,19 @@ def extract_obsm(adata: AnnData, ixs: int | Sequence[int] | None) -> tuple[NDArr def _score_helper( - ix: int, perms: Sequence[int], mode: SpatialAutocorr, g: spmatrix, vals: NDArrayA, - seed: int | None = None, + seeds: Sequence[int], queue: SigQueue | None = None, ) -> pd.DataFrame: score_perms = np.empty((len(perms), vals.shape[0])) - rng = default_rng(None if seed is None else ix + seed) func = morans_i if mode == SpatialAutocorr.MORAN else gearys_c - for i in range(len(perms)): - idx_shuffle = rng.permutation(g.shape[0]) + for i, p in enumerate(perms): + rs = np.random.RandomState(seeds[p]) + idx_shuffle = rs.permutation(g.shape[0]) score_perms[i, :] = func(g[idx_shuffle, :], vals) if queue is not None: diff --git a/src/squidpy/gr/_ripley.py b/src/squidpy/gr/_ripley.py index 7c92ae357..44d00c711 100644 --- a/src/squidpy/gr/_ripley.py +++ b/src/squidpy/gr/_ripley.py @@ -7,7 +7,6 @@ import numpy as np import pandas as pd from anndata import AnnData -from numpy.random import default_rng from scanpy import logging as logg from scipy.spatial import ConvexHull, Delaunay from scipy.spatial.distance import pdist @@ -18,7 +17,7 @@ from squidpy._constants._constants import RipleyStat from squidpy._constants._pkg_constants import Key from squidpy._docs import d, inject_docs -from squidpy._utils import NDArrayA +from squidpy._utils import NDArrayA, _spawn_seeds from squidpy.gr._utils import _assert_categorical_obs, _assert_spatial_basis, _save_data, extract_adata_if_sdata __all__ = ["ripley"] @@ -48,6 +47,15 @@ def ripley( According to the `'mode'` argument, it calculates one of the following Ripley's statistics: `{rp.F.s!r}`, `{rp.G.s!r}` or `{rp.L.s!r}` statistics. + .. versionchanged:: 1.8.4 + Every simulation now draws from an independent seed spawned from a + :class:`numpy.random.SeedSequence`, following `NumPy's guidance on parallel random + number generation `_. + Previously every simulation reused the same ``seed`` and therefore produced an identical + point pattern; the simulated statistics (and hence the p-values) are now genuinely + independent across simulations, but results obtained with a given ``seed`` differ from + those produced by squidpy < 1.8.4. + `{rp.F.s!r}`, `{rp.G.s!r}` are defined as: .. math:: @@ -133,11 +141,12 @@ def ripley( start = logg.info( f"Calculating Ripley's {mode} statistic for `{le.classes_.shape[0]}` clusters and `{n_simulations}` simulations" ) + obs_seed, *sim_seeds = _spawn_seeds(seed, n_simulations + 1) for i in np.arange(np.max(cluster_idx) + 1): coord_c = coordinates[cluster_idx == i, :] if mode == RipleyStat.F: - random = _ppp(hull, n_simulations=1, n_observations=n_observations, seed=seed) + random = _ppp(hull, n_simulations=1, n_observations=n_observations, seed=obs_seed) tree_c = NearestNeighbors(metric=metric, n_neighbors=n_neigh).fit(coord_c) distances, _ = tree_c.kneighbors(random, n_neighbors=n_neigh) bins, obs_stats = _f_g_function(distances.squeeze(), support) @@ -156,7 +165,7 @@ def ripley( pvalues = np.ones((le.classes_.shape[0], len(bins))) for i in range(n_simulations): - random_i = _ppp(hull, n_simulations=1, n_observations=n_observations, seed=seed) + random_i = _ppp(hull, n_simulations=1, n_observations=n_observations, seed=sim_seeds[i]) if mode == RipleyStat.F: tree_i = NearestNeighbors(metric=metric, n_neighbors=n_neigh).fit(random_i) distances_i, _ = tree_i.kneighbors(random, n_neighbors=1) @@ -216,7 +225,12 @@ def _l_function(distances: NDArrayA, support: NDArrayA, n: int, area: float) -> return support, l_estimate -def _ppp(hull: ConvexHull, n_simulations: int, n_observations: int, seed: int | None = None) -> NDArrayA: +def _ppp( + hull: ConvexHull, + n_simulations: int, + n_observations: int, + seed: int, +) -> NDArrayA: """ Simulate Poisson Point Process on a polygon. @@ -229,13 +243,13 @@ def _ppp(hull: ConvexHull, n_simulations: int, n_observations: int, seed: int | n_observations Number of observations to sample from each simulation. seed - Random seed. + ``uint32`` integer seed for the legacy :class:`numpy.random.RandomState`. Returns ------- An Array with shape ``(n_simulation, n_observations, 2)``. """ - rng = default_rng(None if seed is None else seed) + rs = np.random.RandomState(seed) vxs = hull.points[hull.vertices] deln = Delaunay(vxs) @@ -246,8 +260,8 @@ def _ppp(hull: ConvexHull, n_simulations: int, n_observations: int, seed: int | i_obs = 0 while i_obs < n_observations: x, y = ( - rng.uniform(bbox[0], bbox[2]), - rng.uniform(bbox[1], bbox[3]), + rs.uniform(bbox[0], bbox[2]), + rs.uniform(bbox[1], bbox[3]), ) if deln.find_simplex((x, y)) >= 0: result[i_sim, i_obs] = (x, y) diff --git a/tests/_data/ligrec_no_numba.pickle b/tests/_data/ligrec_no_numba.pickle index e83cc26ae..138fc8545 100644 Binary files a/tests/_data/ligrec_no_numba.pickle and b/tests/_data/ligrec_no_numba.pickle differ diff --git a/tests/graph/test_ligrec.py b/tests/graph/test_ligrec.py index 7d484b7d9..a72748faa 100644 --- a/tests/graph/test_ligrec.py +++ b/tests/graph/test_ligrec.py @@ -305,6 +305,15 @@ def test_reproducibility_cores(self, adata: AnnData, interactions: Interactions_ assert not np.allclose(r3["pvalues"], r1["pvalues"]) assert not np.allclose(r3["pvalues"], r2["pvalues"]) + def test_n_jobs_invariance(self, adata: AnnData, interactions: Interactions_t): + """The number of workers must not change the result (one seed is spawned per permutation).""" + kw = {"interactions": interactions, "n_perms": 25, "copy": True, "show_progress_bar": False, "seed": 42} + res_serial = ligrec(adata, _CK, n_jobs=1, **kw) + res_parallel = ligrec(adata, _CK, n_jobs=2, **kw) + + np.testing.assert_allclose(res_serial["means"], res_parallel["means"]) + np.testing.assert_allclose(res_serial["pvalues"], res_parallel["pvalues"]) + def test_reproducibility_numba_parallel_off(self, adata: AnnData, interactions: Interactions_t): t1 = time() r1 = ligrec( diff --git a/tests/graph/test_nhood.py b/tests/graph/test_nhood.py index 74764f236..814305404 100644 --- a/tests/graph/test_nhood.py +++ b/tests/graph/test_nhood.py @@ -58,6 +58,17 @@ def test_reproducibility(self, adata: AnnData, n_jobs: int): np.testing.assert_array_equal(res3.zscore, res2.zscore) np.testing.assert_array_equal(res3.counts, res2.counts) + def test_n_jobs_invariance(self, adata: AnnData): + """The number of workers must not change the result (one seed is spawned per permutation).""" + spatial_neighbors(adata) + + kw = {"cluster_key": _CK, "seed": 42, "n_perms": 20, "copy": True} + res_serial = nhood_enrichment(adata, n_jobs=1, **kw) + res_parallel = nhood_enrichment(adata, n_jobs=2, **kw) + + np.testing.assert_array_equal(res_serial.zscore, res_parallel.zscore) + np.testing.assert_array_equal(res_serial.counts, res_parallel.counts) + def test_centrality_scores(nhood_data: AnnData): adata = nhood_data diff --git a/tests/graph/test_ppatterns.py b/tests/graph/test_ppatterns.py index 158ae6f5a..8cc10ca12 100644 --- a/tests/graph/test_ppatterns.py +++ b/tests/graph/test_ppatterns.py @@ -46,9 +46,11 @@ def test_spatial_autocorr_seq_par(dummy_adata: AnnData, mode: str): assert not np.array_equal(idx_df, idx_adata) np.testing.assert_array_equal(sorted(idx_df), sorted(idx_adata)) # check parallel gives same results - with pytest.raises(AssertionError, match=r'.*\(column name="pval_z_sim"\) are different.*'): - # because the seeds will be different, we don't expect the pval_sim values to be the same - assert_frame_equal(df, df_parallel) + # each permutation now gets its own seed (spawned from a SeedSequence), so the + # simulated p-values no longer depend on how the permutations are split across jobs + np.testing.assert_allclose(df["pval_sim"].values, df_parallel["pval_sim"].values, atol=1e-12) + np.testing.assert_allclose(df["pval_z_sim"].values, df_parallel["pval_z_sim"].values, atol=1e-12) + np.testing.assert_allclose(df["var_sim"].values, df_parallel["var_sim"].values, atol=1e-12) @pytest.mark.parametrize("mode", ["moran", "geary"]) @@ -91,6 +93,18 @@ def test_spatial_autocorr_reproducibility(dummy_adata: AnnData, n_jobs: int, mod @pytest.mark.parametrize("mode", ["moran", "geary"]) +def test_spatial_autocorr_n_jobs_invariance(dummy_adata: AnnData, mode: str): + """The number of workers must not change the permutation-based results (seed spawned per permutation).""" + kw = {"mode": mode, "copy": True, "seed": 42, "n_perms": 50} + df_serial = spatial_autocorr(dummy_adata, n_jobs=1, **kw) + df_parallel = spatial_autocorr(dummy_adata, n_jobs=2, **kw) + + # align on the gene index in case the stat-based sort order ties differently + df_parallel = df_parallel.loc[df_serial.index] + for col in ["pval_sim", "pval_z_sim", "var_sim"]: + np.testing.assert_allclose(df_serial[col].values, df_parallel[col].values, atol=1e-12) + + def test_spatial_autocorr_var_norm_formula(dummy_adata: AnnData, mode: str): """Analytic ``var_norm`` must use the variance matching the chosen statistic. diff --git a/tests/graph/test_ripley.py b/tests/graph/test_ripley.py index 750acbff2..a87e01b23 100644 --- a/tests/graph/test_ripley.py +++ b/tests/graph/test_ripley.py @@ -91,3 +91,26 @@ def test_ripley_results( # assert n_zeros == n_clusters idx = np.nonzero(obs_df.bins.values)[0] assert idx.shape[0] == n_steps * n_clusters - n_clusters + + +@pytest.mark.parametrize("mode", [RipleyStat.F, RipleyStat.G, RipleyStat.L]) +def test_ripley_seed(adata_ripley: AnnData, mode: RipleyStat): + """Same seed reproduces simulations, different seeds change them, and simulations are not all identical.""" + adata = adata_ripley + kw = {"cluster_key": CLUSTER_KEY, "mode": mode.s, "n_simulations": 20, "copy": True} + + res1 = ripley(adata, seed=42, **kw) + res2 = ripley(adata, seed=42, **kw) + res3 = ripley(adata, seed=43, **kw) + + sims1 = res1["sims_stat"].pivot(index="bins", columns="simulations", values="stats").to_numpy() + sims2 = res2["sims_stat"].pivot(index="bins", columns="simulations", values="stats").to_numpy() + sims3 = res3["sims_stat"].pivot(index="bins", columns="simulations", values="stats").to_numpy() + + # same seed -> identical simulations + np.testing.assert_array_equal(sims1, sims2) + # different seed -> different simulations + assert not np.array_equal(sims1, sims3) + # regression: simulations must not all be identical to each other + # (previously every simulation reused the same seed and produced the same point pattern) + assert not np.allclose(sims1, sims1[:, [0]])