diff --git a/docs/release-notes/3807.feat.md b/docs/release-notes/3807.feat.md new file mode 100644 index 0000000000..2880e57f2b --- /dev/null +++ b/docs/release-notes/3807.feat.md @@ -0,0 +1 @@ +Introduces a `binary` method for `sc.pp.neighbors` that fully skips `umap` weighting of connectivities and thus returns `k` connectivities for `k` nearest-neighbors. It further includes a fix for the case in which custom neighbor transformers return less than k neighbors for some nodes (e.g., SNN and similar "adaptive" transformers). When `knn=True` and `method='umap'`, a warning is emitted snd specifies future behavior. The behavior of `n_neighbors` which differs depending on the nearest neighbor finder that is used internally is documented. diff --git a/src/scanpy/neighbors/__init__.py b/src/scanpy/neighbors/__init__.py index 02c5b2767b..638ad23013 100644 --- a/src/scanpy/neighbors/__init__.py +++ b/src/scanpy/neighbors/__init__.py @@ -113,9 +113,23 @@ def neighbors( # noqa: PLR0913 `n_neighbors`, that is, consider a knn graph. Otherwise, use a Gaussian Kernel to assign low weights to neighbors more distant than the `n_neighbors` nearest neighbor. + + .. note:: + The `knn` parameter affects all methods but differently: + + - 'gauss': Directly controls whether to use hard thresholding vs soft weighting + - 'umap': Affects the initial k-NN preprocessing, but UMAP still creates + varying connectivity strengths via fuzzy simplicial sets + - 'binary': Creates uniform (0/1) connectivity weights when `knn=True` + + Use `method='binary'` for strictly uniform connectivity weights. method - Use 'umap' :cite:p:`McInnes2018` or 'gauss' (Gauss kernel following :cite:t:`Coifman2005` - with adaptive width :cite:t:`Haghverdi2016`) for computing connectivities. + Use 'umap' :cite:p:`McInnes2018`, 'gauss' (Gauss kernel following :cite:t:`Coifman2005` + with adaptive width :cite:t:`Haghverdi2016`), or 'binary' for computing connectivities. + + - 'umap': Creates connectivities with varying strengths based on fuzzy simplicial sets + - 'gauss': Uses Gaussian kernels with adaptive widths, respects `knn` parameter + - 'binary': Creates binary (0/1) connectivities based on k-nearest neighbor graph transformer Approximate kNN search implementation following the API of :class:`~sklearn.neighbors.KNeighborsTransformer`. @@ -541,6 +555,7 @@ def compute_neighbors( # noqa: PLR0912 method See :func:`scanpy.pp.neighbors`. If `None`, skip calculating connectivities. + Use 'binary' for uniform connectivity weights when `knn=True`. Returns ------- @@ -570,6 +585,15 @@ def compute_neighbors( # noqa: PLR0912 if self._adata.shape[0] >= 10000 and not knn: logg.warning("Using high n_obs without `knn=True` takes a lot of memory...") + + # Warn when using knn=True with umap method + if knn and method == "umap": + logg.warning( + "Using `knn=True` with `method='umap'` affects k-NN preprocessing " + "but UMAP will still create connectivities with varying strengths. " + "For uniform connectivity weights, use `method='binary'` instead." + ) + # do not use the cached rp_forest self._rp_forest = None self.n_neighbors = n_neighbors @@ -598,7 +622,11 @@ def compute_neighbors( # noqa: PLR0912 self._rp_forest = _make_forest_dict(index) start_connect = logg.debug("computed neighbors", time=start_neighbors) - if method == "umap": + if method == "binary": + self._connectivities = _get_sparse_matrix_from_indices_distances( + knn_indices, np.ones_like(knn_distances), keep_self=False + ) + elif method == "umap": self._connectivities = _connectivity.umap( knn_indices, knn_distances, @@ -664,7 +692,7 @@ def _handle_transformer( raise ValueError(msg) # Validate `knn` - conn_method = method if method in {"gauss", None} else "umap" + conn_method = method if method in {"gauss", "binary", None} else "umap" if not knn and not (conn_method == "gauss" and transformer is None): # “knn=False” seems to be only intended for method “gauss” msg = f"`method = {method!r} only with `knn = True`." diff --git a/src/scanpy/neighbors/_common.py b/src/scanpy/neighbors/_common.py index 7112c69d02..8f80914080 100644 --- a/src/scanpy/neighbors/_common.py +++ b/src/scanpy/neighbors/_common.py @@ -15,7 +15,6 @@ def _has_self_column( indices: NDArray[np.int32 | np.int64], - distances: NDArray[np.float32 | np.float64], ) -> bool: # some algorithms have some messed up reordering. return (indices[:, 0] == np.arange(indices.shape[0])).any() @@ -25,7 +24,7 @@ def _remove_self_column( indices: NDArray[np.int32 | np.int64], distances: NDArray[np.float32 | np.float64], ) -> tuple[NDArray[np.int32 | np.int64], NDArray[np.float32 | np.float64]]: - if not _has_self_column(indices, distances): + if not _has_self_column(indices): msg = "The first neighbor should be the cell itself." raise AssertionError(msg) return indices[:, 1:], distances[:, 1:] @@ -85,7 +84,7 @@ def _get_indices_distances_from_sparse_matrix( indices, distances = _ind_dist_slow(d, n_neighbors) # handle RAPIDS style indices_distances lacking the self-column - if not _has_self_column(indices, distances): + if not _has_self_column(indices): indices = np.hstack([np.arange(indices.shape[0])[:, None], indices]) distances = np.hstack([np.zeros(distances.shape[0])[:, None], distances]) @@ -100,25 +99,45 @@ def _get_indices_distances_from_sparse_matrix( def _ind_dist_slow( d: CSRBase, /, n_neighbors: int ) -> tuple[NDArray[np.int32 | np.int64], NDArray[np.float32 | np.float64]]: - indices = np.zeros((d.shape[0], n_neighbors), dtype=int) - distances = np.zeros((d.shape[0], n_neighbors), dtype=d.dtype) + d = d.tocsr() + n_obs = d.shape[0] + indices = np.zeros((n_obs, n_neighbors), dtype=int) + indices[:, 0] = np.arange(n_obs) # set self-indices + distances = np.zeros((n_obs, n_neighbors), dtype=d.dtype) n_neighbors_m1 = n_neighbors - 1 - for i in range(indices.shape[0]): - neighbors = d[i].nonzero() # 'true' and 'spurious' zeros - indices[i, 0] = i - distances[i, 0] = 0 + + for i in range(n_obs): + row = d.getrow(i) + row_indices = row.indices + row_data = row.data + + if len(row_indices) == 0: + continue + + # self-indices are preset, so we filter them out here + mask = row_indices != i + non_self_indices = row_indices[mask] + non_self_data = row_data[mask] + # account for the fact that there might be more than n_neighbors # due to an approximate search # [the point itself was not detected as its own neighbor during the search] - if len(neighbors[1]) > n_neighbors_m1: - sorted_indices = np.argsort(d[i][neighbors].A1)[:n_neighbors_m1] - indices[i, 1:] = neighbors[1][sorted_indices] - distances[i, 1:] = d[i][ - neighbors[0][sorted_indices], neighbors[1][sorted_indices] + if len(non_self_indices) > n_neighbors_m1: + # partial sorting to get the n_neighbors - 1 smallest distances + partition_indices = np.argpartition(non_self_data, n_neighbors_m1 - 1)[ + :n_neighbors_m1 + ] + sorted_partition = partition_indices[ + np.argsort(non_self_data[partition_indices]) ] + indices[i, 1:] = non_self_indices[sorted_partition] + distances[i, 1:] = non_self_data[sorted_partition] else: - indices[i, 1:] = neighbors[1] - distances[i, 1:] = d[i][neighbors] + n_actual = len(non_self_indices) + if n_actual > 0: + indices[i, 1 : 1 + n_actual] = non_self_indices + distances[i, 1 : 1 + n_actual] = non_self_data + return indices, distances diff --git a/src/scanpy/neighbors/_types.py b/src/scanpy/neighbors/_types.py index 7c52ae56e6..050e041559 100644 --- a/src/scanpy/neighbors/_types.py +++ b/src/scanpy/neighbors/_types.py @@ -12,7 +12,7 @@ # These two are used with get_literal_vals elsewhere -_Method = Literal["umap", "gauss"] +_Method = Literal["umap", "gauss", "binary"] _KnownTransformer = Literal["pynndescent", "sklearn", "rapids"] # sphinx-autodoc-typehints can’t transitively import types from if TYPE_CHECKING blocks, diff --git a/tests/test_neighbors.py b/tests/test_neighbors.py index 05fff75196..6c29f54c55 100644 --- a/tests/test_neighbors.py +++ b/tests/test_neighbors.py @@ -132,9 +132,9 @@ def neigh() -> Neighbors: return get_neighbors() -@pytest.mark.parametrize("method", ["umap", "gauss"]) +@pytest.mark.parametrize("method", ["umap", "gauss", "binary"]) def test_distances_euclidean( - mocker: MockerFixture, neigh: Neighbors, method: Literal["umap", "gauss"] + mocker: MockerFixture, neigh: Neighbors, method: Literal["umap", "gauss", "binary"] ): """Umap and gauss behave the same for distances. diff --git a/tests/test_neighbors_common.py b/tests/test_neighbors_common.py index 2eeee1f74c..8689fd268f 100644 --- a/tests/test_neighbors_common.py +++ b/tests/test_neighbors_common.py @@ -1,13 +1,16 @@ from __future__ import annotations +import warnings from typing import TYPE_CHECKING import numpy as np import pytest from fast_array_utils.stats import is_constant +from scipy import sparse from sklearn.neighbors import KNeighborsTransformer from scanpy.neighbors._common import ( + _get_indices_distances_from_sparse_matrix, _get_sparse_matrix_from_indices_distances, _has_self_column, _ind_dist_shortcut, @@ -17,8 +20,6 @@ from collections.abc import Callable from typing import Literal - from scipy import sparse - from scanpy._compat import CSRBase @@ -43,7 +44,7 @@ def mk_knn_matrix( mat = _get_sparse_matrix_from_indices_distances(idxs, dists, keep_self=True) # check if out helper here works as expected - assert _has_self_column(idxs, dists) == (style != "rapids") + assert _has_self_column(idxs) == (style != "rapids") if duplicates: # Make sure the actual matrix has a regular sparsity pattern assert is_constant(mat.getnnz(axis=1)) @@ -98,3 +99,119 @@ def test_ind_dist_shortcut_premade( assert (mat.nnz / n_obs) == n_neighbors + 1 assert _ind_dist_shortcut(mat) is not None + + +def mk_variable_knn_matrix_self_inclusive(n_obs: int) -> CSRBase: + """Create matrix with variable neighbors per row, including self with distance 0.""" + indices_list = [[0, 1], [1, 0, 2], [2, 1], [3, 0, 1]] + distances_list = [[0.0, 0.5], [0.0, 0.3, 0.4], [0.0, 0.7], [0.0, 0.8, 0.6]] + + row_indices = [] + col_indices = [] + data = [] + + if n_obs > len(indices_list): + msg = ( + f"n_obs={n_obs} is too large for the predefined variable neighbor structure " + f"with {len(indices_list)} rows." + ) + raise ValueError(msg) + + for i in range(n_obs): + for idx, dist in zip(indices_list[i], distances_list[i], strict=True): + row_indices.append(i) + col_indices.append(idx) + data.append(dist) + + mat = sparse.csr_matrix( # noqa: TID251 + (np.array(data), (np.array(row_indices), np.array(col_indices))), + shape=(n_obs, n_obs), + ) + return mat + + +def mk_variable_knn_matrix_self_exclusive(n_obs: int) -> CSRBase: + """Create matrix with variable neighbors per row, excluding self (no distance 0).""" + indices_list = [[1], [0, 2], [1], [0, 1]] + distances_list = [[0.5], [0.3, 0.4], [0.7], [0.8, 0.6]] + + row_indices = [] + col_indices = [] + data = [] + + if n_obs > len(indices_list): + msg = ( + f"n_obs={n_obs} is too large for the predefined variable neighbor structure " + f"with {len(indices_list)} rows." + ) + raise ValueError(msg) + + for i in range(n_obs): + for idx, dist in zip(indices_list[i], distances_list[i], strict=True): + row_indices.append(i) + col_indices.append(idx) + data.append(dist) + + mat = sparse.csr_matrix( # noqa: TID251 + (np.array(data), (np.array(row_indices), np.array(col_indices))), + shape=(n_obs, n_obs), + ) + return mat + + +@pytest.mark.parametrize( + ("matrix_func", "self_inclusive"), + [ + pytest.param(mk_variable_knn_matrix_self_inclusive, True, id="self_inclusive"), + # pytest.param(mk_variable_knn_matrix_self_exclusive, False, id="self_exclusive"), + ], +) +def test_variable_neighbors_uses_slow_path(matrix_func, self_inclusive): + """Test variable neighbor counts trigger slow path with warning. + + This test mocks `sc.pp.neighbors`, which always validates the neighbors returned from + the nearest neighbor search with `_get_indices_distances_from_sparse_matrix` and then + converts them back to a sparse. + + Here we use `_get_sparse_matrix_from_indices_distances`, as is used by `method='binary'` + in `sc.pp.neighbors`. + """ + n_obs = 4 + n_neighbors = 3 # int(self_inclusive) + 2 + + mat = matrix_func(n_obs) + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + indices, distances = _get_indices_distances_from_sparse_matrix( + mat, + n_neighbors=n_neighbors, + ) + assert len(w) == 1 + assert "no constant number of neighbors" in str(w[0].message) + + assert indices.shape == (n_obs, n_neighbors) + + # first column should always be self (distance 0) + assert np.array_equal(indices[:, 0], np.arange(n_obs)) + assert np.allclose(distances[:, 0], 0.0) + + connectivities = _get_sparse_matrix_from_indices_distances( + indices, + distances, + keep_self=False, + ) + assert connectivities.shape == (n_obs, n_obs) + + # TODO: This is a hack, as we prefill distances with zeros + # `_get_sparse_matrix_from_indices_distances` does not eliminate zeros itself + # but rather searches for the self column and removes it if requested + # We can also not rely on `_get_sparse_matrix_from_indices_distances` being + # called, as it is specific to `method='binary'` in `sc.pp.neighbors`. + connectivities.eliminate_zeros() + + # check different rows have different numbers of connections + nnz_per_row = np.diff(connectivities.indptr) + assert not is_constant(nnz_per_row), ( + f"Expected variable connectivity, got {nnz_per_row}" + )