1 change: 1 addition & 0 deletions docs/release-notes/3807.feat.md
@@ -0,0 +1 @@
Introduces a `binary` method for `sc.pp.neighbors` that fully skips `umap` weighting of connectivities and thus returns `k` connectivities for the `k` nearest neighbors. It further includes a fix for the case in which custom neighbor transformers return fewer than `k` neighbors for some nodes (e.g., SNN and similar "adaptive" transformers). When `knn=True` and `method='umap'`, a warning is emitted and specifies future behavior. The behavior of `n_neighbors`, which differs depending on the nearest-neighbor finder used internally, is now documented.
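
A minimal usage sketch of the new option (assuming a scanpy build that includes this change; the example dataset and the final check are illustrative, not part of the PR):

import numpy as np
import scanpy as sc

adata = sc.datasets.pbmc68k_reduced()
sc.pp.neighbors(adata, n_neighbors=15, knn=True, method="binary")

# With method="binary", every kept edge is expected to carry weight 1
# instead of a umap-derived weight in (0, 1].
conn = adata.obsp["connectivities"]
assert np.allclose(conn.data, 1.0)
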
36 changes: 32 additions & 4 deletions src/scanpy/neighbors/__init__.py
@@ -113,9 +113,23 @@ def neighbors( # noqa: PLR0913
`n_neighbors`, that is, consider a knn graph. Otherwise, use a Gaussian
Kernel to assign low weights to neighbors more distant than the
`n_neighbors` nearest neighbor.

.. note::
The `knn` parameter affects all methods, but in different ways:

- 'gauss': Directly controls whether to use hard thresholding vs soft weighting
- 'umap': Affects the initial k-NN preprocessing, but UMAP still creates
varying connectivity strengths via fuzzy simplicial sets
- 'binary': Creates uniform (0/1) connectivity weights when `knn=True`

Use `method='binary'` for strictly uniform connectivity weights.
method
Use 'umap' :cite:p:`McInnes2018` or 'gauss' (Gauss kernel following :cite:t:`Coifman2005`
with adaptive width :cite:t:`Haghverdi2016`) for computing connectivities.
Use 'umap' :cite:p:`McInnes2018`, 'gauss' (Gauss kernel following :cite:t:`Coifman2005`
with adaptive width :cite:t:`Haghverdi2016`), or 'binary' for computing connectivities.

- 'umap': Creates connectivities with varying strengths based on fuzzy simplicial sets
- 'gauss': Uses Gaussian kernels with adaptive widths and respects the `knn` parameter
- 'binary': Creates binary (0/1) connectivities based on the k-nearest-neighbor graph
transformer
Approximate kNN search implementation following the API of
:class:`~sklearn.neighbors.KNeighborsTransformer`.
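
To make the note on `knn` and the three methods concrete, here is a rough comparison sketch (the dataset and printout are illustrative only); only 'binary' is expected to produce a single uniform weight:

import numpy as np
import scanpy as sc

adata = sc.datasets.pbmc68k_reduced()

for method in ("umap", "gauss", "binary"):
    sc.pp.neighbors(adata, n_neighbors=10, knn=True, method=method)
    weights = np.unique(adata.obsp["connectivities"].data)
    # 'umap' and 'gauss' typically yield many distinct weights in (0, 1];
    # 'binary' should yield exactly one value, 1.0.
    print(method, weights.size)
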
@@ -541,6 +555,7 @@ def compute_neighbors( # noqa: PLR0912
method
See :func:`scanpy.pp.neighbors`.
If `None`, skip calculating connectivities.
Use 'binary' for uniform connectivity weights when `knn=True`.

Returns
-------
@@ -570,6 +585,15 @@ def compute_neighbors( # noqa: PLR0912

if self._adata.shape[0] >= 10000 and not knn:
logg.warning("Using high n_obs without `knn=True` takes a lot of memory...")

# Warn when using knn=True with umap method
if knn and method == "umap":
logg.warning(
"Using `knn=True` with `method='umap'` affects k-NN preprocessing "
"but UMAP will still create connectivities with varying strengths. "
"For uniform connectivity weights, use `method='binary'` instead."
)

# do not use the cached rp_forest
self._rp_forest = None
self.n_neighbors = n_neighbors
@@ -598,7 +622,11 @@ def compute_neighbors( # noqa: PLR0912
self._rp_forest = _make_forest_dict(index)
start_connect = logg.debug("computed neighbors", time=start_neighbors)

if method == "umap":
if method == "binary":
self._connectivities = _get_sparse_matrix_from_indices_distances(
knn_indices, np.ones_like(knn_distances), keep_self=False
)
elif method == "umap":
self._connectivities = _connectivity.umap(
knn_indices,
knn_distances,
@@ -664,7 +692,7 @@ def _handle_transformer(
raise ValueError(msg)

# Validate `knn`
conn_method = method if method in {"gauss", None} else "umap"
conn_method = method if method in {"gauss", "binary", None} else "umap"
if not knn and not (conn_method == "gauss" and transformer is None):
# “knn=False” seems to be only intended for method “gauss”
msg = f"`method = {method!r} only with `knn = True`."
51 changes: 35 additions & 16 deletions src/scanpy/neighbors/_common.py
@@ -15,7 +15,6 @@

def _has_self_column(
indices: NDArray[np.int32 | np.int64],
distances: NDArray[np.float32 | np.float64],
) -> bool:
# some algorithms have some messed up reordering.
return (indices[:, 0] == np.arange(indices.shape[0])).any()
@@ -25,7 +24,7 @@ def _remove_self_column(
indices: NDArray[np.int32 | np.int64],
distances: NDArray[np.float32 | np.float64],
) -> tuple[NDArray[np.int32 | np.int64], NDArray[np.float32 | np.float64]]:
if not _has_self_column(indices, distances):
if not _has_self_column(indices):
msg = "The first neighbor should be the cell itself."
raise AssertionError(msg)
return indices[:, 1:], distances[:, 1:]
@@ -85,7 +84,7 @@ def _get_indices_distances_from_sparse_matrix(
indices, distances = _ind_dist_slow(d, n_neighbors)

# handle RAPIDS style indices_distances lacking the self-column
if not _has_self_column(indices, distances):
if not _has_self_column(indices):
indices = np.hstack([np.arange(indices.shape[0])[:, None], indices])
distances = np.hstack([np.zeros(distances.shape[0])[:, None], distances])

@@ -100,25 +99,45 @@
def _ind_dist_slow(
d: CSRBase, /, n_neighbors: int
) -> tuple[NDArray[np.int32 | np.int64], NDArray[np.float32 | np.float64]]:
indices = np.zeros((d.shape[0], n_neighbors), dtype=int)
distances = np.zeros((d.shape[0], n_neighbors), dtype=d.dtype)
d = d.tocsr()
n_obs = d.shape[0]
indices = np.zeros((n_obs, n_neighbors), dtype=int)
indices[:, 0] = np.arange(n_obs) # set self-indices
distances = np.zeros((n_obs, n_neighbors), dtype=d.dtype)
n_neighbors_m1 = n_neighbors - 1
for i in range(indices.shape[0]):
neighbors = d[i].nonzero() # 'true' and 'spurious' zeros
indices[i, 0] = i
distances[i, 0] = 0

for i in range(n_obs):
row = d.getrow(i)
row_indices = row.indices
row_data = row.data

if len(row_indices) == 0:
continue

# self-indices are preset, so we filter them out here
mask = row_indices != i
non_self_indices = row_indices[mask]
non_self_data = row_data[mask]

# account for the fact that there might be more than n_neighbors
# due to an approximate search
# [the point itself was not detected as its own neighbor during the search]
if len(neighbors[1]) > n_neighbors_m1:
sorted_indices = np.argsort(d[i][neighbors].A1)[:n_neighbors_m1]
indices[i, 1:] = neighbors[1][sorted_indices]
distances[i, 1:] = d[i][
neighbors[0][sorted_indices], neighbors[1][sorted_indices]
if len(non_self_indices) > n_neighbors_m1:
# partial sorting to get the n_neighbors - 1 smallest distances
partition_indices = np.argpartition(non_self_data, n_neighbors_m1 - 1)[
:n_neighbors_m1
]
sorted_partition = partition_indices[
np.argsort(non_self_data[partition_indices])
]
indices[i, 1:] = non_self_indices[sorted_partition]
distances[i, 1:] = non_self_data[sorted_partition]
else:
indices[i, 1:] = neighbors[1]
distances[i, 1:] = d[i][neighbors]
n_actual = len(non_self_indices)
if n_actual > 0:
indices[i, 1 : 1 + n_actual] = non_self_indices
distances[i, 1 : 1 + n_actual] = non_self_data

return indices, distances
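
The rewritten loop above selects the smallest distances per row with `np.argpartition` followed by `np.argsort` on the selected subset, avoiding a full sort of each row. A standalone sketch of that pattern with toy values (not taken from the scanpy sources):

import numpy as np

dists = np.array([0.9, 0.2, 0.7, 0.1, 0.5])
k = 3

# argpartition places the k smallest values in the first k slots (unordered, O(n)),
# then argsort orders only that small subset.
part = np.argpartition(dists, k - 1)[:k]
order = part[np.argsort(dists[part])]

print(order)         # [3 1 4]
print(dists[order])  # [0.1 0.2 0.5]
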


2 changes: 1 addition & 1 deletion src/scanpy/neighbors/_types.py
@@ -12,7 +12,7 @@


# These two are used with get_literal_vals elsewhere
_Method = Literal["umap", "gauss"]
_Method = Literal["umap", "gauss", "binary"]
_KnownTransformer = Literal["pynndescent", "sklearn", "rapids"]

# sphinx-autodoc-typehints can’t transitively import types from if TYPE_CHECKING blocks,
4 changes: 2 additions & 2 deletions tests/test_neighbors.py
@@ -132,9 +132,9 @@ def neigh() -> Neighbors:
return get_neighbors()


@pytest.mark.parametrize("method", ["umap", "gauss"])
@pytest.mark.parametrize("method", ["umap", "gauss", "binary"])
def test_distances_euclidean(
mocker: MockerFixture, neigh: Neighbors, method: Literal["umap", "gauss"]
mocker: MockerFixture, neigh: Neighbors, method: Literal["umap", "gauss", "binary"]
):
"""Umap and gauss behave the same for distances.

123 changes: 120 additions & 3 deletions tests/test_neighbors_common.py
@@ -1,13 +1,16 @@
from __future__ import annotations

import warnings
from typing import TYPE_CHECKING

import numpy as np
import pytest
from fast_array_utils.stats import is_constant
from scipy import sparse
from sklearn.neighbors import KNeighborsTransformer

from scanpy.neighbors._common import (
_get_indices_distances_from_sparse_matrix,
_get_sparse_matrix_from_indices_distances,
_has_self_column,
_ind_dist_shortcut,
@@ -17,8 +20,6 @@
from collections.abc import Callable
from typing import Literal

from scipy import sparse

from scanpy._compat import CSRBase


@@ -43,7 +44,7 @@ def mk_knn_matrix(
mat = _get_sparse_matrix_from_indices_distances(idxs, dists, keep_self=True)

# check if our helper here works as expected
assert _has_self_column(idxs, dists) == (style != "rapids")
assert _has_self_column(idxs) == (style != "rapids")
if duplicates:
# Make sure the actual matrix has a regular sparsity pattern
assert is_constant(mat.getnnz(axis=1))
@@ -98,3 +99,119 @@ def test_ind_dist_shortcut_premade(

assert (mat.nnz / n_obs) == n_neighbors + 1
assert _ind_dist_shortcut(mat) is not None


def mk_variable_knn_matrix_self_inclusive(n_obs: int) -> CSRBase:
"""Create matrix with variable neighbors per row, including self with distance 0."""
indices_list = [[0, 1], [1, 0, 2], [2, 1], [3, 0, 1]]
distances_list = [[0.0, 0.5], [0.0, 0.3, 0.4], [0.0, 0.7], [0.0, 0.8, 0.6]]

row_indices = []
col_indices = []
data = []

if n_obs > len(indices_list):
msg = (
f"n_obs={n_obs} is too large for the predefined variable neighbor structure "
f"with {len(indices_list)} rows."
)
raise ValueError(msg)

for i in range(n_obs):
for idx, dist in zip(indices_list[i], distances_list[i], strict=True):
row_indices.append(i)
col_indices.append(idx)
data.append(dist)

mat = sparse.csr_matrix( # noqa: TID251
(np.array(data), (np.array(row_indices), np.array(col_indices))),
shape=(n_obs, n_obs),
)
return mat


def mk_variable_knn_matrix_self_exclusive(n_obs: int) -> CSRBase:
"""Create matrix with variable neighbors per row, excluding self (no distance 0)."""
indices_list = [[1], [0, 2], [1], [0, 1]]
distances_list = [[0.5], [0.3, 0.4], [0.7], [0.8, 0.6]]

row_indices = []
col_indices = []
data = []

if n_obs > len(indices_list):
msg = (
f"n_obs={n_obs} is too large for the predefined variable neighbor structure "
f"with {len(indices_list)} rows."
)
raise ValueError(msg)

for i in range(n_obs):
for idx, dist in zip(indices_list[i], distances_list[i], strict=True):
row_indices.append(i)
col_indices.append(idx)
data.append(dist)

mat = sparse.csr_matrix( # noqa: TID251
(np.array(data), (np.array(row_indices), np.array(col_indices))),
shape=(n_obs, n_obs),
)
return mat


@pytest.mark.parametrize(
("matrix_func", "self_inclusive"),
[
pytest.param(mk_variable_knn_matrix_self_inclusive, True, id="self_inclusive"),
# pytest.param(mk_variable_knn_matrix_self_exclusive, False, id="self_exclusive"),
],
)
def test_variable_neighbors_uses_slow_path(matrix_func, self_inclusive):
"""Test variable neighbor counts trigger slow path with warning.

This test mimics `sc.pp.neighbors`, which always validates the neighbors returned from
the nearest-neighbor search with `_get_indices_distances_from_sparse_matrix` and then
converts them back to a sparse matrix.

Here we use `_get_sparse_matrix_from_indices_distances`, as is used by `method='binary'`
in `sc.pp.neighbors`.
"""
n_obs = 4
n_neighbors = 3 # int(self_inclusive) + 2

mat = matrix_func(n_obs)

with warnings.catch_warnings(record=True) as w:
warnings.simplefilter("always")
indices, distances = _get_indices_distances_from_sparse_matrix(
mat,
n_neighbors=n_neighbors,
)
assert len(w) == 1
assert "no constant number of neighbors" in str(w[0].message)

assert indices.shape == (n_obs, n_neighbors)

# first column should always be self (distance 0)
assert np.array_equal(indices[:, 0], np.arange(n_obs))
assert np.allclose(distances[:, 0], 0.0)

connectivities = _get_sparse_matrix_from_indices_distances(
indices,
distances,
keep_self=False,
)
assert connectivities.shape == (n_obs, n_obs)

# TODO: This is a hack, as we prefill distances with zeros
# `_get_sparse_matrix_from_indices_distances` does not eliminate zeros itself
# but rather searches for the self column and removes it if requested
# We also cannot rely on `_get_sparse_matrix_from_indices_distances` being
# called, as it is specific to `method='binary'` in `sc.pp.neighbors`.
connectivities.eliminate_zeros()

# check different rows have different numbers of connections
nnz_per_row = np.diff(connectivities.indptr)
assert not is_constant(nnz_per_row), (
f"Expected variable connectivity, got {nnz_per_row}"
)