1 change: 1 addition & 0 deletions docs/release-notes/3807.feat.md
@@ -0,0 +1 @@
Introduces a `binary` method for `sc.pp.neighbors` that fully skips `umap` weighting of connectivities and thus returns `k` connectivities for the `k` nearest neighbors. It further includes a fix for the case in which custom neighbor transformers return fewer than `k` neighbors for some nodes (e.g., SNN and similar "adaptive" transformers). When `knn=True` and `method='umap'`, a warning is emitted and specifies future behavior. The behavior of `n_neighbors`, which differs depending on the nearest-neighbor finder used internally, is now documented.
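
A minimal usage sketch of the new option (assuming a scanpy build that includes this change; the example dataset and the final check are illustrative, not part of the PR):

import numpy as np
import scanpy as sc

adata = sc.datasets.pbmc68k_reduced()
sc.pp.neighbors(adata, n_neighbors=15, knn=True, method="binary")

# With method="binary", every kept edge is expected to carry weight 1
# instead of a umap-derived weight in (0, 1].
conn = adata.obsp["connectivities"]
assert np.allclose(conn.data, 1.0)
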
36 changes: 32 additions & 4 deletions src/scanpy/neighbors/__init__.py
@@ -113,9 +113,23 @@ def neighbors( # noqa: PLR0913
`n_neighbors`, that is, consider a knn graph. Otherwise, use a Gaussian
Kernel to assign low weights to neighbors more distant than the
`n_neighbors` nearest neighbor.

.. note::
The `knn` parameter affects all methods, but in different ways:

- 'gauss': Directly controls whether to use hard thresholding vs soft weighting
- 'umap': Affects the initial k-NN preprocessing, but UMAP still creates
varying connectivity strengths via fuzzy simplicial sets
- 'binary': Creates uniform (0/1) connectivity weights when `knn=True`

Use `method='binary'` for strictly uniform connectivity weights.
method
Use 'umap' :cite:p:`McInnes2018` or 'gauss' (Gauss kernel following :cite:t:`Coifman2005`
with adaptive width :cite:t:`Haghverdi2016`) for computing connectivities.
Use 'umap' :cite:p:`McInnes2018`, 'gauss' (Gauss kernel following :cite:t:`Coifman2005`
with adaptive width :cite:t:`Haghverdi2016`), or 'binary' for computing connectivities.

- 'umap': Creates connectivities with varying strengths based on fuzzy simplicial sets
- 'gauss': Uses Gaussian kernels with adaptive widths and respects the `knn` parameter
- 'binary': Creates binary (0/1) connectivities based on the k-nearest-neighbor graph
transformer
Approximate kNN search implementation following the API of
:class:`~sklearn.neighbors.KNeighborsTransformer`.
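
To make the note on `knn` and the three methods concrete, here is a rough comparison sketch (the dataset and printout are illustrative only); only 'binary' is expected to produce a single uniform weight:

import numpy as np
import scanpy as sc

adata = sc.datasets.pbmc68k_reduced()

for method in ("umap", "gauss", "binary"):
    sc.pp.neighbors(adata, n_neighbors=10, knn=True, method=method)
    weights = np.unique(adata.obsp["connectivities"].data)
    # 'umap' and 'gauss' typically yield many distinct weights in (0, 1];
    # 'binary' should yield exactly one value, 1.0.
    print(method, weights.size)
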
@@ -541,6 +555,7 @@ def compute_neighbors( # noqa: PLR0912
method
See :func:`scanpy.pp.neighbors`.
If `None`, skip calculating connectivities.
Use 'binary' for uniform connectivity weights when `knn=True`.

Returns
-------
@@ -570,6 +585,15 @@ def compute_neighbors( # noqa: PLR0912

if self._adata.shape[0] >= 10000 and not knn:
logg.warning("Using high n_obs without `knn=True` takes a lot of memory...")

# Warn when using knn=True with umap method
if knn and method == "umap":
logg.warning(
"Using `knn=True` with `method='umap'` affects k-NN preprocessing "
"but UMAP will still create connectivities with varying strengths. "
"For uniform connectivity weights, use `method='binary'` instead."
)

# do not use the cached rp_forest
self._rp_forest = None
self.n_neighbors = n_neighbors
@@ -598,7 +622,11 @@ def compute_neighbors( # noqa: PLR0912
self._rp_forest = _make_forest_dict(index)
start_connect = logg.debug("computed neighbors", time=start_neighbors)

if method == "umap":
if method == "binary":
self._connectivities = _get_sparse_matrix_from_indices_distances(
knn_indices, np.ones_like(knn_distances), keep_self=False
)
elif method == "umap":
self._connectivities = _connectivity.umap(
knn_indices,
knn_distances,
@@ -664,7 +692,7 @@ def _handle_transformer(
raise ValueError(msg)

# Validate `knn`
conn_method = method if method in {"gauss", None} else "umap"
conn_method = method if method in {"gauss", "binary", None} else "umap"
if not knn and not (conn_method == "gauss" and transformer is None):
# “knn=False” seems to be only intended for method “gauss”
msg = f"`method = {method!r} only with `knn = True`."
51 changes: 35 additions & 16 deletions src/scanpy/neighbors/_common.py
@@ -15,7 +15,6 @@

def _has_self_column(
indices: NDArray[np.int32 | np.int64],
distances: NDArray[np.float32 | np.float64],
) -> bool:
# some algorithms have some messed up reordering.
return (indices[:, 0] == np.arange(indices.shape[0])).any()
@@ -25,7 +24,7 @@ def _remove_self_column(
indices: NDArray[np.int32 | np.int64],
distances: NDArray[np.float32 | np.float64],
) -> tuple[NDArray[np.int32 | np.int64], NDArray[np.float32 | np.float64]]:
if not _has_self_column(indices, distances):
if not _has_self_column(indices):
msg = "The first neighbor should be the cell itself."
raise AssertionError(msg)
return indices[:, 1:], distances[:, 1:]
@@ -85,7 +84,7 @@ def _get_indices_distances_from_sparse_matrix(
indices, distances = _ind_dist_slow(d, n_neighbors)

# handle RAPIDS style indices_distances lacking the self-column
if not _has_self_column(indices, distances):
if not _has_self_column(indices):
indices = np.hstack([np.arange(indices.shape[0])[:, None], indices])
distances = np.hstack([np.zeros(distances.shape[0])[:, None], distances])

@@ -100,25 +99,45 @@
def _ind_dist_slow(
d: CSRBase, /, n_neighbors: int
) -> tuple[NDArray[np.int32 | np.int64], NDArray[np.float32 | np.float64]]:
indices = np.zeros((d.shape[0], n_neighbors), dtype=int)
distances = np.zeros((d.shape[0], n_neighbors), dtype=d.dtype)
d = d.tocsr()
n_obs = d.shape[0]
indices = np.zeros((n_obs, n_neighbors), dtype=int)
indices[:, 0] = np.arange(n_obs) # set self-indices
distances = np.zeros((n_obs, n_neighbors), dtype=d.dtype)
n_neighbors_m1 = n_neighbors - 1
for i in range(indices.shape[0]):
neighbors = d[i].nonzero() # 'true' and 'spurious' zeros
indices[i, 0] = i
distances[i, 0] = 0

for i in range(n_obs):
row = d.getrow(i)
row_indices = row.indices
row_data = row.data

if len(row_indices) == 0:
continue

# self-indices are preset, so we filter them out here
mask = row_indices != i
non_self_indices = row_indices[mask]
non_self_data = row_data[mask]

# account for the fact that there might be more than n_neighbors
# due to an approximate search
# [the point itself was not detected as its own neighbor during the search]
if len(neighbors[1]) > n_neighbors_m1:
sorted_indices = np.argsort(d[i][neighbors].A1)[:n_neighbors_m1]
indices[i, 1:] = neighbors[1][sorted_indices]
distances[i, 1:] = d[i][
neighbors[0][sorted_indices], neighbors[1][sorted_indices]
if len(non_self_indices) > n_neighbors_m1:
# partial sorting to get the n_neighbors - 1 smallest distances
partition_indices = np.argpartition(non_self_data, n_neighbors_m1 - 1)[
:n_neighbors_m1
]
sorted_partition = partition_indices[
np.argsort(non_self_data[partition_indices])
]
indices[i, 1:] = non_self_indices[sorted_partition]
distances[i, 1:] = non_self_data[sorted_partition]
else:
indices[i, 1:] = neighbors[1]
distances[i, 1:] = d[i][neighbors]
n_actual = len(non_self_indices)
if n_actual > 0:
indices[i, 1 : 1 + n_actual] = non_self_indices
distances[i, 1 : 1 + n_actual] = non_self_data

return indices, distances
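
The rewritten loop above selects the smallest distances per row with `np.argpartition` followed by `np.argsort` on the selected subset, avoiding a full sort of each row. A standalone sketch of that pattern with toy values (not taken from the scanpy sources):

import numpy as np

dists = np.array([0.9, 0.2, 0.7, 0.1, 0.5])
k = 3

# argpartition places the k smallest values in the first k slots (unordered, O(n)),
# then argsort orders only that small subset.
part = np.argpartition(dists, k - 1)[:k]
order = part[np.argsort(dists[part])]

print(order)         # [3 1 4]
print(dists[order])  # [0.1 0.2 0.5]
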


2 changes: 1 addition & 1 deletion src/scanpy/neighbors/_types.py
@@ -12,7 +12,7 @@


# These two are used with get_literal_vals elsewhere
_Method = Literal["umap", "gauss"]
_Method = Literal["umap", "gauss", "binary"]
_KnownTransformer = Literal["pynndescent", "sklearn", "rapids"]

# sphinx-autodoc-typehints can’t transitively import types from if TYPE_CHECKING blocks,
4 changes: 2 additions & 2 deletions tests/test_neighbors.py
@@ -132,9 +132,9 @@ def neigh() -> Neighbors:
return get_neighbors()


@pytest.mark.parametrize("method", ["umap", "gauss"])
@pytest.mark.parametrize("method", ["umap", "gauss", "binary"])
def test_distances_euclidean(
mocker: MockerFixture, neigh: Neighbors, method: Literal["umap", "gauss"]
mocker: MockerFixture, neigh: Neighbors, method: Literal["umap", "gauss", "binary"]
):
"""Umap and gauss behave the same for distances.

123 changes: 120 additions & 3 deletions tests/test_neighbors_common.py
@@ -1,13 +1,16 @@
from __future__ import annotations

import warnings
from typing import TYPE_CHECKING

import numpy as np
import pytest
from fast_array_utils.stats import is_constant
from scipy import sparse
from sklearn.neighbors import KNeighborsTransformer

from scanpy.neighbors._common import (
_get_indices_distances_from_sparse_matrix,
_get_sparse_matrix_from_indices_distances,
_has_self_column,
_ind_dist_shortcut,
@@ -17,8 +20,6 @@
from collections.abc import Callable
from typing import Literal

from scipy import sparse

from scanpy._compat import CSRBase


@@ -43,7 +44,7 @@ def mk_knn_matrix(
mat = _get_sparse_matrix_from_indices_distances(idxs, dists, keep_self=True)

# check if our helper here works as expected
assert _has_self_column(idxs, dists) == (style != "rapids")
assert _has_self_column(idxs) == (style != "rapids")
if duplicates:
# Make sure the actual matrix has a regular sparsity pattern
assert is_constant(mat.getnnz(axis=1))
@@ -98,3 +99,119 @@ def test_ind_dist_shortcut_premade(

assert (mat.nnz / n_obs) == n_neighbors + 1
assert _ind_dist_shortcut(mat) is not None


def mk_variable_knn_matrix_self_inclusive(n_obs: int) -> CSRBase:
"""Create matrix with variable neighbors per row, including self with distance 0."""
indices_list = [[0, 1], [1, 0, 2], [2, 1], [3, 0, 1]]
distances_list = [[0.0, 0.5], [0.0, 0.3, 0.4], [0.0, 0.7], [0.0, 0.8, 0.6]]

row_indices = []
col_indices = []
data = []

if n_obs > len(indices_list):
msg = (
f"n_obs={n_obs} is too large for the predefined variable neighbor structure "
f"with {len(indices_list)} rows."
)
raise ValueError(msg)

for i in range(n_obs):
for idx, dist in zip(indices_list[i], distances_list[i], strict=True):
row_indices.append(i)
col_indices.append(idx)
data.append(dist)

mat = sparse.csr_matrix( # noqa: TID251
(np.array(data), (np.array(row_indices), np.array(col_indices))),
shape=(n_obs, n_obs),
)
return mat


def mk_variable_knn_matrix_self_exclusive(n_obs: int) -> CSRBase:
"""Create matrix with variable neighbors per row, excluding self (no distance 0)."""
indices_list = [[1], [0, 2], [1], [0, 1]]
distances_list = [[0.5], [0.3, 0.4], [0.7], [0.8, 0.6]]

row_indices = []
col_indices = []
data = []

if n_obs > len(indices_list):
msg = (
f"n_obs={n_obs} is too large for the predefined variable neighbor structure "
f"with {len(indices_list)} rows."
)
raise ValueError(msg)

for i in range(n_obs):
for idx, dist in zip(indices_list[i], distances_list[i], strict=True):
row_indices.append(i)
col_indices.append(idx)
data.append(dist)

mat = sparse.csr_matrix( # noqa: TID251
(np.array(data), (np.array(row_indices), np.array(col_indices))),
shape=(n_obs, n_obs),
)
return mat


@pytest.mark.parametrize(
("matrix_func", "self_inclusive"),
[
pytest.param(mk_variable_knn_matrix_self_inclusive, True, id="self_inclusive"),
# pytest.param(mk_variable_knn_matrix_self_exclusive, False, id="self_exclusive"),
],
)
def test_variable_neighbors_uses_slow_path(matrix_func, self_inclusive):
"""Test variable neighbor counts trigger slow path with warning.

This test mimics `sc.pp.neighbors`, which always validates the neighbors returned from
the nearest-neighbor search with `_get_indices_distances_from_sparse_matrix` and then
converts them back to a sparse matrix.

Here we use `_get_sparse_matrix_from_indices_distances`, as is used by `method='binary'`
in `sc.pp.neighbors`.
"""
n_obs = 4
n_neighbors = 3 # int(self_inclusive) + 2

mat = matrix_func(n_obs)

with warnings.catch_warnings(record=True) as w:
warnings.simplefilter("always")
indices, distances = _get_indices_distances_from_sparse_matrix(
mat,
n_neighbors=n_neighbors,
)
assert len(w) == 1
assert "no constant number of neighbors" in str(w[0].message)

assert indices.shape == (n_obs, n_neighbors)

# first column should always be self (distance 0)
assert np.array_equal(indices[:, 0], np.arange(n_obs))
assert np.allclose(distances[:, 0], 0.0)

connectivities = _get_sparse_matrix_from_indices_distances(
indices,
distances,
keep_self=False,
)
assert connectivities.shape == (n_obs, n_obs)

# TODO: This is a hack, as we prefill distances with zeros
# `_get_sparse_matrix_from_indices_distances` does not eliminate zeros itself
# but rather searches for the self column and removes it if requested
# We also cannot rely on `_get_sparse_matrix_from_indices_distances` being
# called, as it is specific to `method='binary'` in `sc.pp.neighbors`.
connectivities.eliminate_zeros()

# check different rows have different numbers of connections
nnz_per_row = np.diff(connectivities.indptr)
assert not is_constant(nnz_per_row), (
f"Expected variable connectivity, got {nnz_per_row}"
)