diff --git a/docs/release-notes/3831.feat.md b/docs/release-notes/3831.feat.md new file mode 100644 index 0000000000..3cd594e306 --- /dev/null +++ b/docs/release-notes/3831.feat.md @@ -0,0 +1 @@ +Add `jaccard` as `method` for generating connectivities in `sc.pp.neighbors()` diff --git a/src/scanpy/neighbors/__init__.py b/src/scanpy/neighbors/__init__.py index 02c5b2767b..d896583b49 100644 --- a/src/scanpy/neighbors/__init__.py +++ b/src/scanpy/neighbors/__init__.py @@ -90,7 +90,8 @@ def neighbors( # noqa: PLR0913 which also provides a method for estimating connectivities of data points - the connectivity of the manifold (`method=='umap'`). If `method=='gauss'`, connectivities are computed according to :cite:t:`Coifman2005`, in the adaption of - :cite:t:`Haghverdi2016`. + :cite:t:`Haghverdi2016`. If `method=='jaccard'`, connectivities are computed as + in PhenoGraph (:cite:p:`Levine2015`). Parameters ---------- @@ -114,8 +115,9 @@ def neighbors( # noqa: PLR0913 Kernel to assign low weights to neighbors more distant than the `n_neighbors` nearest neighbor. method - Use 'umap' :cite:p:`McInnes2018` or 'gauss' (Gauss kernel following :cite:t:`Coifman2005` - with adaptive width :cite:t:`Haghverdi2016`) for computing connectivities. + Use 'umap' :cite:p:`McInnes2018`, 'gauss' (Gauss kernel following :cite:t:`Coifman2005` + with adaptive width :cite:t:`Haghverdi2016`), or 'jaccard' (Jaccard kernel as in + PhenoGraph, :cite:p:`Levine2015`) for computing connectivities. transformer Approximate kNN search implementation following the API of :class:`~sklearn.neighbors.KNeighborsTransformer`. @@ -609,6 +611,12 @@ def compute_neighbors( # noqa: PLR0912 self._connectivities = _connectivity.gauss( self._distances, self.n_neighbors, knn=self.knn ) + elif method == "jaccard": + self._connectivities = _connectivity.jaccard( + knn_indices, + n_obs=self._adata.shape[0], + n_neighbors=self.n_neighbors, + ) elif method is not None: msg = f"{method!r} should have been coerced in _handle_transform_args" raise AssertionError(msg) @@ -631,7 +639,7 @@ def _handle_transformer( ) -> tuple[_Method | None, KnnTransformerLike, bool]: """Return effective `method` and transformer. - `method` will be coerced to `'gauss'` or `'umap'`. + `method` will be coerced to `'gauss'`, `'umap'`, or `'jaccard'`. `transformer` is coerced from a str or instance to an instance class. If `transformer` is `None` and there are few data points, @@ -650,7 +658,7 @@ def _handle_transformer( transformer is None and (use_dense_distances or self._adata.n_obs < 4096) ) - # Coerce `method` to 'gauss' or 'umap' + # Coerce `method` to 'gauss', 'umap', or 'jaccard' if method == "rapids": if transformer is not None: msg = "Can’t specify both `method = 'rapids'` and `transformer`." @@ -664,7 +672,7 @@ def _handle_transformer( raise ValueError(msg) # Validate `knn` - conn_method = method if method in {"gauss", None} else "umap" + conn_method = method if method in {"gauss", "jaccard", None} else "umap" if not knn and not (conn_method == "gauss" and transformer is None): # “knn=False” seems to be only intended for method “gauss” msg = f"`method = {method!r} only with `knn = True`." diff --git a/src/scanpy/neighbors/_connectivity.py b/src/scanpy/neighbors/_connectivity.py index d616f9e8da..f37fdbb231 100644 --- a/src/scanpy/neighbors/_connectivity.py +++ b/src/scanpy/neighbors/_connectivity.py @@ -11,6 +11,7 @@ from ._common import ( _get_indices_distances_from_dense_matrix, _get_indices_distances_from_sparse_matrix, + _get_sparse_matrix_from_indices_distances, ) D = TypeVar("D", NDArray[np.float32], CSRBase) @@ -136,3 +137,52 @@ def umap( ) return connectivities.tocsr() + + +def jaccard( + knn_indices: NDArray[np.int32 | np.int64], + *, + n_obs: int, + n_neighbors: int, +) -> CSRBase: + """Derive Jaccard connectivities between data points from kNN indices. + + Re-implements the weighting method from Phenograph, :cite:p:`Levine2015`. + + Parameters + ---------- + knn_indices + The input matrix of nearest neighbor indices for each cell. + n_obs + Number of cells in the data-set. + n_neighbors + The number of nearest neighbors to consider. + + """ + # Construct unweighted kNN adjacency matrix (self excluded, + # as in PhenoGraph) + adjacency = _get_sparse_matrix_from_indices_distances( + knn_indices, np.ones_like(knn_indices), keep_self=False + ) + + # Compute |N(i) ∩ N(j)| + i_idx = np.repeat(np.arange(n_obs), n_neighbors - 1) + j_idx = knn_indices[:, 1:].ravel() + rows_i = adjacency[i_idx, :] + rows_j = adjacency[j_idx, :] + shared = np.asarray(rows_i.multiply(rows_j).sum(axis=1)).ravel() + + # Jaccard index + jaccard = shared / (2 * (n_neighbors - 1) - shared) + + # Build connectivity matrix, filtering out zeros + mask = jaccard != 0 + connectivities = sparse.csr_matrix( # noqa: TID251 + (jaccard[mask], (i_idx[mask], j_idx[mask])), + shape=(n_obs, n_obs), + ) + + # Symmetrise by averaging (as default in PhenoGraph) + connectivities = (connectivities + connectivities.T) / 2 + + return connectivities diff --git a/src/scanpy/neighbors/_types.py b/src/scanpy/neighbors/_types.py index 7c52ae56e6..a84550d177 100644 --- a/src/scanpy/neighbors/_types.py +++ b/src/scanpy/neighbors/_types.py @@ -12,7 +12,7 @@ # These two are used with get_literal_vals elsewhere -_Method = Literal["umap", "gauss"] +_Method = Literal["umap", "gauss", "jaccard"] _KnownTransformer = Literal["pynndescent", "sklearn", "rapids"] # sphinx-autodoc-typehints can’t transitively import types from if TYPE_CHECKING blocks, diff --git a/tests/test_neighbors.py b/tests/test_neighbors.py index 05fff75196..d1aaa46303 100644 --- a/tests/test_neighbors.py +++ b/tests/test_neighbors.py @@ -123,6 +123,29 @@ ] +# jaccard"kernel – only knn results +connectivities_jaccard = [ + [0.0, 0.3333333333333333, 0.0, 0.3333333333333333], + [0.3333333333333333, 0.0, 0.16666666666666666, 0.3333333333333333], + [0.0, 0.16666666666666666, 0.0, 0.16666666666666666], + [0.3333333333333333, 0.3333333333333333, 0.16666666666666666, 0.0], +] + +transitions_sym_jaccard = [ + [0.0, 0.4225771273642583, 0.0, 0.4225771273642583], + [0.4225771273642583, 0.0, 0.4225771273642583, 0.2857142857142857], + [0.0, 0.4225771273642583, 0.0, 0.4225771273642583], + [0.4225771273642583, 0.2857142857142857, 0.4225771273642583, 0.0], +] + +transitions_jaccard = [ + [0.0, 0.5, 0.0, 0.5], + [0.35714285714285715, 0.0, 0.35714285714285715, 0.2857142857142857], + [0.0, 0.5, 0.0, 0.5], + [0.35714285714285715, 0.2857142857142857, 0.35714285714285715, 0.0], +] + + def get_neighbors() -> Neighbors: return Neighbors(AnnData(np.array(X))) @@ -132,11 +155,11 @@ def neigh() -> Neighbors: return get_neighbors() -@pytest.mark.parametrize("method", ["umap", "gauss"]) +@pytest.mark.parametrize("method", ["umap", "gauss", "jaccard"]) def test_distances_euclidean( - mocker: MockerFixture, neigh: Neighbors, method: Literal["umap", "gauss"] + mocker: MockerFixture, neigh: Neighbors, method: Literal["umap", "gauss", "jaccard"] ): - """Umap and gauss behave the same for distances. + """Umap, gauss, and jaccard behave the same for distances. They call pynndescent for large data. """ @@ -194,6 +217,13 @@ def test_distances_all(neigh: Neighbors, transformer, knn): transitions_sym_gauss_knn, id="gauss", ), + pytest.param( + "jaccard", + connectivities_jaccard, + transitions_jaccard, + transitions_sym_jaccard, + id="jaccard", + ), ], ) def test_connectivities_euclidean(neigh: Neighbors, method, conn, trans, trans_sym):