diff --git a/docs/release-notes/3616.feature.md b/docs/release-notes/3616.feature.md
new file mode 100644
index 0000000000..06ea8b523d
--- /dev/null
+++ b/docs/release-notes/3616.feature.md
@@ -0,0 +1 @@
+Add modularity scoring via {func}`~scanpy.metrics.modularity_adata` with support for directed/undirected graphs {smaller}`A. Karesh`
diff --git a/src/scanpy/metrics/__init__.py b/src/scanpy/metrics/__init__.py
index 9cdf82bdf2..5ec2229909 100644
--- a/src/scanpy/metrics/__init__.py
+++ b/src/scanpy/metrics/__init__.py
@@ -3,7 +3,7 @@
 from __future__ import annotations
 
 from ._gearys_c import gearys_c
-from ._metrics import confusion_matrix
+from ._metrics import confusion_matrix, modularity, modularity_adata
 from ._morans_i import morans_i
 
-__all__ = ["confusion_matrix", "gearys_c", "morans_i"]
+__all__ = ["confusion_matrix", "gearys_c", "modularity", "modularity_adata", "morans_i"]
diff --git a/src/scanpy/metrics/_metrics.py b/src/scanpy/metrics/_metrics.py
index c24cfe9fd1..8c352abf16 100644
--- a/src/scanpy/metrics/_metrics.py
+++ b/src/scanpy/metrics/_metrics.py
@@ -8,10 +8,16 @@
 import pandas as pd
 from natsort import natsorted
 from pandas.api.types import CategoricalDtype
+from scipy.sparse import coo_matrix
+
+from .._compat import SpBase
 
 if TYPE_CHECKING:
     from collections.abc import Sequence
 
+    from anndata import AnnData
+    from numpy.typing import ArrayLike
+
 
 def confusion_matrix(
     orig: pd.Series | np.ndarray | Sequence,
@@ -60,7 +66,7 @@
     orig, new = pd.Series(orig), pd.Series(new)
     assert len(orig) == len(new)
 
-    unique_labels = pd.unique(np.concatenate((orig.values, new.values)))
+    unique_labels = pd.unique(np.concatenate((orig.to_numpy(), new.to_numpy())))
 
     # Compute
     mtx = _confusion_matrix(orig, new, labels=unique_labels)
@@ -89,3 +95,90 @@
         df = df.loc[np.array(orig_idx), np.array(new_idx)]
 
     return df
+
+
+def modularity(
+    connectivities: ArrayLike | SpBase,
+    labels: pd.Series | ArrayLike,
+    *,
+    is_directed: bool,
+) -> float:
+    """Compute the modularity of a graph given its connectivities and labels.
+
+    Parameters
+    ----------
+    connectivities
+        Weighted adjacency matrix representing the graph. Can be a dense NumPy array or a SciPy sparse matrix.
+    labels
+        Cluster labels for each node in the graph.
+    is_directed
+        Whether the graph is directed or undirected. If True, the graph is treated as directed; otherwise, it is treated as undirected.
+
+    Returns
+    -------
+    float
+        The modularity of the graph based on the provided clustering.
+ """ + try: + # try to import igraph in case the user wants to calculate modularity + # not in the main module to avoid import errors + import igraph as ig + except ImportError as e: + msg = "igraph is require for computing modularity" + raise ImportError(msg) from e + if isinstance(connectivities, SpBase): + # check if the connectivities is a sparse matrix + coo = coo_matrix(connectivities) + edges = list(zip(coo.row, coo.col, strict=False)) + # converting to the coo format to extract the edges and weights + # storing only non-zero elements and their indices + weights = coo.data.tolist() + graph = ig.Graph(edges=edges, directed=is_directed) + graph.es["weight"] = weights + else: + # if the graph is dense, creates it directly using igraph's adjacency matrix + dense_array = np.asarray(connectivities) + igraph_mode = ig.ADJ_DIRECTED if is_directed else ig.ADJ_UNDIRECTED + graph = ig.Graph.Weighted_Adjacency(dense_array.tolist(), mode=igraph_mode) + # cluster labels to integer codes required by igraph + labels = pd.Categorical(np.asarray(labels)).codes + + return graph.modularity(labels) + + +def modularity_adata( + adata: AnnData, + *, + label_key: str | ArrayLike = "leiden", + key: str = "neighbors", + is_directed: bool, +) -> float: + # default to leiden labels and connectivities as it is more common + """Compute modularity from an AnnData object using stored graph and clustering labels. + + Parameters + ---------- + adata: + The AnnData object containing the data. + label_key: + The key in `adata.obs` that contains the clustering labels. Defaults to "leiden". + key: + The key in `adata.obsp` that contains the connectivities. Defaults to "neighbors". + is_directed: + Whether the graph is directed or undirected. If True, the graph is treated as directed; otherwise, it is treated as undirected. + + Returns + ------- + float + The modularity of the graph based on the provided clustering. + """ + label_array = adata.obs[label_key] if isinstance(label_key, str) else label_key + connectivities_key = adata.uns[key]["connectivities_key"] + params = adata.uns[key]["params"] + connectivities = adata.obsp[connectivities_key] + is_directed = params["is_directed"] + + if isinstance(connectivities, pd.DataFrame): + connectivities = connectivities.values + + return modularity(connectivities, label_array, is_directed=is_directed) diff --git a/src/scanpy/neighbors/__init__.py b/src/scanpy/neighbors/__init__.py index 4a42111b74..8058c559db 100644 --- a/src/scanpy/neighbors/__init__.py +++ b/src/scanpy/neighbors/__init__.py @@ -21,15 +21,17 @@ from .._utils import NeighborsView, _doc_params, get_literal_vals from . 
 from . import _connectivity
 from ._common import (
+    _get_indices_distances_from_dense_matrix,
     _get_indices_distances_from_sparse_matrix,
     _get_sparse_matrix_from_indices_distances,
 )
+from ._connectivity import umap
 from ._doc import doc_n_pcs, doc_use_rep
 from ._types import _KnownTransformer, _Method
 
 if TYPE_CHECKING:
     from collections.abc import Callable, MutableMapping
-    from typing import Any, Literal, NotRequired
+    from typing import Any, Literal, NotRequired, Unpack
 
     from anndata import AnnData
     from igraph import Graph
@@ -58,6 +60,13 @@ class KwdsForTransformer(TypedDict):
     random_state: _LegacyRandom
 
 
+class NeighborsDict(TypedDict):  # noqa: D101
+    connectivities_key: str
+    distances_key: str
+    params: NeighborsParams
+    rp_forest: NotRequired[RPForestDict]
+
+
 class NeighborsParams(TypedDict):  # noqa: D101
     n_neighbors: int
     method: _Method
@@ -74,6 +83,7 @@ def neighbors(  # noqa: PLR0913
     n_neighbors: int = 15,
     n_pcs: int | None = None,
     *,
+    distances: np.ndarray | SpBase | None = None,
     use_rep: str | None = None,
     knn: bool = True,
     method: _Method = "umap",
@@ -83,6 +93,7 @@
     random_state: _LegacyRandom = 0,
     key_added: str | None = None,
     copy: bool = False,
+    is_directed: bool = False,
 ) -> AnnData | None:
     """Compute the nearest neighbors distance matrix and a neighborhood graph of observations :cite:p:`McInnes2018`.
 
@@ -135,6 +146,7 @@
         Use :func:`rapids_singlecell.pp.neighbors` instead.
     metric
         A known metric’s name or a callable that returns a distance.
+        If `distances` is given, this parameter is simply stored in `.uns` (see below).
 
         *ignored if ``transformer`` is an instance.*
     metric_kwds
@@ -186,6 +198,18 @@
     :doc:`/how-to/knn-transformers`
 
     """
+    if distances is not None:
+        if callable(metric):
+            msg = "`metric` must be a string if `distances` is given."
+            raise TypeError(msg)
+        # if a precomputed distance matrix is provided, skip the PCA and distance computation
+        return neighbors_from_distance(
+            adata,
+            distances,
+            n_neighbors=n_neighbors,
+            metric=metric,
+            method=method,
+        )
     start = logg.info("computing neighbors")
     adata = adata.copy() if copy else adata
     if adata.is_view:  # we shouldn't need this here...
@@ -203,51 +227,124 @@
         random_state=random_state,
     )
 
-    if key_added is None:
-        key_added = "neighbors"
-        conns_key = "connectivities"
-        dists_key = "distances"
-    else:
-        conns_key = key_added + "_connectivities"
-        dists_key = key_added + "_distances"
-
-    adata.uns[key_added] = {}
-
-    neighbors_dict = adata.uns[key_added]
-
-    neighbors_dict["connectivities_key"] = conns_key
-    neighbors_dict["distances_key"] = dists_key
-
-    neighbors_dict["params"] = NeighborsParams(
+    key_added, neighbors_dict = _get_metadata(
+        key_added,
         n_neighbors=neighbors.n_neighbors,
         method=method,
         random_state=random_state,
         metric=metric,
+        **({} if not metric_kwds else dict(metric_kwds=metric_kwds)),
+        **({} if use_rep is None else dict(use_rep=use_rep)),
+        **({} if n_pcs is None else dict(n_pcs=n_pcs)),
     )
-    if metric_kwds:
-        neighbors_dict["params"]["metric_kwds"] = metric_kwds
-    if use_rep is not None:
-        neighbors_dict["params"]["use_rep"] = use_rep
-    if n_pcs is not None:
-        neighbors_dict["params"]["n_pcs"] = n_pcs
-    adata.obsp[dists_key] = neighbors.distances
-    adata.obsp[conns_key] = neighbors.connectivities
+    neighbors_dict["params"]["is_directed"] = is_directed
 
     if neighbors.rp_forest is not None:
         neighbors_dict["rp_forest"] = neighbors.rp_forest
+
+    adata.uns[key_added] = neighbors_dict
+    adata.obsp[neighbors_dict["distances_key"]] = neighbors.distances
+    adata.obsp[neighbors_dict["connectivities_key"]] = neighbors.connectivities
+
     logg.info(
         "    finished",
         time=start,
         deep=(
             f"added to `.uns[{key_added!r}]`\n"
-            f"    `.obsp[{dists_key!r}]`, distances for each pair of neighbors\n"
-            f"    `.obsp[{conns_key!r}]`, weighted adjacency matrix"
+            f"    `.obsp[{neighbors_dict['distances_key']!r}]`, distances for each pair of neighbors\n"
+            f"    `.obsp[{neighbors_dict['connectivities_key']!r}]`, weighted adjacency matrix"
         ),
     )
     return adata if copy else None
 
 
+def neighbors_from_distance(
+    adata: AnnData,
+    distances: np.ndarray | SpBase,
+    *,
+    n_neighbors: int = 15,
+    metric: _Metric = "euclidean",
+    method: _Method = "umap",
+    key_added: str | None = None,
+) -> AnnData:
+    """Compute neighbors from a precomputed distance matrix.
+
+    Parameters
+    ----------
+    adata
+        Annotated data matrix.
+    distances
+        Precomputed dense or sparse distance matrix.
+    n_neighbors
+        Number of nearest neighbors to use in the graph.
+    method
+        Method to use for computing the graph. Currently 'umap' and 'gauss' are supported.
+    key_added
+        Optional key under which to store the results. Default is 'neighbors'.
+
+    Returns
+    -------
+    adata
+        Annotated data with computed distances and connectivities.
+    """
+    if isinstance(distances, SpBase):
+        distances = sparse.csr_matrix(distances)  # noqa: TID251
+        distances.setdiag(0)
+        distances.eliminate_zeros()
+    else:
+        distances = np.asarray(distances)
+        np.fill_diagonal(distances, 0)
+
+    if method == "umap":
+        if isinstance(distances, CSRBase):
+            knn_indices, knn_distances = _get_indices_distances_from_sparse_matrix(
+                distances, n_neighbors
+            )
+        else:
+            knn_indices, knn_distances = _get_indices_distances_from_dense_matrix(
+                distances, n_neighbors
+            )
+        connectivities = umap(
+            knn_indices, knn_distances, n_obs=adata.n_obs, n_neighbors=n_neighbors
+        )
+    elif method == "gauss":
+        distances = sparse.csr_matrix(distances)  # noqa: TID251
+        connectivities = _connectivity.gauss(distances, n_neighbors, knn=True)
+    else:
+        msg = f"Method {method} not implemented."
+        raise NotImplementedError(msg)
+
+    key_added, neighbors_dict = _get_metadata(
+        key_added,
+        n_neighbors=n_neighbors,
+        method=method,
+        random_state=0,
+        metric=metric,
+    )
+    adata.uns[key_added] = neighbors_dict
+    adata.obsp[neighbors_dict["distances_key"]] = distances
+    adata.obsp[neighbors_dict["connectivities_key"]] = connectivities
+    return adata
+
+
+def _get_metadata(
+    key_added: str | None,
+    **params: Unpack[NeighborsParams],
+) -> tuple[str, NeighborsDict]:
+    if key_added is None:
+        return "neighbors", NeighborsDict(
+            connectivities_key="connectivities",
+            distances_key="distances",
+            params=params,
+        )
+    return key_added, NeighborsDict(
+        connectivities_key=f"{key_added}_connectivities",
+        distances_key=f"{key_added}_distances",
+        params=params,
+    )
+
+
 class FlatTree(NamedTuple):  # noqa: D101
     hyperplanes: None
     offsets: None
diff --git a/tests/test_metrics.py b/tests/test_metrics.py
index 414c9b479c..7fae0fe3cd 100644
--- a/tests/test_metrics.py
+++ b/tests/test_metrics.py
@@ -10,9 +10,12 @@
 import pytest
 import threadpoolctl
 from scipy import sparse
+from scipy.sparse import csr_matrix  # noqa: TID251
 
 import scanpy as sc
+from scanpy.metrics import modularity, modularity_adata
 from testing.scanpy._helpers.data import pbmc68k_reduced
+from testing.scanpy._pytest.marks import needs
 from testing.scanpy._pytest.params import ARRAY_TYPES
 
 if TYPE_CHECKING:
@@ -196,3 +199,107 @@
     pd.testing.assert_frame_equal(
         expected, sc.metrics.confusion_matrix(data["a"], "b", data)
     )
+
+
+# Test 1: Sample graph with clear community structure (dense & sparse, directed & undirected)
+@pytest.mark.parametrize("is_directed", [False, True], ids=["undirected", "directed"])
+@pytest.mark.parametrize("use_sparse", [False, True], ids=["dense", "sparse"])
+@needs.igraph
+def test_modularity_sample_structure(use_sparse, is_directed):
+    # 4-node adjacency matrix with two separate 2-node communities
+    mat = np.array(
+        [
+            [1, 1, 0, 0],
+            [1, 1, 0, 0],
+            [0, 0, 1, 1],
+            [0, 0, 1, 1],
+        ]
+    )
+    labels = ["A", "A", "B", "B"]
+    adj = csr_matrix(mat) if use_sparse else mat
+    score = modularity(adj, labels, is_directed=is_directed)
+
+    # Modularity should be between 0 and 1
+    assert 0 <= score <= 1
+
+
+# Test 2: Edge case when all nodes belong to the same community/cluster
+@needs.igraph
+def test_modularity_single_community():
+    # fully connected graph sample
+    adj = np.ones((4, 4)) - np.eye(4)
+    labels = ["A", "A", "A", "A"]
+    score = modularity(adj, labels, is_directed=False)
+
+    # modularity should be 0
+    assert score == pytest.approx(0.0, abs=1e-6)
+
+
+# Test 3: Invalid input, labels length does not match adjacency matrix size
+@needs.igraph
+def test_modularity_invalid_labels():
+    from igraph._igraph import InternalError
+
+    adj = np.eye(4)
+    labels = ["A", "A", "B"]
+    with pytest.raises(
+        InternalError,
+        match="Membership vector size differs",
+    ):
+        modularity(adj, labels, is_directed=False)
+
+
+# Test 4: Pass both Louvain and Leiden clustering algorithms
+@pytest.mark.parametrize("cluster_method", ["louvain", "leiden"])
+@needs.igraph
+@needs.louvain
+@needs.leidenalg
+def test_modularity_adata_multiple_clusterings(cluster_method):
+    # Load PBMC data and compute PCA and the neighbors graph
+    adata = sc.datasets.pbmc3k()
+    sc.pp.pca(adata)
+    sc.pp.neighbors(adata)
+    # Compute modularity using either Louvain or Leiden clustering
+    if cluster_method == "louvain":
+        sc.tl.louvain(adata)
+        score_louvain = modularity_adata(
+            adata, label_key="louvain", key="neighbors", is_directed=False
+        )
+        # Score should be between 0 and 1
+        assert 0 <= score_louvain <= 1
+    if cluster_method == "leiden":
+        sc.tl.leiden(adata)
+        score_leiden = modularity_adata(
+            adata, label_key="leiden", key="neighbors", is_directed=False
+        )
+        # Score should be between 0 and 1
+        assert 0 <= score_leiden <= 1
+
+
+# Test 5: Modularity should be the same no matter the order of the labels
+@needs.igraph
+def test_modularity_order():
+    adj = np.array(
+        [
+            [1, 1, 0, 0],
+            [1, 1, 0, 0],
+            [0, 0, 1, 1],
+            [0, 0, 1, 1],
+        ]
+    )
+    labels1 = ["A", "A", "B", "B"]
+    labels2 = ["B", "B", "A", "A"]
+    score_1 = modularity(adj, labels1, is_directed=False)
+    score_2 = modularity(adj, labels2, is_directed=False)
+    assert score_1 == score_2
+
+
+# Test 6: Modularity on a disconnected graph, an edge case for some algorithms
+@needs.igraph
+def test_modularity_disconnected_graph():
+    adj = np.zeros((4, 4))
+    labels = ["A", "B", "C", "D"]
+    score = modularity(adj, labels, is_directed=False)
+
+    # Modularity is undefined for a graph without edges
+    assert np.isnan(score)
diff --git a/tests/test_neighbors.py b/tests/test_neighbors.py
index 904c47f813..7562394548 100644
--- a/tests/test_neighbors.py
+++ b/tests/test_neighbors.py
@@ -13,6 +13,7 @@
 from scanpy import Neighbors
 from scanpy._compat import CSBase
 from testing.scanpy._helpers import anndata_v0_8_constructor_compat
+from testing.scanpy._helpers.data import pbmc68k_reduced
 
 if TYPE_CHECKING:
     from typing import Literal
@@ -241,3 +242,22 @@ def test_restore_n_neighbors(neigh, conv):
     ad.uns["neighbors"] = dict(connectivities=conv(neigh.connectivities))
     neigh_restored = Neighbors(ad)
     assert neigh_restored.n_neighbors == 1
+
+
+def test_neighbors_distance_equivalence():
+    adata = pbmc68k_reduced()
+    adata_d = adata.copy()
+
+    sc.pp.neighbors(adata)
+    # reuse the distances just computed as a precomputed input
+    sc.pp.neighbors(adata_d, distances=adata.obsp["distances"])
+    np.testing.assert_allclose(
+        adata.obsp["connectivities"].toarray(),
+        adata_d.obsp["connectivities"].toarray(),
+        rtol=1e-5,
+    )
+    np.testing.assert_allclose(
+        adata.obsp["distances"].toarray(),
+        adata_d.obsp["distances"].toarray(),
+        rtol=1e-5,
+    )
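
For reviewers, a minimal usage sketch of the new metrics API (not part of the patch). It assumes this branch is installed together with igraph and leidenalg; the bundled dataset and the clustering step are purely illustrative.

```python
import numpy as np
import scanpy as sc
from scanpy.metrics import modularity, modularity_adata

# Small bundled dataset; any AnnData with a neighbors graph and labels works.
adata = sc.datasets.pbmc68k_reduced()
sc.pp.neighbors(adata)  # writes .obsp["connectivities"] and .uns["neighbors"]
sc.tl.leiden(adata)     # writes .obs["leiden"]

# High-level entry point: graph and labels are read from the AnnData object.
score = modularity_adata(adata, label_key="leiden", key="neighbors", is_directed=False)
print(f"modularity of the Leiden partition: {score:.3f}")

# Low-level entry point: any adjacency matrix plus a label vector.
adj = np.array([[1, 1, 0, 0], [1, 1, 0, 0], [0, 0, 1, 1], [0, 0, 1, 1]])
print(modularity(adj, ["A", "A", "B", "B"], is_directed=False))
```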
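Similarly, a sketch of the new precomputed-distances path in `sc.pp.neighbors` (also not part of the patch). The distance matrix here is built with scikit-learn for illustration only; any dense or sparse `n_obs × n_obs` distance matrix should work.

```python
import scanpy as sc
from sklearn.metrics import pairwise_distances

adata = sc.datasets.pbmc68k_reduced()

# Build a dense pairwise distance matrix from the stored PCA representation.
dists = pairwise_distances(adata.obsm["X_pca"], metric="euclidean")

# The precomputed matrix bypasses the internal PCA/kNN search; `metric` is
# only recorded in .uns["neighbors"]["params"] for bookkeeping.
sc.pp.neighbors(adata, n_neighbors=15, distances=dists, metric="euclidean")

print(adata.obsp["connectivities"].shape)
print(adata.uns["neighbors"]["params"])
```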