feat: add modularity to scanpy.metrics (#3613)

amalia-k510 · flying-sheep · pre-commit-ci[bot] · web-flow · commit 9aaf126a9c44 · 2026-01-22T17:19:22.000+01:00
Co-authored-by: Philipp A. <flying-sheep@web.de>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
diff --git a/docs/api/metrics.md b/docs/api/metrics.md
@@ -15,6 +15,7 @@ Collections of useful measurements for evaluating results.
    :nosignatures:
    :toctree: ../generated/
 
+   metrics.modularity
    metrics.confusion_matrix
    metrics.gearys_c
    metrics.morans_i
diff --git a/docs/conf.py b/docs/conf.py
@@ -260,7 +260,10 @@ def setup(app: Sphinx) -> None:
     "scanpy.plotting._dotplot.DotPlot": "scanpy.pl.DotPlot",
     "scanpy.plotting._stacked_violin.StackedViolin": "scanpy.pl.StackedViolin",
     "pandas.core.series.Series": "pandas.Series",
+    # https://github.com/pandas-dev/pandas/issues/63810
+    "pandas.api.typing.aliases.AnyArrayLike": ("doc", "pandas:reference/aliases"),
     "numpy.bool_": "numpy.bool",  # Since numpy 2, numpy.bool is the canonical dtype
+    "numpy.typing.ArrayLike": ("py:data", "numpy.typing.ArrayLike"),
 }
 
 nitpick_ignore = [
diff --git a/docs/release-notes/3613.feat.md b/docs/release-notes/3613.feat.md
@@ -0,0 +1 @@
+Add modularity scoring via {func}`scanpy.metrics.modularity` with support for directed/undirected graphs {smaller}`A. Karesh`
diff --git a/hatch.toml b/hatch.toml
@@ -4,6 +4,7 @@ dependency-groups = [ "dev" ]
 
 [envs.docs]
 dependency-groups = [ "doc" ]
+extra-dependencies = [ "pandas>=3" ]
 scripts.build = "sphinx-build -M html docs docs/_build -W {args}"
 scripts.open = "python3 -m webbrowser -t docs/_build/html/index.html"
 scripts.clean = "git clean -fdX -- {args:docs}"
diff --git a/pyproject.toml b/pyproject.toml
@@ -113,7 +113,7 @@ test = [
 doc = [
     "sphinx>=8.2.3",
     "sphinx-book-theme>=1.1.0",
-    "scanpydoc>=0.16",
+    "scanpydoc>=0.16.1",
     "sphinx-autodoc-typehints>=1.25.2",
     "sphinx-issues>=5.0.1",
     "myst-parser>=2",
diff --git a/src/scanpy/_utils/__init__.py b/src/scanpy/_utils/__init__.py
@@ -883,20 +883,25 @@ class NeighborsView:
         This defines where to look for neighbors dictionary,
         connectivities, distances.
 
-        neigh = NeighborsView(adata, key)
-        neigh['distances']
-        neigh['connectivities']
-        neigh['params']
-        'connectivities' in neigh
-        'params' in neigh
-
-        is the same as
-
-        adata.obsp[adata.uns[key]['distances_key']]
-        adata.obsp[adata.uns[key]['connectivities_key']]
-        adata.uns[key]['params']
-        adata.uns[key]['connectivities_key'] in adata.obsp
-        'params' in adata.uns[key]
+    Examples
+    --------
+    >>> import scanpy as sc
+    >>> adata = sc.datasets.pbmc68k_reduced()
+    >>> key = "neighbors"
+
+    >>> neigh = NeighborsView(adata, key)
+    >>> d = neigh["distances"]
+    >>> c = neigh["connectivities"]
+    >>> p = neigh["params"]
+
+    is the same as doing this manually
+
+    >>> d_key = adata.uns[key].get("distances_key", "distances")
+    >>> c_key = adata.uns[key].get("connectivities_key", "connectivities")
+    >>> assert d is adata.obsp[d_key]
+    >>> assert c is adata.obsp[c_key]
+    >>> assert p is adata.uns[key]["params"]
+    >>> assert c_key in adata.obsp
 
     """
 
diff --git a/src/scanpy/metrics/__init__.py b/src/scanpy/metrics/__init__.py
@@ -3,7 +3,7 @@
 from __future__ import annotations
 
 from ._gearys_c import gearys_c
-from ._metrics import confusion_matrix
+from ._metrics import confusion_matrix, modularity
 from ._morans_i import morans_i
 
-__all__ = ["confusion_matrix", "gearys_c", "morans_i"]
+__all__ = ["confusion_matrix", "gearys_c", "modularity", "morans_i"]
diff --git a/src/scanpy/metrics/_metrics.py b/src/scanpy/metrics/_metrics.py
@@ -2,15 +2,28 @@
 
 from __future__ import annotations
 
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, overload
 
 import numpy as np
 import pandas as pd
+from anndata import AnnData
 from natsort import natsorted
 from pandas.api.types import CategoricalDtype
 
+from .._utils import NeighborsView
+
 if TYPE_CHECKING:
     from collections.abc import Sequence
+    from typing import Literal
+
+    if TYPE_CHECKING:
+        from pandas.api.typing.aliases import AnyArrayLike
+    else:  # sphinx-autodoc-typehints will execute the outer block, but end up here:
+        AnyArrayLike = type(
+            "AnyArrayLike", (), dict(__module__="pandas.api.typing.aliases")
+        )
+
+    from .._compat import SpBase
 
 
 def confusion_matrix(
@@ -89,3 +102,119 @@ def confusion_matrix(
     df = df.loc[np.array(orig_idx), np.array(new_idx)]
 
     return df
+
+
+# Overload for a raw graph: `labels` and `is_directed` have no defaults here.
+@overload
+def modularity(
+    connectivities: AnyArrayLike | SpBase, /, labels: AnyArrayLike, *, is_directed: bool
+) -> float: ...
+
+
+# Overload for `AnnData`: `labels` may be an `adata.obs` key, `is_directed` is not offered.
+@overload
+def modularity(
+    adata: AnnData,
+    /,
+    labels: str | AnyArrayLike = "leiden",
+    *,
+    neighbors_key: str | None = None,
+    mode: Literal["calculate", "update", "retrieve"] = "calculate",
+) -> float: ...
+
+
+def modularity(
+    adata_or_connectivities: AnnData | AnyArrayLike | SpBase,
+    /,
+    labels: str | AnyArrayLike = "leiden",
+    *,
+    neighbors_key: str | None = None,
+    is_directed: bool | None = None,
+    mode: Literal["calculate", "update", "retrieve"] = "calculate",
+) -> float:
+    """Compute the modularity of a graph given its connectivities and labels.
+
+    Parameters
+    ----------
+    adata_or_connectivities
+        The AnnData object containing the data or a weighted adjacency matrix representing the graph.
+    labels
+        Cluster labels for each node in the graph.
+        When `AnnData` is provided, this can be the key in `adata.obs` that contains the clustering labels and defaults to `"leiden"`.
+        When a connectivities matrix is provided, this must be an array of labels, not a string.
+    neighbors_key
+        When `AnnData` is provided, the key in `adata.uns` that contains the neighbors settings,
+        which in turn name the connectivities matrix in `adata.obsp`.
+        Not used when a connectivities matrix is provided.
+    is_directed
+        Whether the connectivities are directed or undirected.
+        Must be provided when a connectivities matrix is passed.
+        Always `False` if `AnnData` is provided, as connectivities are derived from (symmetric) neighbors.
+    mode
+        When `AnnData` is provided,
+        this controls if the stored modularity is retrieved,
+        or if we should calculate it (and optionally update it in `adata.uns[labels]`).
+        `'calculate'` calculates and returns the score,
+        `'update'` additionally stores it in `adata.uns[labels]['modularity']`,
+        and `'retrieve'` returns the value stored there without calculating anything.
+        `'update'` and `'retrieve'` require `labels` to be a string.
+        Not used when a connectivities matrix is provided.
+
+    Returns
+    -------
+    The modularity of the graph based on the provided clustering.
+
+    Raises
+    ------
+    ValueError
+        If `AnnData` is provided together with a truthy `is_directed`,
+        or if `mode` is `'retrieve'` or `'update'` and `labels` is not a string.
+    TypeError
+        If a connectivities matrix is provided and `labels` is a string
+        or `is_directed` is not specified.
+    """
+    if isinstance(adata_or_connectivities, AnnData):
+        # Only a truthy `is_directed` is rejected here; `None` and `False` both pass.
+        if is_directed:
+            msg = f"Connectivities stored in `AnnData` are undirected, can’t specify `{is_directed=!r}`"
+            raise ValueError(msg)
+        return modularity_adata(
+            adata_or_connectivities,
+            labels=labels,
+            neighbors_key=neighbors_key,
+            mode=mode,
+        )
+    # From here on the first argument is treated as a connectivities matrix:
+    # a string cannot be resolved to labels without an `AnnData` object.
+    if isinstance(labels, str):
+        msg = "`labels` must be provided as array when passing a connectivities array"
+        raise TypeError(msg)
+    if is_directed is None:
+        msg = "`is_directed` must be provided when passing a connectivities array"
+        raise TypeError(msg)
+    return modularity_array(
+        adata_or_connectivities, labels=labels, is_directed=is_directed
+    )
+
+
+def modularity_adata(
+    adata: AnnData,
+    /,
+    *,
+    labels: str | AnyArrayLike,
+    neighbors_key: str | None,
+    mode: Literal["calculate", "update", "retrieve"],
+) -> float:
+    """Calculate, store, or look up the modularity of a clustering of `adata`.
+
+    Parameters
+    ----------
+    adata
+        Annotated data matrix with a neighbors graph.
+    labels
+        Key in `adata.obs` holding the cluster labels, or the labels themselves.
+        Must be a string when `mode` is `'retrieve'` or `'update'`,
+        since it is also used as the key into `adata.uns`.
+    neighbors_key
+        Passed to `NeighborsView` to locate the connectivities.
+    mode
+        `'retrieve'` returns `adata.uns[labels]['modularity']` as stored,
+        `'calculate'` calculates and returns the score,
+        `'update'` calculates, stores it in `adata.uns[labels]['modularity']`, and returns it.
+
+    Returns
+    -------
+    The modularity score.
+
+    Raises
+    ------
+    ValueError
+        If `mode` is `'retrieve'` or `'update'` and `labels` is not a string.
+    """
+    if mode in {"retrieve", "update"} and not isinstance(labels, str):
+        msg = "`labels` must be a string when `mode` is `'retrieve'` or `'update'`"
+        raise ValueError(msg)
+    if mode == "retrieve":
+        return adata.uns[labels]["modularity"]
+
+    labels_vec = adata.obs[labels] if isinstance(labels, str) else labels
+    connectivities = NeighborsView(adata, neighbors_key)["connectivities"]
+
+    # distances are treated as symmetric, so connectivities as well
+    m = modularity(connectivities, labels_vec, is_directed=False)
+    if mode == "update":
+        # The labels column may not have an `adata.uns` entry yet
+        # (e.g. a manually added `adata.obs` column), so create it on demand
+        # instead of failing with a `KeyError` after the score was computed.
+        if labels not in adata.uns:
+            adata.uns[labels] = {}
+        adata.uns[labels]["modularity"] = m
+    return m
+
+
+def modularity_array(
+    connectivities: AnyArrayLike | SpBase, /, *, labels: AnyArrayLike, is_directed: bool
+) -> float:
+    """Calculate the modularity of `labels` on the graph given by `connectivities`.
+
+    Parameters
+    ----------
+    connectivities
+        Adjacency matrix handed to `igraph.Graph.Weighted_Adjacency`.
+    labels
+        Cluster label per node; converted to integer codes before use.
+    is_directed
+        Whether to build a directed or an undirected graph.
+
+    Returns
+    -------
+    The modularity score as computed by igraph.
+
+    Raises
+    ------
+    ImportError
+        If igraph is not installed.
+    """
+    try:
+        import igraph as ig
+    except ImportError as e:  # pragma: no cover
+        msg = "igraph is required for computing modularity"
+        raise ImportError(msg) from e
+    igraph_mode: str = ig.ADJ_DIRECTED if is_directed else ig.ADJ_UNDIRECTED
+    graph: ig.Graph = ig.Graph.Weighted_Adjacency(connectivities, mode=igraph_mode)
+    # NOTE(review): no `weights=` argument is passed to `graph.modularity`,
+    # so the edge weights created above may be ignored — confirm against the igraph docs.
+    return graph.modularity(_codes(labels))
+
+
+def _codes(labels: AnyArrayLike) -> AnyArrayLike:
+    """Convert cluster labels to integer codes as required by igraph.
+
+    Labels that are not already a :class:`pandas.Categorical` are converted to one;
+    the returned value is its `.codes` attribute.
+    """
+    if isinstance(labels, pd.Series):
+        # take the array backing the series, so the check below sees a `Categorical`
+        labels = labels.astype("category").array
+    if not isinstance(labels, pd.Categorical):
+        labels = pd.Categorical(labels)
+    return labels.codes
diff --git a/src/scanpy/tools/_leiden.py b/src/scanpy/tools/_leiden.py
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, cast
 
 import numpy as np
 import pandas as pd
@@ -47,7 +47,7 @@ def leiden(  # noqa: PLR0912, PLR0913, PLR0915
     flavor: Literal["leidenalg", "igraph"] | None = None,
     **clustering_args,
 ) -> AnnData | None:
-    """Cluster cells into subgroups :cite:p:`Traag2019`.
+    r"""Cluster cells into subgroups :cite:p:`Traag2019`.
 
     Cluster cells using the Leiden algorithm :cite:p:`Traag2019`,
     an improved version of the Louvain algorithm :cite:p:`Blondel2008`.
@@ -120,6 +120,12 @@ def leiden(  # noqa: PLR0912, PLR0913, PLR0915
         A dict with the values for the parameters `resolution`, `random_state`,
         and `n_iterations`.
 
+    `adata.uns['leiden' | key_added]['modularity']` : :class:`float`
+        The modularity score of the final clustering,
+        as calculated by the `flavor`.
+        Use :func:`scanpy.metrics.modularity`\ `(adata, mode='calculate' | 'update')`
+        to calculate a score independent of `flavor`.
+
     """
     if flavor is None:
         flavor = "leidenalg"
@@ -178,7 +184,10 @@ def leiden(  # noqa: PLR0912, PLR0913, PLR0915
         if use_weights:
             clustering_args["weights"] = np.array(g.es["weight"]).astype(np.float64)
         clustering_args["seed"] = random_state
-        part = leidenalg.find_partition(g, partition_type, **clustering_args)
+        part = cast(
+            "MutableVertexPartition",
+            leidenalg.find_partition(g, partition_type, **clustering_args),
+        )
     else:
         g = _utils.get_igraph_from_adjacency(adjacency, directed=False)
         if use_weights:
@@ -212,6 +221,7 @@ def leiden(  # noqa: PLR0912, PLR0913, PLR0915
         random_state=random_state,
         n_iterations=n_iterations,
     )
+    adata.uns[key_added]["modularity"] = part.modularity
     logg.info(
         "    finished",
         time=start,
diff --git a/tests/test_clustering.py b/tests/test_clustering.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 from functools import partial
+from typing import TYPE_CHECKING
 
 import pandas as pd
 import pytest
@@ -10,21 +11,27 @@
 from testing.scanpy._helpers.data import pbmc68k_reduced
 from testing.scanpy._pytest.marks import needs
 
+if TYPE_CHECKING:
+    from typing import Literal
+
 
 @pytest.fixture
 def adata_neighbors():
     return pbmc68k_reduced()
 
 
-FLAVORS = [
-    pytest.param("igraph", marks=needs.igraph),
-    pytest.param("leidenalg", marks=needs.leidenalg),
-]
+@pytest.fixture(
+    params=[
+        pytest.param("igraph", marks=needs.igraph),
+        pytest.param("leidenalg", marks=needs.leidenalg),
+    ]
+)
+def flavor(request: pytest.FixtureRequest) -> Literal["igraph", "leidenalg"]:
+    """Parametrize a test over the `"igraph"` and `"leidenalg"` flavors, each carrying its `needs` mark."""
+    return request.param
 
 
 @needs.leidenalg
 @needs.igraph
-@pytest.mark.parametrize("flavor", FLAVORS)
 @pytest.mark.parametrize("resolution", [1, 2])
 @pytest.mark.parametrize("n_iterations", [-1, 3])
 def test_leiden_basic(adata_neighbors, flavor, resolution, n_iterations):
@@ -44,7 +51,6 @@ def test_leiden_basic(adata_neighbors, flavor, resolution, n_iterations):
 
 @needs.leidenalg
 @needs.igraph
-@pytest.mark.parametrize("flavor", FLAVORS)
 def test_leiden_random_state(adata_neighbors, flavor):
     is_leiden_alg = flavor == "leidenalg"
     n_iterations = 2 if is_leiden_alg else -1
@@ -72,8 +78,18 @@ def test_leiden_random_state(adata_neighbors, flavor):
         directed=is_leiden_alg,
         n_iterations=n_iterations,
     )
+    # reproducible
     pd.testing.assert_series_equal(adata_1.obs["leiden"], adata_1_again.obs["leiden"])
+    assert (
+        pytest.approx(adata_1.uns["leiden"]["modularity"])
+        == adata_1_again.uns["leiden"]["modularity"]
+    )
+    # different clustering
     assert not adata_2.obs["leiden"].equals(adata_1_again.obs["leiden"])
+    assert (
+        pytest.approx(adata_2.uns["leiden"]["modularity"])
+        != adata_1_again.uns["leiden"]["modularity"]
+    )
 
 
 @needs.igraph
diff --git a/tests/test_metrics.py b/tests/test_metrics.py

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	+Add modularity scoring via {func}`scanpy.metrics.modularity` with support for directed/undirected graphs {smaller}`A. Karesh`