From f76dc7b628c1c300c1c39cc60592ff844fdd2bdb Mon Sep 17 00:00:00 2001
From: amalia-k510
Date: Fri, 25 Apr 2025 12:57:22 +0200
Subject: [PATCH 01/29] Modularity score functions with comments

---
 src/scanpy/metrics/__init__.py |  4 +-
 src/scanpy/metrics/_metrics.py | 67 +++++++++++++++++++++++++++++++++-
 2 files changed, 68 insertions(+), 3 deletions(-)

diff --git a/src/scanpy/metrics/__init__.py b/src/scanpy/metrics/__init__.py
index 9cdf82bdf2..5ec2229909 100644
--- a/src/scanpy/metrics/__init__.py
+++ b/src/scanpy/metrics/__init__.py
@@ -3,7 +3,7 @@
 from __future__ import annotations
 
 from ._gearys_c import gearys_c
-from ._metrics import confusion_matrix
+from ._metrics import confusion_matrix, modularity, modularity_adata
 from ._morans_i import morans_i
 
-__all__ = ["confusion_matrix", "gearys_c", "morans_i"]
+__all__ = ["confusion_matrix", "gearys_c", "modularity", "modularity_adata", "morans_i"]
diff --git a/src/scanpy/metrics/_metrics.py b/src/scanpy/metrics/_metrics.py
index c24cfe9fd1..f8dec547b8 100644
--- a/src/scanpy/metrics/_metrics.py
+++ b/src/scanpy/metrics/_metrics.py
@@ -4,11 +4,15 @@
 
 from typing import TYPE_CHECKING
 
+import igraph as ig
 import numpy as np
 import pandas as pd
+from anndata import AnnData
 from natsort import natsorted
 from pandas.api.types import CategoricalDtype
 
+from scanpy._compat import CSRBase
+
 if TYPE_CHECKING:
     from collections.abc import Sequence
 
@@ -60,7 +64,9 @@ def confusion_matrix(
     orig, new = pd.Series(orig), pd.Series(new)
     assert len(orig) == len(new)
 
-    unique_labels = pd.unique(np.concatenate((orig.values, new.values)))
+    unique_labels = pd.unique(
+        np.concatenate((np.asarray(orig.values), np.asarray(new.values)))
+    )
 
     # Compute
     mtx = _confusion_matrix(orig, new, labels=unique_labels)
@@ -89,3 +95,62 @@ def confusion_matrix(
         df = df.loc[np.array(orig_idx), np.array(new_idx)]
 
     return df
+
+
+def modularity(connectivities, labels, mode="UNDIRECTED") -> float:
+    # default mode is undirected?? can be specified as directed or undirected
+    """Compute the modularity of a graph given its connectivities and labels.
+
+    Parameters
+    ----------
+    connectivities: array-like or sparse matrix
+        Weighted adjacency matrix representing the graph. Can be a dense NumPy array or a sparse CSR matrix.
+    labels: array-like or pandas.Series
+        Cluster labels for each node in the graph.
+    mode: str
+        The mode of the graph. Can be "UNDIRECTED" or "DIRECTED". Default is "UNDIRECTED".
+
+    Returns
+    -------
+    float
+        The modularity of the graph based on the provided clustering.
+    """
+    if isinstance(connectivities, CSRBase):
+        # Convert sparse matrix to dense format so that igraph can handle it
+        # Weighted_Adjacency expects with nested lists or numpy arrays and not sparse
+        # matrices
+        dense_connectivities = connectivities.toarray()
+    else:
+        dense_connectivities = connectivities
+    # creating igraph graph from the dense connectivity matrix
+    graph = ig.Graph.Weighted_Adjacency(dense_connectivities.tolist(), mode=mode)
+    if isinstance(labels, pd.Series):
+        labels = labels.values
+    # making sure labels are in the right format, i.e., a list of integers
+    labels = pd.Categorical(np.asarray(labels)).codes
+    return graph.modularity(labels)
+
+
+def modularity_adata(adata: AnnData, labels="leiden", obsp="connectivities") -> float:
+    """Compute modularity from an AnnData object using stored graph and clustering labels.
+
+    Parameters
+    ----------
+    adata: AnnData
+        The AnnData object containing the data.
+ labels: str or array-like + The key in adata.obs that contains the cluster labels. + obsp: str + The key in adata.obsp that contains the connectivities. + + Returns + ------- + float + The modularity of the graph based on the provided clustering. + """ + # if user passes leiden or louvain as a string, this will get the actual labels + # from adata.obs + label_array = adata.obs[labels] if isinstance(labels, str) else labels + # extracting the connectivities from adata.obsp["connectivities"] + connectivities = adata.obsp[obsp] + return modularity(connectivities, label_array) From f092469ae96b1c679a2e34b50097c478cd56c4e1 Mon Sep 17 00:00:00 2001 From: amalia-k510 Date: Fri, 25 Apr 2025 13:05:12 +0200 Subject: [PATCH 02/29] typo fix --- src/scanpy/metrics/_metrics.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/scanpy/metrics/_metrics.py b/src/scanpy/metrics/_metrics.py index f8dec547b8..580a24a8cf 100644 --- a/src/scanpy/metrics/_metrics.py +++ b/src/scanpy/metrics/_metrics.py @@ -117,8 +117,7 @@ def modularity(connectivities, labels, mode="UNDIRECTED") -> float: """ if isinstance(connectivities, CSRBase): # Convert sparse matrix to dense format so that igraph can handle it - # Weighted_Adjacency expects with nested lists or numpy arrays and not sparse - # matrices + # Weighted_Adjacency expects with nested lists or numpy arrays and not sparse matrices dense_connectivities = connectivities.toarray() else: dense_connectivities = connectivities From 68652a7ca8f9a5a9a10d26351fccb0a7291c498e Mon Sep 17 00:00:00 2001 From: amalia-k510 Date: Wed, 7 May 2025 14:32:07 +0200 Subject: [PATCH 03/29] modularity code updated and 6 tests written for modularity --- src/scanpy/metrics/_metrics.py | 78 +++++++++++++----- tests/test_metrics.py | 139 +++++++++++++++++++++++++++++++++ 2 files changed, 196 insertions(+), 21 deletions(-) diff --git a/src/scanpy/metrics/_metrics.py b/src/scanpy/metrics/_metrics.py index 580a24a8cf..f44fe403c5 100644 --- a/src/scanpy/metrics/_metrics.py +++ b/src/scanpy/metrics/_metrics.py @@ -2,19 +2,22 @@ from __future__ import annotations -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, cast -import igraph as ig import numpy as np import pandas as pd -from anndata import AnnData from natsort import natsorted from pandas.api.types import CategoricalDtype +from scipy.sparse import coo_matrix -from scanpy._compat import CSRBase +from .._compat import CSRBase, SpBase if TYPE_CHECKING: from collections.abc import Sequence + from typing import Literal + + from anndata import AnnData + from numpy.typing import ArrayLike def confusion_matrix( @@ -97,8 +100,13 @@ def confusion_matrix( return df -def modularity(connectivities, labels, mode="UNDIRECTED") -> float: - # default mode is undirected?? can be specified as directed or undirected +def modularity( + connectivities: ArrayLike | SpBase, + labels: pd.Series | ArrayLike, + mode: Literal["UNDIRECTED", "DIRECTED"] = "UNDIRECTED", +) -> float: + # accepting both dense or spare matrices as the connectivity graph + # setting mode between directed and undirected """Compute the modularity of a graph given its connectivities and labels. Parameters @@ -115,22 +123,40 @@ def modularity(connectivities, labels, mode="UNDIRECTED") -> float: float The modularity of the graph based on the provided clustering. 
""" - if isinstance(connectivities, CSRBase): - # Convert sparse matrix to dense format so that igraph can handle it - # Weighted_Adjacency expects with nested lists or numpy arrays and not sparse matrices - dense_connectivities = connectivities.toarray() + try: + # try to import igraph in case the user wants to calculate modularity + # not in the main module to avoid import errors + import igraph as ig + except ImportError as e: + msg = "igraph is require for computing modularity" + raise ImportError(msg) from e + if isinstance(connectivities, CSRBase | SpBase): + # check if the connectivities is a sparse matrix + coo = coo_matrix(connectivities) + edges = list(zip(coo.row, coo.col, strict=False)) + # converting to the coo format to extract the edges and weights + # storing only non-zero elements and their indices + weights = coo.data.tolist() + graph = ig.Graph(edges=edges, directed=mode == "DIRECTED") + graph.es["weight"] = weights else: - dense_connectivities = connectivities - # creating igraph graph from the dense connectivity matrix - graph = ig.Graph.Weighted_Adjacency(dense_connectivities.tolist(), mode=mode) - if isinstance(labels, pd.Series): - labels = labels.values - # making sure labels are in the right format, i.e., a list of integers + # if the graph is dense, creates it directly using igraph's adjacency matrix + dense_array = np.asarray(connectivities) + igraph_mode = ig.ADJ_UNDIRECTED if mode == "UNDIRECTED" else ig.ADJ_DIRECTED + graph = ig.Graph.Weighted_Adjacency(dense_array.tolist(), mode=igraph_mode) + # cluster labels to integer codes required by igraph labels = pd.Categorical(np.asarray(labels)).codes + return graph.modularity(labels) -def modularity_adata(adata: AnnData, labels="leiden", obsp="connectivities") -> float: +def modularity_adata( + adata: AnnData, + labels: str | ArrayLike = "leiden", + obsp: str = "connectivities", + mode: Literal["UNDIRECTED", "DIRECTED"] = "UNDIRECTED", +) -> float: + # default to leiden labels and connectivities as it is more common """Compute modularity from an AnnData object using stored graph and clustering labels. Parameters @@ -147,9 +173,19 @@ def modularity_adata(adata: AnnData, labels="leiden", obsp="connectivities") -> float The modularity of the graph based on the provided clustering. 
""" - # if user passes leiden or louvain as a string, this will get the actual labels - # from adata.obs + # if labels is a key in adata.obs, get the values from adata.obs + # otherwise, assume it is an array-like object label_array = adata.obs[labels] if isinstance(labels, str) else labels - # extracting the connectivities from adata.obsp["connectivities"] connectivities = adata.obsp[obsp] - return modularity(connectivities, label_array) + + if isinstance(connectivities, CSRBase): + # converting to dense if it is a CSR matrix + dense = connectivities.toarray() + else: + toarray = getattr(connectivities, "toarray", None) + dense = toarray() if callable(toarray) else connectivities + + if isinstance(dense, pd.DataFrame): + dense = dense.values + dense = cast("ArrayLike", dense) + return modularity(dense, label_array, mode=mode) diff --git a/tests/test_metrics.py b/tests/test_metrics.py index 414c9b479c..edf5e2f95a 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -10,8 +10,10 @@ import pytest import threadpoolctl from scipy import sparse +from scipy.sparse import csr_matrix import scanpy as sc +from scanpy.metrics import modularity, modularity_adata from testing.scanpy._helpers.data import pbmc68k_reduced from testing.scanpy._pytest.params import ARRAY_TYPES @@ -196,3 +198,140 @@ def test_confusion_matrix_api(): pd.testing.assert_frame_equal( expected, sc.metrics.confusion_matrix(data["a"], "b", data) ) + + +# importing igraph, louvain, leiden if available +try: + import igraph as ig + + HAS_IGRAPH = True +except ImportError: + HAS_IGRAPH = False + +try: + import louvain + + HAS_LOUVAIN = True +except ImportError: + HAS_LOUVAIN = False + +try: + import leidenalg + + HAS_LEIDEN = True +except ImportError: + HAS_LEIDEN = False + + +# Test 1: Sample graph with clear community structure (dense & sparse, directed & undirected) +@pytest.mark.parametrize( + "mode", ["UNDIRECTED", "DIRECTED"], ids=["undirected", "directed"] +) +@pytest.mark.parametrize("use_sparse", [False, True], ids=["sparse", "dense"]) +def test_modularity_sample_structure(mode, use_sparse): + if HAS_IGRAPH is False: + pytest.skip("igraph is not installed") + # 4 node adjacency matrix with two separate 2-node communities + mat = np.array( + [ + [1, 1, 0, 0], + [1, 1, 0, 0], + [0, 0, 1, 1], + [0, 0, 1, 1], + ] + ) + labels = ["A", "A", "B", "B"] + adj = csr_matrix(mat) if use_sparse else mat + score = modularity(adj, labels, mode=mode) + + # Modularity should be between 0 and 1 + assert 0 <= score <= 1 + + +# Test 2: Edge case when all nodes belong to the same community/cluster +def test_modularity_single_community(): + if HAS_IGRAPH is False: + pytest.skip("igraph is not installed") + # fully connected graph sample + adj = np.ones((4, 4)) - np.eye(4) + labels = ["A", "A", "A", "A"] + score = modularity(adj, labels) + + # modularity should be 0 + assert score == pytest.approx(0.0, rel=1e-6) + + +# Test 3: Invalad input, labels length does not match adjacency matrix size +def test_modularity_invalid_labels(): + if HAS_IGRAPH is False: + pytest.skip("igraph is not installed") + from igraph._igraph import InternalError + + adj = np.eye(4) + labels = ["A", "A", "B"] + with pytest.raises( + InternalError, + match="Membership vector size differs", + ): + modularity(adj, labels) + + +# Test 4: Pass both Louvain and Leiden clustering algorithms +@pytest.mark.parametrize("cluster_method", ["louvain", "leiden"]) +def test_modularity_adata_multiple_clusterings(cluster_method): + if HAS_IGRAPH is False: + pytest.skip("igraph 
is not installed") + if cluster_method == "louvain" and HAS_LOUVAIN is False: + pytest.skip("louvain is not installed") + if cluster_method == "leiden" and HAS_LEIDEN is False: + pytest.skip("leiden is not installed") + # Loading PBMC Data and compute PCA and neighbors graph + adata = sc.datasets.pbmc3k() + sc.pp.pca(adata) + sc.pp.neighbors(adata) + # Compute modularity using both Louvain and Leiden clustering + if cluster_method == "louvain": + sc.tl.louvain(adata) + score_louvain = modularity_adata( + adata, labels="louvain", obsp="connectivities", mode="UNDIRECTED" + ) + # Score should be between 0 and 1 + assert 0 <= score_louvain <= 1 + if cluster_method == "leiden": + sc.tl.leiden(adata) + score_leiden = modularity_adata( + adata, labels="leiden", obsp="connectivities", mode="UNDIRECTED" + ) + # Score should be between 0 and 1 + assert 0 <= score_leiden <= 1 + + +# Test 5: Modularity should be the same no matter the order of the labels +def test_modularity_order(): + if HAS_IGRAPH is False: + pytest.skip("igraph is not installed") + adj = np.array( + [ + [1, 1, 0, 0], + [1, 1, 0, 0], + [0, 0, 1, 1], + [0, 0, 1, 1], + ] + ) + labels1 = ["A", "A", "B", "B"] + labels2 = ["B", "B", "A", "A"] + score_1 = modularity(adj, labels1) + score_2 = modularity(adj, labels2) + assert score_1 == score_2 + + +# Test 6: Modularity on disconnected graph lke edge-case behavior in some algorithms +def test_modularity_disconnected_graph(): + if HAS_IGRAPH is False: + pytest.skip("igraph is not installed") + adj = np.zeros((4, 4)) + labels = ["A", "B", "C", "D"] + score = modularity(adj, labels) + + # Modularity should be undefined for disconnected graphs + assert np.isnan(score) From 948319a44d23a1550a5a32eaea99c9bcf99180d3 Mon Sep 17 00:00:00 2001 From: amalia-k510 Date: Wed, 7 May 2025 14:47:16 +0200 Subject: [PATCH 04/29] error fixing from pipelines --- tests/test_metrics.py | 24 ++++-------------------- 1 file changed, 4 insertions(+), 20 deletions(-) diff --git a/tests/test_metrics.py b/tests/test_metrics.py index edf5e2f95a..f69ad0a803 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -1,5 +1,6 @@ from __future__ import annotations +import importlib.util import warnings from functools import partial from string import ascii_letters @@ -201,26 +202,9 @@ def test_confusion_matrix_api(): # importing igraph, louvain, leiden if available -try: - import igraph as ig - - HAS_IGRAPH = True -except ImportError: - HAS_IGRAPH = False - -try: - import louvain - - HAS_LOUVAIN = True -except ImportError: - HAS_LOUVAIN = False - -try: - import leidenalg - - HAS_LEIDEN = True -except ImportError: - HAS_LEIDEN = False +HAS_IGRAPH = importlib.util.find_spec("igraph") is not None +HAS_LOUVAIN = importlib.util.find_spec("louvain") is not None +HAS_LEIDEN = importlib.util.find_spec("leidenalg") is not None # Test 1: Sample graph with clear community structure (dense & sparse, directed & undirected) From 6a64330ec7f8c773566bf74c49907c8c54248ea7 Mon Sep 17 00:00:00 2001 From: amalia-k510 Date: Wed, 7 May 2025 14:55:09 +0200 Subject: [PATCH 05/29] ruff error fix --- tests/test_metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_metrics.py b/tests/test_metrics.py index f69ad0a803..80002a8c30 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -11,7 +11,7 @@ import pytest import threadpoolctl from scipy import sparse -from scipy.sparse import csr_matrix +from scipy.sparse import csr_matrix # noqa: TID251 import scanpy as sc from scanpy.metrics import 
modularity, modularity_adata From 793351f00707ed1ea3544c268decf88d8bbc901a Mon Sep 17 00:00:00 2001 From: amalia-k510 Date: Wed, 7 May 2025 15:21:56 +0200 Subject: [PATCH 06/29] keywords variable fix --- src/scanpy/metrics/_metrics.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/scanpy/metrics/_metrics.py b/src/scanpy/metrics/_metrics.py index f44fe403c5..e2709cefe0 100644 --- a/src/scanpy/metrics/_metrics.py +++ b/src/scanpy/metrics/_metrics.py @@ -152,6 +152,7 @@ def modularity( def modularity_adata( adata: AnnData, + *, labels: str | ArrayLike = "leiden", obsp: str = "connectivities", mode: Literal["UNDIRECTED", "DIRECTED"] = "UNDIRECTED", From 92d8e268058a48cef7177759bc196aa632207768 Mon Sep 17 00:00:00 2001 From: amalia-k510 Date: Wed, 7 May 2025 17:20:13 +0200 Subject: [PATCH 07/29] neighbors from a precomputed distance matrix, still need to make sure on how to integrate the two --- src/scanpy/neighbors/__init__.py | 52 ++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/src/scanpy/neighbors/__init__.py b/src/scanpy/neighbors/__init__.py index 4a42111b74..a5fa4566ff 100644 --- a/src/scanpy/neighbors/__init__.py +++ b/src/scanpy/neighbors/__init__.py @@ -21,9 +21,11 @@ from .._utils import NeighborsView, _doc_params, get_literal_vals from . import _connectivity from ._common import ( + _get_indices_distances_from_dense_matrix, _get_indices_distances_from_sparse_matrix, _get_sparse_matrix_from_indices_distances, ) +from ._connectivity import umap from ._doc import doc_n_pcs, doc_use_rep from ._types import _KnownTransformer, _Method @@ -248,6 +250,56 @@ def neighbors( # noqa: PLR0913 return adata if copy else None +def neighbors_from_distance( + adata: AnnData, + distance_matrix: np.ndarray, + n_neighbors: int = 15, + method: Literal["umap"] = "umap", +) -> None: + """Compute neighbors from a precomputer distance matrix. + + Parameters + ---------- + adata : AnnData + Annotated data matrix. + distance_matrix : np.ndarray + Precomputed dense or sparse distance matrix. + n_neighbors : int + Number of nearest neighbors to use in the graph. + method : str + Method to use for computing the graph. Currently only 'umap' is supported. + """ + if isinstance(distance_matrix, SpBase): + distance_matrix = np.asarray(sparse.csr_matrix(distance_matrix).toarray()) # noqa: TID251 + + knn_indices, knn_distances = _get_indices_distances_from_dense_matrix( + distance_matrix, n_neighbors + ) + if method == "umap": + connectivities = umap( + knn_indices, + knn_distances, + n_obs=adata.n_obs, + n_neighbors=n_neighbors, + ) + else: + msg = f"Method {method} not implemented." 
+ raise NotImplementedError(msg) + + adata.obsp["connectivities"] = connectivities + adata.obsp["distances"] = sparse.csr_matrix(distance_matrix) # noqa: TID251 + adata.uns["neighbors"] = { + "connectivities_key": "connectivities", + "distances_key": "distances", + "params": { + "n_neighbors": n_neighbors, + "method": method, + "random_state": 0, + "metric": "euclidean", + }, + } + + class FlatTree(NamedTuple): # noqa: D101 hyperplanes: None offsets: None From 198c4fb9149da308d347adc309921fd3370837be Mon Sep 17 00:00:00 2001 From: amalia-k510 Date: Wed, 7 May 2025 17:44:15 +0200 Subject: [PATCH 08/29] revert back --- src/scanpy/neighbors/__init__.py | 113 ++++++++++++++++++------------- 1 file changed, 65 insertions(+), 48 deletions(-) diff --git a/src/scanpy/neighbors/__init__.py b/src/scanpy/neighbors/__init__.py index a5fa4566ff..5584eedc94 100644 --- a/src/scanpy/neighbors/__init__.py +++ b/src/scanpy/neighbors/__init__.py @@ -76,6 +76,7 @@ def neighbors( # noqa: PLR0913 n_neighbors: int = 15, n_pcs: int | None = None, *, + distance_matrix: np.ndarray | None = None, use_rep: str | None = None, knn: bool = True, method: _Method = "umap", @@ -188,6 +189,13 @@ def neighbors( # noqa: PLR0913 :doc:`/how-to/knn-transformers` """ + # if distance_matrix is not None: + # return neighbors_from_distance( + # adata, + # distance_matrix, + # n_neighbors=n_neighbors, + # method=method, + # ) start = logg.info("computing neighbors") adata = adata.copy() if copy else adata if adata.is_view: # we shouldn't need this here... @@ -250,54 +258,63 @@ def neighbors( # noqa: PLR0913 return adata if copy else None -def neighbors_from_distance( - adata: AnnData, - distance_matrix: np.ndarray, - n_neighbors: int = 15, - method: Literal["umap"] = "umap", -) -> None: - """Compute neighbors from a precomputer distance matrix. - - Parameters - ---------- - adata : AnnData - Annotated data matrix. - distance_matrix : np.ndarray - Precomputed dense or sparse distance matrix. - n_neighbors : int - Number of nearest neighbors to use in the graph. - method : str - Method to use for computing the graph. Currently only 'umap' is supported. - """ - if isinstance(distance_matrix, SpBase): - distance_matrix = np.asarray(sparse.csr_matrix(distance_matrix).toarray()) # noqa: TID251 - - knn_indices, knn_distances = _get_indices_distances_from_dense_matrix( - distance_matrix, n_neighbors - ) - if method == "umap": - connectivities = umap( - knn_indices, - knn_distances, - n_obs=adata.n_obs, - n_neighbors=n_neighbors, - ) - else: - msg = f"Method {method} not implemented." - raise NotImplementedError(msg) - - adata.obsp["connectivities"] = connectivities - adata.obsp["distances"] = sparse.csr_matrix(distance_matrix) # noqa: TID251 - adata.uns["neighbors"] = { - "connectivities_key": "connectivities", - "distances_key": "distances", - "params": { - "n_neighbors": n_neighbors, - "method": method, - "random_state": 0, - "metric": "euclidean", - }, - } +# def neighbors_from_distance( +# adata: AnnData, +# distance_matrix: np.ndarray, +# n_neighbors: int = 15, +# method: Literal["umap", "gauss"] = "umap", # default to umap +# ) -> None: +# # computes the neighborhood graph from a precomputed distance matrix +# # only umap is supported +# # skipping PCA and distance computation and goes straight to the graph +# """Compute neighbors from a precomputer distance matrix. + +# Parameters +# ---------- +# adata : AnnData +# Annotated data matrix. +# distance_matrix : np.ndarray +# Precomputed dense or sparse distance matrix. 
+# n_neighbors : int +# Number of nearest neighbors to use in the graph. +# method : str +# Method to use for computing the graph. Currently only 'umap' is supported. +# """ +# if isinstance(distance_matrix, SpBase): +# distance_matrix = np.asarray(sparse.csr_matrix(distance_matrix).toarray()) + +# knn_indices, knn_distances = _get_indices_distances_from_dense_matrix( +# distance_matrix, n_neighbors +# ) +# if method == "umap": +# connectivities = umap( +# knn_indices, +# knn_distances, +# n_obs=adata.n_obs, +# n_neighbors=n_neighbors, +# ) +# elif method == "gauss": +# connectivities = _connectivity.gauss( +# sparse.csr_matrix(distance_matrix), +# n_neighbors, +# knn=True, +# ) +# else: +# msg = f"Method {method} not implemented." +# raise NotImplementedError(msg) + +# adata.obsp["connectivities"] = connectivities +# adata.obsp["distances"] = sparse.csr_matrix(distance_matrix) +# adata.uns["neighbors"] = { +# "connectivities_key": "connectivities", +# "distances_key": "distances", +# "params": { +# "n_neighbors": n_neighbors, +# "method": method, +# "random_state": 0, +# "metric": "euclidean", +# }, +# } class FlatTree(NamedTuple): # noqa: D101 From e7fb67ab54738ef1c8e9224cd326b3508a67c432 Mon Sep 17 00:00:00 2001 From: amalia-k510 Date: Wed, 7 May 2025 18:11:35 +0200 Subject: [PATCH 09/29] code only for the prexisting distance matrix --- src/scanpy/neighbors/__init__.py | 128 +++++++++++++++---------------- 1 file changed, 64 insertions(+), 64 deletions(-) diff --git a/src/scanpy/neighbors/__init__.py b/src/scanpy/neighbors/__init__.py index 5584eedc94..f8df266670 100644 --- a/src/scanpy/neighbors/__init__.py +++ b/src/scanpy/neighbors/__init__.py @@ -189,13 +189,13 @@ def neighbors( # noqa: PLR0913 :doc:`/how-to/knn-transformers` """ - # if distance_matrix is not None: - # return neighbors_from_distance( - # adata, - # distance_matrix, - # n_neighbors=n_neighbors, - # method=method, - # ) + if distance_matrix is not None: + return neighbors_from_distance( + adata, + distance_matrix, + n_neighbors=n_neighbors, + method=method, + ) start = logg.info("computing neighbors") adata = adata.copy() if copy else adata if adata.is_view: # we shouldn't need this here... @@ -258,63 +258,63 @@ def neighbors( # noqa: PLR0913 return adata if copy else None -# def neighbors_from_distance( -# adata: AnnData, -# distance_matrix: np.ndarray, -# n_neighbors: int = 15, -# method: Literal["umap", "gauss"] = "umap", # default to umap -# ) -> None: -# # computes the neighborhood graph from a precomputed distance matrix -# # only umap is supported -# # skipping PCA and distance computation and goes straight to the graph -# """Compute neighbors from a precomputer distance matrix. - -# Parameters -# ---------- -# adata : AnnData -# Annotated data matrix. -# distance_matrix : np.ndarray -# Precomputed dense or sparse distance matrix. -# n_neighbors : int -# Number of nearest neighbors to use in the graph. -# method : str -# Method to use for computing the graph. Currently only 'umap' is supported. 
-# """ -# if isinstance(distance_matrix, SpBase): -# distance_matrix = np.asarray(sparse.csr_matrix(distance_matrix).toarray()) - -# knn_indices, knn_distances = _get_indices_distances_from_dense_matrix( -# distance_matrix, n_neighbors -# ) -# if method == "umap": -# connectivities = umap( -# knn_indices, -# knn_distances, -# n_obs=adata.n_obs, -# n_neighbors=n_neighbors, -# ) -# elif method == "gauss": -# connectivities = _connectivity.gauss( -# sparse.csr_matrix(distance_matrix), -# n_neighbors, -# knn=True, -# ) -# else: -# msg = f"Method {method} not implemented." -# raise NotImplementedError(msg) - -# adata.obsp["connectivities"] = connectivities -# adata.obsp["distances"] = sparse.csr_matrix(distance_matrix) -# adata.uns["neighbors"] = { -# "connectivities_key": "connectivities", -# "distances_key": "distances", -# "params": { -# "n_neighbors": n_neighbors, -# "method": method, -# "random_state": 0, -# "metric": "euclidean", -# }, -# } +def neighbors_from_distance( + adata: AnnData, + distance_matrix: np.ndarray, + n_neighbors: int = 15, + method: Literal["umap", "gauss"] = "umap", # default to umap +) -> None: + # computes the neighborhood graph from a precomputed distance matrix + # only umap is supported + # skipping PCA and distance computation and goes straight to the graph + """Compute neighbors from a precomputer distance matrix. + + Parameters + ---------- + adata : AnnData + Annotated data matrix. + distance_matrix : np.ndarray + Precomputed dense or sparse distance matrix. + n_neighbors : int + Number of nearest neighbors to use in the graph. + method : str + Method to use for computing the graph. Currently only 'umap' is supported. + """ + if isinstance(distance_matrix, SpBase): + distance_matrix = np.asarray(sparse.csr_matrix(distance_matrix).toarray()) # noqa: TID251 + + knn_indices, knn_distances = _get_indices_distances_from_dense_matrix( + distance_matrix, n_neighbors + ) + if method == "umap": + connectivities = umap( + knn_indices, + knn_distances, + n_obs=adata.n_obs, + n_neighbors=n_neighbors, + ) + elif method == "gauss": + connectivities = _connectivity.gauss( + sparse.csr_matrix(distance_matrix), # noqa: TID251 + n_neighbors, + knn=True, + ) + else: + msg = f"Method {method} not implemented." 
+ raise NotImplementedError(msg) + + adata.obsp["connectivities"] = connectivities + adata.obsp["distances"] = sparse.csr_matrix(distance_matrix) # noqa: TID251 + adata.uns["neighbors"] = { + "connectivities_key": "connectivities", + "distances_key": "distances", + "params": { + "n_neighbors": n_neighbors, + "method": method, + "random_state": 0, + "metric": "euclidean", + }, + } class FlatTree(NamedTuple): # noqa: D101 From 14cb44159ae3547d3377044fe7ca44548dedfde8 Mon Sep 17 00:00:00 2001 From: amalia-k510 Date: Thu, 8 May 2025 17:50:47 +0200 Subject: [PATCH 10/29] initial changes for the neighborhors --- src/scanpy/neighbors/__init__.py | 69 +++++++++++++++++++++++++------- 1 file changed, 54 insertions(+), 15 deletions(-) diff --git a/src/scanpy/neighbors/__init__.py b/src/scanpy/neighbors/__init__.py index f8df266670..d12300e31c 100644 --- a/src/scanpy/neighbors/__init__.py +++ b/src/scanpy/neighbors/__init__.py @@ -190,6 +190,8 @@ def neighbors( # noqa: PLR0913 """ if distance_matrix is not None: + # Added this to support the new distance matrix function + # if a precomputed distance matrix is provided, skip the PCA and distance computation return neighbors_from_distance( adata, distance_matrix, @@ -260,33 +262,60 @@ def neighbors( # noqa: PLR0913 def neighbors_from_distance( adata: AnnData, - distance_matrix: np.ndarray, + distance_matrix: np.ndarray | SpBase, n_neighbors: int = 15, method: Literal["umap", "gauss"] = "umap", # default to umap -) -> None: + key_added: str | None = None, +) -> AnnData: # computes the neighborhood graph from a precomputed distance matrix - # only umap is supported + # both umap an gauss are supported, default is umap # skipping PCA and distance computation and goes straight to the graph + # key_added is the key under which to store the results in adata.uns or adata.obsp """Compute neighbors from a precomputer distance matrix. Parameters ---------- - adata : AnnData + adata Annotated data matrix. - distance_matrix : np.ndarray + distance_matrix Precomputed dense or sparse distance matrix. - n_neighbors : int + n_neighbors Number of nearest neighbors to use in the graph. - method : str + method Method to use for computing the graph. Currently only 'umap' is supported. + key_added + Optional key under which to store the results. Default is 'neighbors'. + + Returns + ------- + adata + Annotated data with computed distances and connectivities. 
""" if isinstance(distance_matrix, SpBase): - distance_matrix = np.asarray(sparse.csr_matrix(distance_matrix).toarray()) # noqa: TID251 + # spare matrices can save memory for large datasets + # csr_matrix is the most efficient format for sparse matrices + # setting the diagonal to 0 is important = distance to self must not affect umap or gauss + # elimimate zeros is important to save memory, avoids storing explicit zeros + distance_matrix = sparse.csr_matrix(distance_matrix) # noqa: TID251 + distance_matrix.setdiag(0) + distance_matrix.eliminate_zeros() + # extracting for each observation the indices and distances of the n_neighbors + # being then used by umap or gauss + knn_indices, knn_distances = _get_indices_distances_from_sparse_matrix( + distance_matrix, n_neighbors + ) + else: + # if it is dense, converting it to ndarray + # and setting the diagonal to 0 + # extracting knn indices and distances + distance_matrix = np.asarray(distance_matrix) + np.fill_diagonal(distance_matrix, 0) + knn_indices, knn_distances = _get_indices_distances_from_dense_matrix( + distance_matrix, n_neighbors + ) - knn_indices, knn_distances = _get_indices_distances_from_dense_matrix( - distance_matrix, n_neighbors - ) if method == "umap": + # using umap to build connectivities from distances connectivities = umap( knn_indices, knn_distances, @@ -294,6 +323,8 @@ def neighbors_from_distance( n_neighbors=n_neighbors, ) elif method == "gauss": + # using gauss to build connectivities from distances + # requires sparse matrix for efficiency connectivities = _connectivity.gauss( sparse.csr_matrix(distance_matrix), # noqa: TID251 n_neighbors, @@ -302,10 +333,17 @@ def neighbors_from_distance( else: msg = f"Method {method} not implemented." raise NotImplementedError(msg) - - adata.obsp["connectivities"] = connectivities - adata.obsp["distances"] = sparse.csr_matrix(distance_matrix) # noqa: TID251 - adata.uns["neighbors"] = { + # defining where to store graph info + key = "neighbors" if key_added is None else key_added + dists_key = "distances" if key_added is None else key_added + "_distances" + conns_key = "connectivities" if key_added is None else key_added + "_connectivities" + # storing the actual distance and connectivitiy matrices as obsp + adata.uns[dists_key] = sparse.csr_matrix(distance_matrix) # noqa: TID251 + adata.obsp[conns_key] = connectivities + # populating with metadata describing how neighbors were computed + # I think might be important as many functions downstream rely + # on .uns['neighbors'] to find correct .obsp key + adata.uns[key] = { "connectivities_key": "connectivities", "distances_key": "distances", "params": { @@ -315,6 +353,7 @@ def neighbors_from_distance( "metric": "euclidean", }, } + return adata class FlatTree(NamedTuple): # noqa: D101 From 0ce8c157e5a5c0cc9a3982dd731458666111e307 Mon Sep 17 00:00:00 2001 From: amalia-k510 Date: Mon, 12 May 2025 12:18:59 +0200 Subject: [PATCH 11/29] distances name switch and sparse array allowed --- src/scanpy/metrics/__init__.py | 4 +- src/scanpy/metrics/_metrics.py | 101 +------------------------ src/scanpy/neighbors/__init__.py | 32 ++++---- tests/test_metrics.py | 123 ------------------------------- 4 files changed, 20 insertions(+), 240 deletions(-) diff --git a/src/scanpy/metrics/__init__.py b/src/scanpy/metrics/__init__.py index 5ec2229909..417b070df5 100644 --- a/src/scanpy/metrics/__init__.py +++ b/src/scanpy/metrics/__init__.py @@ -3,7 +3,7 @@ from __future__ import annotations from ._gearys_c import gearys_c -from ._metrics import 
confusion_matrix, modularity, modularity_adata +from ._metrics import confusion_matrix from ._morans_i import morans_i -__all__ = ["confusion_matrix", "gearys_c", "modularity", "modularity_adata", "morans_i"] +__all__ = ["confusion_matrix", "gearys_c", "modularity"] diff --git a/src/scanpy/metrics/_metrics.py b/src/scanpy/metrics/_metrics.py index e2709cefe0..dbe4406533 100644 --- a/src/scanpy/metrics/_metrics.py +++ b/src/scanpy/metrics/_metrics.py @@ -2,22 +2,15 @@ from __future__ import annotations -from typing import TYPE_CHECKING, cast +from typing import TYPE_CHECKING import numpy as np import pandas as pd from natsort import natsorted from pandas.api.types import CategoricalDtype -from scipy.sparse import coo_matrix - -from .._compat import CSRBase, SpBase if TYPE_CHECKING: from collections.abc import Sequence - from typing import Literal - - from anndata import AnnData - from numpy.typing import ArrayLike def confusion_matrix( @@ -98,95 +91,3 @@ def confusion_matrix( df = df.loc[np.array(orig_idx), np.array(new_idx)] return df - - -def modularity( - connectivities: ArrayLike | SpBase, - labels: pd.Series | ArrayLike, - mode: Literal["UNDIRECTED", "DIRECTED"] = "UNDIRECTED", -) -> float: - # accepting both dense or spare matrices as the connectivity graph - # setting mode between directed and undirected - """Compute the modularity of a graph given its connectivities and labels. - - Parameters - ---------- - connectivities: array-like or sparse matrix - Weighted adjacency matrix representing the graph. Can be a dense NumPy array or a sparse CSR matrix. - labels: array-like or pandas.Series - Cluster labels for each node in the graph. - mode: str - The mode of the graph. Can be "UNDIRECTED" or "DIRECTED". Default is "UNDIRECTED". - - Returns - ------- - float - The modularity of the graph based on the provided clustering. - """ - try: - # try to import igraph in case the user wants to calculate modularity - # not in the main module to avoid import errors - import igraph as ig - except ImportError as e: - msg = "igraph is require for computing modularity" - raise ImportError(msg) from e - if isinstance(connectivities, CSRBase | SpBase): - # check if the connectivities is a sparse matrix - coo = coo_matrix(connectivities) - edges = list(zip(coo.row, coo.col, strict=False)) - # converting to the coo format to extract the edges and weights - # storing only non-zero elements and their indices - weights = coo.data.tolist() - graph = ig.Graph(edges=edges, directed=mode == "DIRECTED") - graph.es["weight"] = weights - else: - # if the graph is dense, creates it directly using igraph's adjacency matrix - dense_array = np.asarray(connectivities) - igraph_mode = ig.ADJ_UNDIRECTED if mode == "UNDIRECTED" else ig.ADJ_DIRECTED - graph = ig.Graph.Weighted_Adjacency(dense_array.tolist(), mode=igraph_mode) - # cluster labels to integer codes required by igraph - labels = pd.Categorical(np.asarray(labels)).codes - - return graph.modularity(labels) - - -def modularity_adata( - adata: AnnData, - *, - labels: str | ArrayLike = "leiden", - obsp: str = "connectivities", - mode: Literal["UNDIRECTED", "DIRECTED"] = "UNDIRECTED", -) -> float: - # default to leiden labels and connectivities as it is more common - """Compute modularity from an AnnData object using stored graph and clustering labels. - - Parameters - ---------- - adata: AnnData - The AnnData object containing the data. - labels: str or array-like - The key in adata.obs that contains the cluster labels. 
- obsp: str - The key in adata.obsp that contains the connectivities. - - Returns - ------- - float - The modularity of the graph based on the provided clustering. - """ - # if labels is a key in adata.obs, get the values from adata.obs - # otherwise, assume it is an array-like object - label_array = adata.obs[labels] if isinstance(labels, str) else labels - connectivities = adata.obsp[obsp] - - if isinstance(connectivities, CSRBase): - # converting to dense if it is a CSR matrix - dense = connectivities.toarray() - else: - toarray = getattr(connectivities, "toarray", None) - dense = toarray() if callable(toarray) else connectivities - - if isinstance(dense, pd.DataFrame): - dense = dense.values - dense = cast("ArrayLike", dense) - return modularity(dense, label_array, mode=mode) diff --git a/src/scanpy/neighbors/__init__.py b/src/scanpy/neighbors/__init__.py index d12300e31c..9df1149b55 100644 --- a/src/scanpy/neighbors/__init__.py +++ b/src/scanpy/neighbors/__init__.py @@ -76,7 +76,7 @@ def neighbors( # noqa: PLR0913 n_neighbors: int = 15, n_pcs: int | None = None, *, - distance_matrix: np.ndarray | None = None, + distances: np.ndarray | SpBase | None = None, use_rep: str | None = None, knn: bool = True, method: _Method = "umap", @@ -189,12 +189,12 @@ def neighbors( # noqa: PLR0913 :doc:`/how-to/knn-transformers` """ - if distance_matrix is not None: + if distances is not None: # Added this to support the new distance matrix function # if a precomputed distance matrix is provided, skip the PCA and distance computation return neighbors_from_distance( adata, - distance_matrix, + distances, n_neighbors=n_neighbors, method=method, ) @@ -262,11 +262,13 @@ def neighbors( # noqa: PLR0913 def neighbors_from_distance( adata: AnnData, - distance_matrix: np.ndarray | SpBase, + distances: np.ndarray | SpBase, n_neighbors: int = 15, method: Literal["umap", "gauss"] = "umap", # default to umap key_added: str | None = None, ) -> AnnData: + ### inconsistent neighbors = bkk and n throw some stuff = bad way of writing the graph + ### adjust for this = knn = True # computes the neighborhood graph from a precomputed distance matrix # both umap an gauss are supported, default is umap # skipping PCA and distance computation and goes straight to the graph @@ -277,7 +279,7 @@ def neighbors_from_distance( ---------- adata Annotated data matrix. - distance_matrix + distances Precomputed dense or sparse distance matrix. n_neighbors Number of nearest neighbors to use in the graph. @@ -291,27 +293,27 @@ def neighbors_from_distance( adata Annotated data with computed distances and connectivities. 
""" - if isinstance(distance_matrix, SpBase): + if isinstance(distances, SpBase): # spare matrices can save memory for large datasets # csr_matrix is the most efficient format for sparse matrices # setting the diagonal to 0 is important = distance to self must not affect umap or gauss # elimimate zeros is important to save memory, avoids storing explicit zeros - distance_matrix = sparse.csr_matrix(distance_matrix) # noqa: TID251 - distance_matrix.setdiag(0) - distance_matrix.eliminate_zeros() + distances = sparse.csr_matrix(distances) # noqa: TID251 + distances.setdiag(0) + distances.eliminate_zeros() # extracting for each observation the indices and distances of the n_neighbors # being then used by umap or gauss knn_indices, knn_distances = _get_indices_distances_from_sparse_matrix( - distance_matrix, n_neighbors + distances, n_neighbors ) else: # if it is dense, converting it to ndarray # and setting the diagonal to 0 # extracting knn indices and distances - distance_matrix = np.asarray(distance_matrix) - np.fill_diagonal(distance_matrix, 0) + distances = np.asarray(distances) + np.fill_diagonal(distances, 0) knn_indices, knn_distances = _get_indices_distances_from_dense_matrix( - distance_matrix, n_neighbors + distances, n_neighbors ) if method == "umap": @@ -326,7 +328,7 @@ def neighbors_from_distance( # using gauss to build connectivities from distances # requires sparse matrix for efficiency connectivities = _connectivity.gauss( - sparse.csr_matrix(distance_matrix), # noqa: TID251 + sparse.csr_matrix(distances), # noqa: TID251 n_neighbors, knn=True, ) @@ -338,7 +340,7 @@ def neighbors_from_distance( dists_key = "distances" if key_added is None else key_added + "_distances" conns_key = "connectivities" if key_added is None else key_added + "_connectivities" # storing the actual distance and connectivitiy matrices as obsp - adata.uns[dists_key] = sparse.csr_matrix(distance_matrix) # noqa: TID251 + adata.uns[dists_key] = sparse.csr_matrix(distances) # noqa: TID251 adata.obsp[conns_key] = connectivities # populating with metadata describing how neighbors were computed # I think might be important as many functions downstream rely diff --git a/tests/test_metrics.py b/tests/test_metrics.py index 80002a8c30..414c9b479c 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -1,6 +1,5 @@ from __future__ import annotations -import importlib.util import warnings from functools import partial from string import ascii_letters @@ -11,10 +10,8 @@ import pytest import threadpoolctl from scipy import sparse -from scipy.sparse import csr_matrix # noqa: TID251 import scanpy as sc -from scanpy.metrics import modularity, modularity_adata from testing.scanpy._helpers.data import pbmc68k_reduced from testing.scanpy._pytest.params import ARRAY_TYPES @@ -199,123 +196,3 @@ def test_confusion_matrix_api(): pd.testing.assert_frame_equal( expected, sc.metrics.confusion_matrix(data["a"], "b", data) ) - - -# importing igraph, louvain, leiden if available -HAS_IGRAPH = importlib.util.find_spec("igraph") is not None -HAS_LOUVAIN = importlib.util.find_spec("louvain") is not None -HAS_LEIDEN = importlib.util.find_spec("leidenalg") is not None - - -# Test 1: Sample graph with clear community structure (dense & sparse, directed & undirected) -@pytest.mark.parametrize( - "mode", ["UNDIRECTED", "DIRECTED"], ids=["undirected", "directed"] -) -@pytest.mark.parametrize("use_sparse", [False, True], ids=["sparse", "dense"]) -def test_modularity_sample_structure(mode, use_sparse): - if HAS_IGRAPH is False: - 
pytest.skip("igraph is not installed") - # 4 node adjacency matrix with two separate 2-node communities - mat = np.array( - [ - [1, 1, 0, 0], - [1, 1, 0, 0], - [0, 0, 1, 1], - [0, 0, 1, 1], - ] - ) - labels = ["A", "A", "B", "B"] - adj = csr_matrix(mat) if use_sparse else mat - score = modularity(adj, labels, mode=mode) - - # Modularity should be between 0 and 1 - assert 0 <= score <= 1 - - -# Test 2: Edge case when all nodes belong to the same community/cluster -def test_modularity_single_community(): - if HAS_IGRAPH is False: - pytest.skip("igraph is not installed") - # fully connected graph sample - adj = np.ones((4, 4)) - np.eye(4) - labels = ["A", "A", "A", "A"] - score = modularity(adj, labels) - - # modularity should be 0 - assert score == pytest.approx(0.0, rel=1e-6) - - -# Test 3: Invalad input, labels length does not match adjacency matrix size -def test_modularity_invalid_labels(): - if HAS_IGRAPH is False: - pytest.skip("igraph is not installed") - from igraph._igraph import InternalError - - adj = np.eye(4) - labels = ["A", "A", "B"] - with pytest.raises( - InternalError, - match="Membership vector size differs", - ): - modularity(adj, labels) - - -# Test 4: Pass both Louvain and Leiden clustering algorithms -@pytest.mark.parametrize("cluster_method", ["louvain", "leiden"]) -def test_modularity_adata_multiple_clusterings(cluster_method): - if HAS_IGRAPH is False: - pytest.skip("igraph is not installed") - if cluster_method == "louvain" and HAS_LOUVAIN is False: - pytest.skip("louvain is not installed") - if cluster_method == "leiden" and HAS_LEIDEN is False: - pytest.skip("leiden is not installed") - # Loading PBMC Data and compute PCA and neighbors graph - adata = sc.datasets.pbmc3k() - sc.pp.pca(adata) - sc.pp.neighbors(adata) - # Compute modularity using both Louvain and Leiden clustering - if cluster_method == "louvain": - sc.tl.louvain(adata) - score_louvain = modularity_adata( - adata, labels="louvain", obsp="connectivities", mode="UNDIRECTED" - ) - # Score should be between 0 and 1 - assert 0 <= score_louvain <= 1 - if cluster_method == "leiden": - sc.tl.leiden(adata) - score_leiden = modularity_adata( - adata, labels="leiden", obsp="connectivities", mode="UNDIRECTED" - ) - # Score should be between 0 and 1 - assert 0 <= score_leiden <= 1 - - -# Test 5: Modularity should be the same no matter the order of the labels -def test_modularity_order(): - if HAS_IGRAPH is False: - pytest.skip("igraph is not installed") - adj = np.array( - [ - [1, 1, 0, 0], - [1, 1, 0, 0], - [0, 0, 1, 1], - [0, 0, 1, 1], - ] - ) - labels1 = ["A", "A", "B", "B"] - labels2 = ["B", "B", "A", "A"] - score_1 = modularity(adj, labels1) - score_2 = modularity(adj, labels2) - assert score_1 == score_2 - - -# Test 6: Modularity on disconnected graph lke edge-case behavior in some algorithms -def test_modularity_disconnected_graph(): - if HAS_IGRAPH is False: - pytest.skip("igraph is not installed") - adj = np.zeros((4, 4)) - labels = ["A", "B", "C", "D"] - score = modularity(adj, labels) - - # Modularity should be undefined for disconnected graphs - assert np.isnan(score) From 914b87d7d0c180565594c3574f625a90c0395c7e Mon Sep 17 00:00:00 2001 From: amalia-k510 Date: Mon, 12 May 2025 12:26:57 +0200 Subject: [PATCH 12/29] input fix --- src/scanpy/metrics/__init__.py | 2 +- src/scanpy/neighbors/__init__.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/scanpy/metrics/__init__.py b/src/scanpy/metrics/__init__.py index 417b070df5..9cdf82bdf2 100644 --- a/src/scanpy/metrics/__init__.py 
+++ b/src/scanpy/metrics/__init__.py @@ -6,4 +6,4 @@ from ._metrics import confusion_matrix from ._morans_i import morans_i -__all__ = ["confusion_matrix", "gearys_c", "modularity"] +__all__ = ["confusion_matrix", "gearys_c", "morans_i"] diff --git a/src/scanpy/neighbors/__init__.py b/src/scanpy/neighbors/__init__.py index 9df1149b55..30f3ed27f3 100644 --- a/src/scanpy/neighbors/__init__.py +++ b/src/scanpy/neighbors/__init__.py @@ -263,6 +263,7 @@ def neighbors( # noqa: PLR0913 def neighbors_from_distance( adata: AnnData, distances: np.ndarray | SpBase, + *, n_neighbors: int = 15, method: Literal["umap", "gauss"] = "umap", # default to umap key_added: str | None = None, From d2852036b405646591be30b1476fd79bc89818cc Mon Sep 17 00:00:00 2001 From: amalia-k510 Date: Mon, 12 May 2025 13:42:00 +0200 Subject: [PATCH 13/29] variable input fixes --- src/scanpy/neighbors/__init__.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/scanpy/neighbors/__init__.py b/src/scanpy/neighbors/__init__.py index 30f3ed27f3..91cc63d8ba 100644 --- a/src/scanpy/neighbors/__init__.py +++ b/src/scanpy/neighbors/__init__.py @@ -265,7 +265,7 @@ def neighbors_from_distance( distances: np.ndarray | SpBase, *, n_neighbors: int = 15, - method: Literal["umap", "gauss"] = "umap", # default to umap + method: _Method = "umap", # default to umap key_added: str | None = None, ) -> AnnData: ### inconsistent neighbors = bkk and n throw some stuff = bad way of writing the graph @@ -295,10 +295,6 @@ def neighbors_from_distance( Annotated data with computed distances and connectivities. """ if isinstance(distances, SpBase): - # spare matrices can save memory for large datasets - # csr_matrix is the most efficient format for sparse matrices - # setting the diagonal to 0 is important = distance to self must not affect umap or gauss - # elimimate zeros is important to save memory, avoids storing explicit zeros distances = sparse.csr_matrix(distances) # noqa: TID251 distances.setdiag(0) distances.eliminate_zeros() From 50705b398e8d9ef6cbf094dc926e8cf282511bbb Mon Sep 17 00:00:00 2001 From: amalia-k510 Date: Mon, 12 May 2025 14:34:08 +0200 Subject: [PATCH 14/29] test added --- src/scanpy/neighbors/__init__.py | 2 +- tests/test_neighbors.py | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/src/scanpy/neighbors/__init__.py b/src/scanpy/neighbors/__init__.py index 91cc63d8ba..7d4bef13f6 100644 --- a/src/scanpy/neighbors/__init__.py +++ b/src/scanpy/neighbors/__init__.py @@ -337,7 +337,7 @@ def neighbors_from_distance( dists_key = "distances" if key_added is None else key_added + "_distances" conns_key = "connectivities" if key_added is None else key_added + "_connectivities" # storing the actual distance and connectivitiy matrices as obsp - adata.uns[dists_key] = sparse.csr_matrix(distances) # noqa: TID251 + adata.obsp[dists_key] = sparse.csr_matrix(distances) # noqa: TID251 adata.obsp[conns_key] = connectivities # populating with metadata describing how neighbors were computed # I think might be important as many functions downstream rely diff --git a/tests/test_neighbors.py b/tests/test_neighbors.py index 904c47f813..7562394548 100644 --- a/tests/test_neighbors.py +++ b/tests/test_neighbors.py @@ -13,6 +13,7 @@ from scanpy import Neighbors from scanpy._compat import CSBase from testing.scanpy._helpers import anndata_v0_8_constructor_compat +from testing.scanpy._helpers.data import pbmc68k_reduced if TYPE_CHECKING: from typing import Literal @@ -241,3 +242,22 @@ def 
test_restore_n_neighbors(neigh, conv): ad.uns["neighbors"] = dict(connectivities=conv(neigh.connectivities)) neigh_restored = Neighbors(ad) assert neigh_restored.n_neighbors == 1 + + +def test_neighbors_distance_equivalence(): + adata = pbmc68k_reduced() + adata_d = adata.copy() + + sc.pp.neighbors(adata) + # reusing the same distances + sc.pp.neighbors(adata_d, distances=adata.obsp["distances"]) + np.testing.assert_allclose( + adata.obsp["connectivities"].toarray(), + adata_d.obsp["connectivities"].toarray(), + rtol=1e-5, + ) + np.testing.assert_allclose( + adata.obsp["distances"].toarray(), + adata_d.obsp["distances"].toarray(), + rtol=1e-5, + ) From 47306677a5a1c517b2618207a9b9801baadd997e Mon Sep 17 00:00:00 2001 From: amalia-k510 Date: Mon, 12 May 2025 14:46:16 +0200 Subject: [PATCH 15/29] numpy issue fix for one line --- src/scanpy/metrics/_metrics.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/scanpy/metrics/_metrics.py b/src/scanpy/metrics/_metrics.py index dbe4406533..83957fcd4a 100644 --- a/src/scanpy/metrics/_metrics.py +++ b/src/scanpy/metrics/_metrics.py @@ -60,9 +60,7 @@ def confusion_matrix( orig, new = pd.Series(orig), pd.Series(new) assert len(orig) == len(new) - unique_labels = pd.unique( - np.concatenate((np.asarray(orig.values), np.asarray(new.values))) - ) + unique_labels = pd.unique(np.concatenate((orig.to_numpy(), new.to_numpy()))) # Compute mtx = _confusion_matrix(orig, new, labels=unique_labels) From 4b9fe3e80c410aa7f8465de71069c3e6fd589e32 Mon Sep 17 00:00:00 2001 From: amalia-k510 Date: Mon, 12 May 2025 15:52:33 +0200 Subject: [PATCH 16/29] avoid densifying sparse matrices --- src/scanpy/metrics/_metrics.py | 15 +++----- src/scanpy/neighbors/__init__.py | 59 -------------------------------- 2 files changed, 4 insertions(+), 70 deletions(-) diff --git a/src/scanpy/metrics/_metrics.py b/src/scanpy/metrics/_metrics.py index e2709cefe0..4841dec418 100644 --- a/src/scanpy/metrics/_metrics.py +++ b/src/scanpy/metrics/_metrics.py @@ -2,7 +2,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, cast +from typing import TYPE_CHECKING import numpy as np import pandas as pd @@ -179,14 +179,7 @@ def modularity_adata( label_array = adata.obs[labels] if isinstance(labels, str) else labels connectivities = adata.obsp[obsp] - if isinstance(connectivities, CSRBase): - # converting to dense if it is a CSR matrix - dense = connectivities.toarray() - else: - toarray = getattr(connectivities, "toarray", None) - dense = toarray() if callable(toarray) else connectivities + if isinstance(connectivities, pd.DataFrame): + connectivities = connectivities.values - if isinstance(dense, pd.DataFrame): - dense = dense.values - dense = cast("ArrayLike", dense) - return modularity(dense, label_array, mode=mode) + return modularity(connectivities, label_array, mode=mode) diff --git a/src/scanpy/neighbors/__init__.py b/src/scanpy/neighbors/__init__.py index 5584eedc94..a1e09de8f2 100644 --- a/src/scanpy/neighbors/__init__.py +++ b/src/scanpy/neighbors/__init__.py @@ -258,65 +258,6 @@ def neighbors( # noqa: PLR0913 return adata if copy else None -# def neighbors_from_distance( -# adata: AnnData, -# distance_matrix: np.ndarray, -# n_neighbors: int = 15, -# method: Literal["umap", "gauss"] = "umap", # default to umap -# ) -> None: -# # computes the neighborhood graph from a precomputed distance matrix -# # only umap is supported -# # skipping PCA and distance computation and goes straight to the graph -# """Compute neighbors from a precomputer 
distance matrix. - -# Parameters -# ---------- -# adata : AnnData -# Annotated data matrix. -# distance_matrix : np.ndarray -# Precomputed dense or sparse distance matrix. -# n_neighbors : int -# Number of nearest neighbors to use in the graph. -# method : str -# Method to use for computing the graph. Currently only 'umap' is supported. -# """ -# if isinstance(distance_matrix, SpBase): -# distance_matrix = np.asarray(sparse.csr_matrix(distance_matrix).toarray()) - -# knn_indices, knn_distances = _get_indices_distances_from_dense_matrix( -# distance_matrix, n_neighbors -# ) -# if method == "umap": -# connectivities = umap( -# knn_indices, -# knn_distances, -# n_obs=adata.n_obs, -# n_neighbors=n_neighbors, -# ) -# elif method == "gauss": -# connectivities = _connectivity.gauss( -# sparse.csr_matrix(distance_matrix), -# n_neighbors, -# knn=True, -# ) -# else: -# msg = f"Method {method} not implemented." -# raise NotImplementedError(msg) - -# adata.obsp["connectivities"] = connectivities -# adata.obsp["distances"] = sparse.csr_matrix(distance_matrix) -# adata.uns["neighbors"] = { -# "connectivities_key": "connectivities", -# "distances_key": "distances", -# "params": { -# "n_neighbors": n_neighbors, -# "method": method, -# "random_state": 0, -# "metric": "euclidean", -# }, -# } - - class FlatTree(NamedTuple): # noqa: D101 hyperplanes: None offsets: None From 7d754c799369c902a072962437aed715ef31cd13 Mon Sep 17 00:00:00 2001 From: amalia-k510 Date: Mon, 12 May 2025 16:17:01 +0200 Subject: [PATCH 17/29] switched to @needs --- tests/test_metrics.py | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/tests/test_metrics.py b/tests/test_metrics.py index 80002a8c30..d31e99c3fe 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -16,6 +16,7 @@ import scanpy as sc from scanpy.metrics import modularity, modularity_adata from testing.scanpy._helpers.data import pbmc68k_reduced +from testing.scanpy._pytest.marks import needs from testing.scanpy._pytest.params import ARRAY_TYPES if TYPE_CHECKING: @@ -212,9 +213,8 @@ def test_confusion_matrix_api(): "mode", ["UNDIRECTED", "DIRECTED"], ids=["undirected", "directed"] ) @pytest.mark.parametrize("use_sparse", [False, True], ids=["sparse", "dense"]) +@needs.igraph def test_modularity_sample_structure(mode, use_sparse): - if HAS_IGRAPH is False: - pytest.skip("igraph is not installed") # 4 node adjacency matrix with two separate 2-node communities mat = np.array( [ @@ -233,9 +233,8 @@ def test_modularity_sample_structure(mode, use_sparse): # Test 2: Edge case when all nodes belong to the same community/cluster +@needs.igraph def test_modularity_single_community(): - if HAS_IGRAPH is False: - pytest.skip("igraph is not installed") # fully connected graph sample adj = np.ones((4, 4)) - np.eye(4) labels = ["A", "A", "A", "A"] @@ -246,9 +245,8 @@ def test_modularity_single_community(): # Test 3: Invalad input, labels length does not match adjacency matrix size +@needs.igraph def test_modularity_invalid_labels(): - if HAS_IGRAPH is False: - pytest.skip("igraph is not installed") from igraph._igraph import InternalError adj = np.eye(4) @@ -262,13 +260,10 @@ def test_modularity_invalid_labels(): # Test 4: Pass both Louvain and Leiden clustering algorithms @pytest.mark.parametrize("cluster_method", ["louvain", "leiden"]) +@needs.igraph +@needs.louvain +@needs.leidenalg def test_modularity_adata_multiple_clusterings(cluster_method): - if HAS_IGRAPH is False: - pytest.skip("igraph is not installed") - if cluster_method 
== "louvain" and HAS_LOUVAIN is False: - pytest.skip("louvain is not installed") - if cluster_method == "leiden" and HAS_LEIDEN is False: - pytest.skip("leiden is not installed") # Loading PBMC Data and compute PCA and neighbors graph adata = sc.datasets.pbmc3k() sc.pp.pca(adata) @@ -291,9 +286,8 @@ def test_modularity_adata_multiple_clusterings(cluster_method): # Test 5: Modularity should be the same no matter the order of the labels +@needs.igraph def test_modularity_order(): - if HAS_IGRAPH is False: - pytest.skip("igraph is not installed") adj = np.array( [ [1, 1, 0, 0], @@ -310,9 +304,8 @@ def test_modularity_order(): # Test 6: Modularity on disconnected graph lke edge-case behavior in some algorithms +@needs.igraph def test_modularity_disconnected_graph(): - if HAS_IGRAPH is False: - pytest.skip("igraph is not installed") adj = np.zeros((4, 4)) labels = ["A", "B", "C", "D"] score = modularity(adj, labels) From 15320af0a902d79b8f0d298f1c32b9a2303f48b3 Mon Sep 17 00:00:00 2001 From: amalia-k510 Date: Mon, 12 May 2025 16:18:51 +0200 Subject: [PATCH 18/29] switched to @needs --- tests/test_metrics.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/tests/test_metrics.py b/tests/test_metrics.py index d31e99c3fe..8ffcb0148e 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -1,6 +1,5 @@ from __future__ import annotations -import importlib.util import warnings from functools import partial from string import ascii_letters @@ -202,12 +201,6 @@ def test_confusion_matrix_api(): ) -# importing igraph, louvain, leiden if available -HAS_IGRAPH = importlib.util.find_spec("igraph") is not None -HAS_LOUVAIN = importlib.util.find_spec("louvain") is not None -HAS_LEIDEN = importlib.util.find_spec("leidenalg") is not None - - # Test 1: Sample graph with clear community structure (dense & sparse, directed & undirected) @pytest.mark.parametrize( "mode", ["UNDIRECTED", "DIRECTED"], ids=["undirected", "directed"] From 623a86cfe50a7ca6ca986f891c3df489c32d017c Mon Sep 17 00:00:00 2001 From: amalia-k510 Date: Mon, 12 May 2025 16:22:05 +0200 Subject: [PATCH 19/29] variable fix input --- src/scanpy/metrics/_metrics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/scanpy/metrics/_metrics.py b/src/scanpy/metrics/_metrics.py index 4841dec418..066bdf5d34 100644 --- a/src/scanpy/metrics/_metrics.py +++ b/src/scanpy/metrics/_metrics.py @@ -10,7 +10,7 @@ from pandas.api.types import CategoricalDtype from scipy.sparse import coo_matrix -from .._compat import CSRBase, SpBase +from .._compat import SpBase if TYPE_CHECKING: from collections.abc import Sequence @@ -130,7 +130,7 @@ def modularity( except ImportError as e: msg = "igraph is require for computing modularity" raise ImportError(msg) from e - if isinstance(connectivities, CSRBase | SpBase): + if isinstance(connectivities, SpBase): # check if the connectivities is a sparse matrix coo = coo_matrix(connectivities) edges = list(zip(coo.row, coo.col, strict=False)) From e8c9a254706cb7cc2f0dc00607b64d4982f493f2 Mon Sep 17 00:00:00 2001 From: amalia-k510 Date: Mon, 12 May 2025 16:25:39 +0200 Subject: [PATCH 20/29] code from separate PR removed --- src/scanpy/neighbors/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/scanpy/neighbors/__init__.py b/src/scanpy/neighbors/__init__.py index a1e09de8f2..ffbe6b9fdf 100644 --- a/src/scanpy/neighbors/__init__.py +++ b/src/scanpy/neighbors/__init__.py @@ -21,11 +21,9 @@ from .._utils import NeighborsView, _doc_params, get_literal_vals from . 
import _connectivity from ._common import ( - _get_indices_distances_from_dense_matrix, _get_indices_distances_from_sparse_matrix, _get_sparse_matrix_from_indices_distances, ) -from ._connectivity import umap from ._doc import doc_n_pcs, doc_use_rep from ._types import _KnownTransformer, _Method From 040b8b715a871df85b6af3fefb0edd3a3fd72e68 Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Fri, 16 May 2025 13:35:08 +0200 Subject: [PATCH 21/29] unify metadata assembly --- src/scanpy/neighbors/__init__.py | 137 +++++++++++++++---------------- 1 file changed, 65 insertions(+), 72 deletions(-) diff --git a/src/scanpy/neighbors/__init__.py b/src/scanpy/neighbors/__init__.py index 7d4bef13f6..c96f6e3491 100644 --- a/src/scanpy/neighbors/__init__.py +++ b/src/scanpy/neighbors/__init__.py @@ -31,7 +31,7 @@ if TYPE_CHECKING: from collections.abc import Callable, MutableMapping - from typing import Any, Literal, NotRequired + from typing import Any, Literal, NotRequired, Unpack from anndata import AnnData from igraph import Graph @@ -60,6 +60,13 @@ class KwdsForTransformer(TypedDict): random_state: _LegacyRandom +class NeighborsDict(TypedDict): # noqa: D101 + connectivities_key: str + distances_key: str + params: NeighborsParams + rp_forest: NotRequired[RPForestDict] + + class NeighborsParams(TypedDict): # noqa: D101 n_neighbors: int method: _Method @@ -138,6 +145,7 @@ def neighbors( # noqa: PLR0913 Use :func:`rapids_singlecell.pp.neighbors` instead. metric A known metric’s name or a callable that returns a distance. + If `distances` is given, this parameter is simply stored in `.uns` (see below). *ignored if ``transformer`` is an instance.* metric_kwds @@ -190,12 +198,15 @@ def neighbors( # noqa: PLR0913 """ if distances is not None: - # Added this to support the new distance matrix function + if callable(metric): + msg = "`metric` must be a string if `distances` is given." 
+ raise TypeError(msg) # if a precomputed distance matrix is provided, skip the PCA and distance computation return neighbors_from_distance( adata, distances, n_neighbors=n_neighbors, + metric=metric, method=method, ) start = logg.info("computing neighbors") @@ -215,46 +226,31 @@ def neighbors( # noqa: PLR0913 random_state=random_state, ) - if key_added is None: - key_added = "neighbors" - conns_key = "connectivities" - dists_key = "distances" - else: - conns_key = key_added + "_connectivities" - dists_key = key_added + "_distances" - - adata.uns[key_added] = {} - - neighbors_dict = adata.uns[key_added] - - neighbors_dict["connectivities_key"] = conns_key - neighbors_dict["distances_key"] = dists_key - - neighbors_dict["params"] = NeighborsParams( + key_added, neighbors_dict = _get_metadata( + key_added, n_neighbors=neighbors.n_neighbors, method=method, random_state=random_state, metric=metric, + **({} if not metric_kwds else dict(metric_kwds=metric_kwds)), + **({} if use_rep is None else dict(use_rep=use_rep)), + **({} if n_pcs is None else dict(n_pcs=n_pcs)), ) - if metric_kwds: - neighbors_dict["params"]["metric_kwds"] = metric_kwds - if use_rep is not None: - neighbors_dict["params"]["use_rep"] = use_rep - if n_pcs is not None: - neighbors_dict["params"]["n_pcs"] = n_pcs - - adata.obsp[dists_key] = neighbors.distances - adata.obsp[conns_key] = neighbors.connectivities if neighbors.rp_forest is not None: neighbors_dict["rp_forest"] = neighbors.rp_forest + + adata.uns[key_added] = neighbors_dict + adata.obsp[neighbors_dict["distances_key"]] = neighbors.distances + adata.obsp[neighbors_dict["connectivities_key"]] = neighbors.connectivities + logg.info( " finished", time=start, deep=( f"added to `.uns[{key_added!r}]`\n" - f" `.obsp[{dists_key!r}]`, distances for each pair of neighbors\n" - f" `.obsp[{conns_key!r}]`, weighted adjacency matrix" + f" `.obsp[{neighbors_dict['distances_key']!r}]`, distances for each pair of neighbors\n" + f" `.obsp[{neighbors_dict['connectivities_key']!r}]`, weighted adjacency matrix" ), ) return adata if copy else None @@ -265,6 +261,7 @@ def neighbors_from_distance( distances: np.ndarray | SpBase, *, n_neighbors: int = 15, + metric: _Metric = "euclidean", method: _Method = "umap", # default to umap key_added: str | None = None, ) -> AnnData: @@ -298,63 +295,59 @@ def neighbors_from_distance( distances = sparse.csr_matrix(distances) # noqa: TID251 distances.setdiag(0) distances.eliminate_zeros() - # extracting for each observation the indices and distances of the n_neighbors - # being then used by umap or gauss - knn_indices, knn_distances = _get_indices_distances_from_sparse_matrix( - distances, n_neighbors - ) else: - # if it is dense, converting it to ndarray - # and setting the diagonal to 0 - # extracting knn indices and distances distances = np.asarray(distances) np.fill_diagonal(distances, 0) - knn_indices, knn_distances = _get_indices_distances_from_dense_matrix( - distances, n_neighbors - ) if method == "umap": - # using umap to build connectivities from distances + if isinstance(distances, CSRBase): + knn_indices, knn_distances = _get_indices_distances_from_sparse_matrix( + distances, n_neighbors + ) + else: + knn_indices, knn_distances = _get_indices_distances_from_dense_matrix( + distances, n_neighbors + ) connectivities = umap( - knn_indices, - knn_distances, - n_obs=adata.n_obs, - n_neighbors=n_neighbors, + knn_indices, knn_distances, n_obs=adata.n_obs, n_neighbors=n_neighbors ) elif method == "gauss": - # using gauss to build connectivities from 
distances - # requires sparse matrix for efficiency - connectivities = _connectivity.gauss( - sparse.csr_matrix(distances), # noqa: TID251 - n_neighbors, - knn=True, - ) + distances = sparse.csr_matrix(distances) # noqa: TID251 + connectivities = _connectivity.gauss(distances, n_neighbors, knn=True) else: msg = f"Method {method} not implemented." raise NotImplementedError(msg) - # defining where to store graph info - key = "neighbors" if key_added is None else key_added - dists_key = "distances" if key_added is None else key_added + "_distances" - conns_key = "connectivities" if key_added is None else key_added + "_connectivities" - # storing the actual distance and connectivitiy matrices as obsp - adata.obsp[dists_key] = sparse.csr_matrix(distances) # noqa: TID251 - adata.obsp[conns_key] = connectivities - # populating with metadata describing how neighbors were computed - # I think might be important as many functions downstream rely - # on .uns['neighbors'] to find correct .obsp key - adata.uns[key] = { - "connectivities_key": "connectivities", - "distances_key": "distances", - "params": { - "n_neighbors": n_neighbors, - "method": method, - "random_state": 0, - "metric": "euclidean", - }, - } + + key_added, neighbors_dict = _get_metadata( + key_added, + n_neighbors=n_neighbors, + method=method, + random_state=0, + metric=metric, + ) + adata.uns[key_added] = neighbors_dict + adata.obsp[neighbors_dict["distances_key"]] = distances + adata.obsp[neighbors_dict["connectivities_key"]] = connectivities return adata +def _get_metadata( + key_added: str | None, + **params: Unpack[NeighborsParams], +) -> tuple[str, NeighborsDict]: + if key_added is None: + return "neighbors", NeighborsDict( + connectivities_key="connectivities", + distances_key="distances", + params=params, + ) + return key_added, NeighborsDict( + connectivities_key=f"{key_added}_connectivities", + distances_key=f"{key_added}_distances", + params=params, + ) + + class FlatTree(NamedTuple): # noqa: D101 hyperplanes: None offsets: None From d6a9aeea71fac3a39c340e6d114dd442ad796777 Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Fri, 16 May 2025 13:40:51 +0200 Subject: [PATCH 22/29] Discard changes to src/scanpy/neighbors/__init__.py --- src/scanpy/neighbors/__init__.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/scanpy/neighbors/__init__.py b/src/scanpy/neighbors/__init__.py index ffbe6b9fdf..4a42111b74 100644 --- a/src/scanpy/neighbors/__init__.py +++ b/src/scanpy/neighbors/__init__.py @@ -74,7 +74,6 @@ def neighbors( # noqa: PLR0913 n_neighbors: int = 15, n_pcs: int | None = None, *, - distance_matrix: np.ndarray | None = None, use_rep: str | None = None, knn: bool = True, method: _Method = "umap", @@ -187,13 +186,6 @@ def neighbors( # noqa: PLR0913 :doc:`/how-to/knn-transformers` """ - # if distance_matrix is not None: - # return neighbors_from_distance( - # adata, - # distance_matrix, - # n_neighbors=n_neighbors, - # method=method, - # ) start = logg.info("computing neighbors") adata = adata.copy() if copy else adata if adata.is_view: # we shouldn't need this here... 
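For reviewers trying the branch locally, here is a minimal usage sketch of the `neighbors_from_distance` helper introduced in PATCH 21 (which also wires it into `sc.pp.neighbors` through the `distances` argument). It assumes this branch is installed; the toy data, the chosen `n_neighbors`, and the `scanpy.neighbors` import path are illustrative assumptions rather than part of the patches:

# Usage sketch (assumption: this PR branch is installed; the API may still change in review).
import numpy as np
from anndata import AnnData
from scipy.spatial.distance import pdist, squareform

from scanpy.neighbors import neighbors_from_distance

rng = np.random.default_rng(0)
adata = AnnData(rng.normal(size=(50, 10)).astype(np.float32))

# Precompute a dense Euclidean distance matrix outside of scanpy.
distances = squareform(pdist(adata.X))

# Build the graph directly from the distances, skipping PCA and the internal
# distance computation; "gauss" is the alternative to the default "umap".
neighbors_from_distance(adata, distances, n_neighbors=10, method="umap")

print(adata.uns["neighbors"]["params"])    # n_neighbors, method, metric, random_state
print(adata.obsp["connectivities"].shape)  # (50, 50)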
From c03b863729d72e34f0e3db789a9fafa545a31cd6 Mon Sep 17 00:00:00 2001
From: amalia-k510
Date: Fri, 23 May 2025 11:53:26 +0200
Subject: [PATCH 23/29] comments fix and release notes

---
 docs/release-notes/3627.feature.md | 3 +++
 1 file changed, 3 insertions(+)
 create mode 100644 docs/release-notes/3627.feature.md

diff --git a/docs/release-notes/3627.feature.md b/docs/release-notes/3627.feature.md
new file mode 100644
index 0000000000..3443aa5cea
--- /dev/null
+++ b/docs/release-notes/3627.feature.md
@@ -0,0 +1,3 @@
+Add your info here
+
+Added `neighbors_from_distance`, a function for computing neighbor graphs from a precomputed distance matrix using UMAP or Gaussian methods. `A. Karesh`

From 473a4378083cdf7e9fd21f7e91a60d8a92074e06 Mon Sep 17 00:00:00 2001
From: amalia-k510
Date: Fri, 23 May 2025 11:57:03 +0200
Subject: [PATCH 24/29] comments fix typo

---
 src/scanpy/neighbors/__init__.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/src/scanpy/neighbors/__init__.py b/src/scanpy/neighbors/__init__.py
index c96f6e3491..574c036f63 100644
--- a/src/scanpy/neighbors/__init__.py
+++ b/src/scanpy/neighbors/__init__.py
@@ -265,12 +265,6 @@ def neighbors_from_distance(
     method: _Method = "umap",  # default to umap
     key_added: str | None = None,
 ) -> AnnData:
-    ### inconsistent neighbors = bkk and n throw some stuff = bad way of writing the graph
-    ### adjust for this = knn = True
-    # computes the neighborhood graph from a precomputed distance matrix
-    # both umap an gauss are supported, default is umap
-    # skipping PCA and distance computation and goes straight to the graph
-    # key_added is the key under which to store the results in adata.uns or adata.obsp
     """Compute neighbors from a precomputer distance matrix.

     Parameters

From ac0a6b3fb8e69424ff257b9707b7adfbb74d1ca3 Mon Sep 17 00:00:00 2001
From: amalia-k510
Date: Sun, 25 May 2025 16:58:50 +0200
Subject: [PATCH 25/29] before neighbour merge

---
 src/scanpy/metrics/_metrics.py | 28 +++++++++++-----------------
 tests/test_metrics.py          | 22 ++++++++++------------
 2 files changed, 21 insertions(+), 29 deletions(-)

diff --git a/src/scanpy/metrics/_metrics.py b/src/scanpy/metrics/_metrics.py
index 066bdf5d34..29014151ad 100644
--- a/src/scanpy/metrics/_metrics.py
+++ b/src/scanpy/metrics/_metrics.py
@@ -14,7 +14,6 @@
 if TYPE_CHECKING:
     from collections.abc import Sequence
-    from typing import Literal

     from anndata import AnnData
     from numpy.typing import ArrayLike

@@ -67,9 +66,7 @@ def confusion_matrix(
     orig, new = pd.Series(orig), pd.Series(new)
     assert len(orig) == len(new)

-    unique_labels = pd.unique(
-        np.concatenate((np.asarray(orig.values), np.asarray(new.values)))
-    )
+    unique_labels = pd.unique(np.concatenate((orig.to_numpy(), new.to_numpy())))

     # Compute
     mtx = _confusion_matrix(orig, new, labels=unique_labels)
@@ -103,20 +100,19 @@ def confusion_matrix(
 def modularity(
     connectivities: ArrayLike | SpBase,
     labels: pd.Series | ArrayLike,
-    mode: Literal["UNDIRECTED", "DIRECTED"] = "UNDIRECTED",
+    *,
+    is_directed: bool,
 ) -> float:
-    # accepting both dense or spare matrices as the connectivity graph
-    # setting mode between directed and undirected
     """Compute the modularity of a graph given its connectivities and labels.

     Parameters
     ----------
-    connectivities: array-like or sparse matrix
+    connectivities:
         Weighted adjacency matrix representing the graph. Can be a dense NumPy array or a sparse CSR matrix.
-    labels: array-like or pandas.Series
+    labels:
         Cluster labels for each node in the graph.
-    mode: str
-        The mode of the graph.
Can be "UNDIRECTED" or "DIRECTED". Default is "UNDIRECTED". + is_directed: + Whether the graph is directed or undirected. If True, the graph is treated as directed; otherwise, it is treated as undirected. Returns ------- @@ -137,12 +133,12 @@ def modularity( # converting to the coo format to extract the edges and weights # storing only non-zero elements and their indices weights = coo.data.tolist() - graph = ig.Graph(edges=edges, directed=mode == "DIRECTED") + graph = ig.Graph(edges=edges, directed=is_directed) graph.es["weight"] = weights else: # if the graph is dense, creates it directly using igraph's adjacency matrix dense_array = np.asarray(connectivities) - igraph_mode = ig.ADJ_UNDIRECTED if mode == "UNDIRECTED" else ig.ADJ_DIRECTED + igraph_mode = ig.ADJ_DIRECTED if is_directed else ig.ADJ_UNDIRECTED graph = ig.Graph.Weighted_Adjacency(dense_array.tolist(), mode=igraph_mode) # cluster labels to integer codes required by igraph labels = pd.Categorical(np.asarray(labels)).codes @@ -155,7 +151,7 @@ def modularity_adata( *, labels: str | ArrayLike = "leiden", obsp: str = "connectivities", - mode: Literal["UNDIRECTED", "DIRECTED"] = "UNDIRECTED", + is_directed: bool, ) -> float: # default to leiden labels and connectivities as it is more common """Compute modularity from an AnnData object using stored graph and clustering labels. @@ -174,12 +170,10 @@ def modularity_adata( float The modularity of the graph based on the provided clustering. """ - # if labels is a key in adata.obs, get the values from adata.obs - # otherwise, assume it is an array-like object label_array = adata.obs[labels] if isinstance(labels, str) else labels connectivities = adata.obsp[obsp] if isinstance(connectivities, pd.DataFrame): connectivities = connectivities.values - return modularity(connectivities, label_array, mode=mode) + return modularity(connectivities, label_array, is_directed=is_directed) diff --git a/tests/test_metrics.py b/tests/test_metrics.py index 8ffcb0148e..7fae0fe3cd 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -202,12 +202,10 @@ def test_confusion_matrix_api(): # Test 1: Sample graph with clear community structure (dense & sparse, directed & undirected) -@pytest.mark.parametrize( - "mode", ["UNDIRECTED", "DIRECTED"], ids=["undirected", "directed"] -) +@pytest.mark.parametrize("is_directed", [False, True], ids=["undirected", "directed"]) @pytest.mark.parametrize("use_sparse", [False, True], ids=["sparse", "dense"]) @needs.igraph -def test_modularity_sample_structure(mode, use_sparse): +def test_modularity_sample_structure(use_sparse, is_directed): # 4 node adjacency matrix with two separate 2-node communities mat = np.array( [ @@ -219,7 +217,7 @@ def test_modularity_sample_structure(mode, use_sparse): ) labels = ["A", "A", "B", "B"] adj = csr_matrix(mat) if use_sparse else mat - score = modularity(adj, labels, mode=mode) + score = modularity(adj, labels, is_directed=is_directed) # Modularity should be between 0 and 1 assert 0 <= score <= 1 @@ -231,7 +229,7 @@ def test_modularity_single_community(): # fully connected graph sample adj = np.ones((4, 4)) - np.eye(4) labels = ["A", "A", "A", "A"] - score = modularity(adj, labels) + score = modularity(adj, labels, is_directed=False) # modularity should be 0 assert score == pytest.approx(0.0, rel=1e-6) @@ -248,7 +246,7 @@ def test_modularity_invalid_labels(): InternalError, match="Membership vector size differs", ): - modularity(adj, labels) + modularity(adj, labels, is_directed=False) # Test 4: Pass both Louvain and Leiden 
clustering algorithms @@ -265,14 +263,14 @@ def test_modularity_adata_multiple_clusterings(cluster_method): if cluster_method == "louvain": sc.tl.louvain(adata) score_louvain = modularity_adata( - adata, labels="louvain", obsp="connectivities", mode="UNDIRECTED" + adata, labels="louvain", obsp="connectivities", is_directed=False ) # Score should be between 0 and 1 assert 0 <= score_louvain <= 1 if cluster_method == "leiden": sc.tl.leiden(adata) score_leiden = modularity_adata( - adata, labels="leiden", obsp="connectivities", mode="UNDIRECTED" + adata, labels="leiden", obsp="connectivities", is_directed=False ) # Score should be between 0 and 1 assert 0 <= score_leiden <= 1 @@ -291,8 +289,8 @@ def test_modularity_order(): ) labels1 = ["A", "A", "B", "B"] labels2 = ["B", "B", "A", "A"] - score_1 = modularity(adj, labels1) - score_2 = modularity(adj, labels2) + score_1 = modularity(adj, labels1, is_directed=False) + score_2 = modularity(adj, labels2, is_directed=False) assert score_1 == score_2 @@ -301,7 +299,7 @@ def test_modularity_order(): def test_modularity_disconnected_graph(): adj = np.zeros((4, 4)) labels = ["A", "B", "C", "D"] - score = modularity(adj, labels) + score = modularity(adj, labels, is_directed=False) # Modularity should be undefined for disconnected graphs assert np.isnan(score) From 1c033f0cde4b12a97c1e4528c0f11a1a3100a9b9 Mon Sep 17 00:00:00 2001 From: amalia-k510 Date: Sun, 25 May 2025 16:59:37 +0200 Subject: [PATCH 26/29] notes --- docs/release-notes/3616.feature.md | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 docs/release-notes/3616.feature.md diff --git a/docs/release-notes/3616.feature.md b/docs/release-notes/3616.feature.md new file mode 100644 index 0000000000..5ef2c8ed03 --- /dev/null +++ b/docs/release-notes/3616.feature.md @@ -0,0 +1,3 @@ +Add your info here + +Added modularity scoring functions for graphs, including `modularity_adata` and support for directed/undirected graphs. `A. Karesh` From a1b20330fbf4cf1032b479955253e3d11c35c325 Mon Sep 17 00:00:00 2001 From: amalia-k510 Date: Sun, 25 May 2025 19:16:34 +0200 Subject: [PATCH 27/29] merge error fix --- src/scanpy/metrics/_metrics.py | 3 +++ src/scanpy/neighbors/__init__.py | 2 ++ tests/test_metrics.py | 2 ++ 3 files changed, 7 insertions(+) diff --git a/src/scanpy/metrics/_metrics.py b/src/scanpy/metrics/_metrics.py index 6dd5dc05aa..29014151ad 100644 --- a/src/scanpy/metrics/_metrics.py +++ b/src/scanpy/metrics/_metrics.py @@ -8,6 +8,9 @@ import pandas as pd from natsort import natsorted from pandas.api.types import CategoricalDtype +from scipy.sparse import coo_matrix + +from .._compat import SpBase if TYPE_CHECKING: from collections.abc import Sequence diff --git a/src/scanpy/neighbors/__init__.py b/src/scanpy/neighbors/__init__.py index 1766615006..574c036f63 100644 --- a/src/scanpy/neighbors/__init__.py +++ b/src/scanpy/neighbors/__init__.py @@ -21,9 +21,11 @@ from .._utils import NeighborsView, _doc_params, get_literal_vals from . 
import _connectivity
 from ._common import (
+    _get_indices_distances_from_dense_matrix,
     _get_indices_distances_from_sparse_matrix,
     _get_sparse_matrix_from_indices_distances,
 )
+from ._connectivity import umap
 from ._doc import doc_n_pcs, doc_use_rep
 from ._types import _KnownTransformer, _Method

diff --git a/tests/test_metrics.py b/tests/test_metrics.py
index 9c1ffec674..7fae0fe3cd 100644
--- a/tests/test_metrics.py
+++ b/tests/test_metrics.py
@@ -10,8 +10,10 @@
 import pytest
 import threadpoolctl
 from scipy import sparse
+from scipy.sparse import csr_matrix  # noqa: TID251

 import scanpy as sc
+from scanpy.metrics import modularity, modularity_adata
 from testing.scanpy._helpers.data import pbmc68k_reduced
 from testing.scanpy._pytest.marks import needs
 from testing.scanpy._pytest.params import ARRAY_TYPES

From 4cdc7299e6e1374e6416c3d3837f6b430db2cd76 Mon Sep 17 00:00:00 2001
From: amalia-k510
Date: Sun, 25 May 2025 19:50:53 +0200
Subject: [PATCH 28/29] post merge and call from neighbors

---
 src/scanpy/metrics/__init__.py   |  4 ++--
 src/scanpy/metrics/_metrics.py   | 23 ++++++++++++++---------
 src/scanpy/neighbors/__init__.py |  3 +++
 3 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/src/scanpy/metrics/__init__.py b/src/scanpy/metrics/__init__.py
index 9cdf82bdf2..5ec2229909 100644
--- a/src/scanpy/metrics/__init__.py
+++ b/src/scanpy/metrics/__init__.py
@@ -3,7 +3,7 @@
 from __future__ import annotations

 from ._gearys_c import gearys_c
-from ._metrics import confusion_matrix
+from ._metrics import confusion_matrix, modularity, modularity_adata
 from ._morans_i import morans_i

-__all__ = ["confusion_matrix", "gearys_c", "morans_i"]
+__all__ = ["confusion_matrix", "gearys_c", "modularity", "modularity_adata", "morans_i"]

diff --git a/src/scanpy/metrics/_metrics.py b/src/scanpy/metrics/_metrics.py
index 29014151ad..8c352abf16 100644
--- a/src/scanpy/metrics/_metrics.py
+++ b/src/scanpy/metrics/_metrics.py
@@ -149,8 +149,8 @@ def modularity(
 def modularity_adata(
     adata: AnnData,
     *,
-    labels: str | ArrayLike = "leiden",
-    obsp: str = "connectivities",
+    label_key: str | ArrayLike = "leiden",
+    key: str = "neighbors",
     is_directed: bool,
 ) -> float:
     # default to leiden labels and connectivities as it is more common
     """Compute modularity from an AnnData object using stored graph and clustering labels.

     Parameters
     ----------
-    adata: AnnData
+    adata:
         The AnnData object containing the data.
-    labels: str or array-like
-        The key in adata.obs that contains the cluster labels.
-    obsp: str
-        The key in adata.obsp that contains the connectivities.
+    label_key:
+        The key in `adata.obs` that contains the clustering labels. Defaults to "leiden".
+    key:
+        The key in `adata.uns` under which the neighbors metadata (including the name of the connectivities entry in `adata.obsp`) is stored. Defaults to "neighbors".
+    is_directed:
+        Whether the graph is directed or undirected. If True, the graph is treated as directed; otherwise, it is treated as undirected.

     Returns
     -------
     float
         The modularity of the graph based on the provided clustering.
""" - label_array = adata.obs[labels] if isinstance(labels, str) else labels - connectivities = adata.obsp[obsp] + label_array = adata.obs[label_key] if isinstance(label_key, str) else label_key + connectivities_key = adata.uns[key]["connectivities_key"] + params = adata.uns[key]["params"] + connectivities = adata.obsp[connectivities_key] + is_directed = params["is_directed"] if isinstance(connectivities, pd.DataFrame): connectivities = connectivities.values diff --git a/src/scanpy/neighbors/__init__.py b/src/scanpy/neighbors/__init__.py index 574c036f63..8058c559db 100644 --- a/src/scanpy/neighbors/__init__.py +++ b/src/scanpy/neighbors/__init__.py @@ -93,6 +93,7 @@ def neighbors( # noqa: PLR0913 random_state: _LegacyRandom = 0, key_added: str | None = None, copy: bool = False, + is_directed: bool = False, ) -> AnnData | None: """Compute the nearest neighbors distance matrix and a neighborhood graph of observations :cite:p:`McInnes2018`. @@ -237,6 +238,8 @@ def neighbors( # noqa: PLR0913 **({} if n_pcs is None else dict(n_pcs=n_pcs)), ) + neighbors_dict["params"]["is_directed"] = is_directed + if neighbors.rp_forest is not None: neighbors_dict["rp_forest"] = neighbors.rp_forest From cb7aaf6126113e706db9ec5a5da19c3d9f44fc35 Mon Sep 17 00:00:00 2001 From: amalia-k510 Date: Mon, 26 May 2025 15:51:16 +0200 Subject: [PATCH 29/29] release notes fix --- docs/release-notes/3616.feature.md | 4 +--- docs/release-notes/3627.feature.md | 3 --- 2 files changed, 1 insertion(+), 6 deletions(-) delete mode 100644 docs/release-notes/3627.feature.md diff --git a/docs/release-notes/3616.feature.md b/docs/release-notes/3616.feature.md index 5ef2c8ed03..06ea8b523d 100644 --- a/docs/release-notes/3616.feature.md +++ b/docs/release-notes/3616.feature.md @@ -1,3 +1 @@ -Add your info here - -Added modularity scoring functions for graphs, including `modularity_adata` and support for directed/undirected graphs. `A. Karesh` +Add modularity scoring via {func}`modularity_adata` with support for directed/undirected graphs {smaller}`A. Karesh` diff --git a/docs/release-notes/3627.feature.md b/docs/release-notes/3627.feature.md deleted file mode 100644 index 3443aa5cea..0000000000 --- a/docs/release-notes/3627.feature.md +++ /dev/null @@ -1,3 +0,0 @@ -Add your info here - -Added `neighbors_from_distance`, function for computing graphs from a precoputing distance matrix using UMAP or Gaussian methods. `A. Karesh`