From 5a529762dcdbb9aa572c6288d597fc87060ac8fb Mon Sep 17 00:00:00 2001 From: LLehner Date: Mon, 4 Mar 2024 23:33:19 +0100 Subject: [PATCH 01/22] Add method to calculate embeddings for variable by distance aggregation --- src/squidpy/tl/__init__.py | 1 + src/squidpy/tl/_var_embeddings.py | 92 +++++++++++++++++++++++++++++++ 2 files changed, 93 insertions(+) create mode 100644 src/squidpy/tl/_var_embeddings.py diff --git a/src/squidpy/tl/__init__.py b/src/squidpy/tl/__init__.py index edaa31825..eb3ac5951 100644 --- a/src/squidpy/tl/__init__.py +++ b/src/squidpy/tl/__init__.py @@ -3,3 +3,4 @@ from __future__ import annotations from squidpy.tl._var_by_distance import var_by_distance +from squidpy.tl._var_embeddings import var_embeddings diff --git a/src/squidpy/tl/_var_embeddings.py b/src/squidpy/tl/_var_embeddings.py new file mode 100644 index 000000000..d3308a872 --- /dev/null +++ b/src/squidpy/tl/_var_embeddings.py @@ -0,0 +1,92 @@ +from __future__ import annotations + + +import pandas as pd +from anndata import AnnData +from scanpy import logging as logg +from sklearn.preprocessing import StandardScaler +import umap + +from squidpy._docs import d + +__all__ = ["var_embeddings"] + + +@d.dedent +def var_embeddings( + adata: AnnData, + cluster_key: str, + design_matrix_key: str = "design_matrix", + n_bins: int = 100, + include_anchor: bool = False, +) -> AnnData: + """ + Cluster variables by previously calculated distance to an anchor point. + + Parameters + ---------- + %(adata)s + cluster_key + Annotation column in `.obs` that is used as anchor. + design_matrix_key + Name of the design matrix saved to `.obsm`. + n_bins + Number of bins to use for aggregation. + include_anchor + Whether to include the variable counts belonging to the anchor point in the aggregation. + Returns + ------- + If ``copy = True``, returns the design_matrix with the distances to an anchor point + Otherwise, stores design_matrix in `.obsm`. + """ + if design_matrix_key not in adata.obsm.keys(): + raise ValueError(f"`.obsm['{design_matrix_key}']` does not exist. Aborting.") + + logg.info("Calculating embeddings for distance aggregations by gene.") + + df = adata.obsm[design_matrix_key].copy() + + # bin the data by distance + df["bins"] = pd.cut(df[cluster_key], bins=n_bins) + + # get median value of each interval + df['median_value'] = df['bins'].apply(calculate_median) + + # turn categorical NaNs into float 0s + df['median_value'] = pd.to_numeric(df['median_value'], errors='coerce').fillna(0).astype(float) + + # get count matrix and add binned distance to each .obs + X_df = adata.to_df() + X_df["distance"] = df["median_value"].copy() + + # transpose the count matrix + X_df_T = X_df.T + + # aggregate the transposed count matrix by the distances and remove the distance row + mth_row_values = X_df_T.iloc[-1] + result = X_df_T.groupby(mth_row_values, axis=1).sum() + result.drop(result.tail(1).index,inplace=True) + + # optionally include or remove variable values for distance 0 (anchro point) + if not include_anchor: + result = result.drop(result.columns[0], axis=1) + + reducer = umap.UMAP() + + # scale the data and reduce dimensionality + scaled_exp = StandardScaler().fit_transform(result.values) + scaled_exp_df = pd.DataFrame(scaled_exp, index=result.index, columns=result.columns) + embedding = reducer.fit_transform(scaled_exp_df) + + adata.varm[f"{n_bins}_bins_distance_aggregation"] = result + embedding_df = pd.DataFrame(embedding, index=result.index) + embedding_df["var"] = result.index + adata.uns[f"{n_bins}_bins_distance_embeddings"] = embedding_df + + return + +def calculate_median(interval): + median = interval.mid + + return median + From eb84518f6aa68893c98503822427e2a9a2b85fe0 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 4 Mar 2024 22:57:34 +0000 Subject: [PATCH 02/22] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/squidpy/tl/_var_embeddings.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/squidpy/tl/_var_embeddings.py b/src/squidpy/tl/_var_embeddings.py index d3308a872..03868aeac 100644 --- a/src/squidpy/tl/_var_embeddings.py +++ b/src/squidpy/tl/_var_embeddings.py @@ -1,11 +1,10 @@ from __future__ import annotations - import pandas as pd +import umap from anndata import AnnData from scanpy import logging as logg from sklearn.preprocessing import StandardScaler -import umap from squidpy._docs import d @@ -50,10 +49,10 @@ def var_embeddings( df["bins"] = pd.cut(df[cluster_key], bins=n_bins) # get median value of each interval - df['median_value'] = df['bins'].apply(calculate_median) + df["median_value"] = df["bins"].apply(calculate_median) # turn categorical NaNs into float 0s - df['median_value'] = pd.to_numeric(df['median_value'], errors='coerce').fillna(0).astype(float) + df["median_value"] = pd.to_numeric(df["median_value"], errors="coerce").fillna(0).astype(float) # get count matrix and add binned distance to each .obs X_df = adata.to_df() @@ -65,14 +64,14 @@ def var_embeddings( # aggregate the transposed count matrix by the distances and remove the distance row mth_row_values = X_df_T.iloc[-1] result = X_df_T.groupby(mth_row_values, axis=1).sum() - result.drop(result.tail(1).index,inplace=True) + result.drop(result.tail(1).index, inplace=True) # optionally include or remove variable values for distance 0 (anchro point) if not include_anchor: result = result.drop(result.columns[0], axis=1) reducer = umap.UMAP() - + # scale the data and reduce dimensionality scaled_exp = StandardScaler().fit_transform(result.values) scaled_exp_df = pd.DataFrame(scaled_exp, index=result.index, columns=result.columns) @@ -85,8 +84,8 @@ def var_embeddings( return + def calculate_median(interval): median = interval.mid return median - From 488da20df560179f360e9f9c5b519e5bc94331ec Mon Sep 17 00:00:00 2001 From: LLehner Date: Mon, 4 Mar 2024 23:58:46 +0100 Subject: [PATCH 03/22] Fix pre-commit --- src/squidpy/tl/_var_embeddings.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/squidpy/tl/_var_embeddings.py b/src/squidpy/tl/_var_embeddings.py index d3308a872..f45942ff6 100644 --- a/src/squidpy/tl/_var_embeddings.py +++ b/src/squidpy/tl/_var_embeddings.py @@ -1,11 +1,12 @@ from __future__ import annotations +from typing import Any import pandas as pd +import umap from anndata import AnnData from scanpy import logging as logg from sklearn.preprocessing import StandardScaler -import umap from squidpy._docs import d @@ -50,10 +51,10 @@ def var_embeddings( df["bins"] = pd.cut(df[cluster_key], bins=n_bins) # get median value of each interval - df['median_value'] = df['bins'].apply(calculate_median) + df["median_value"] = df["bins"].apply(calculate_median) # turn categorical NaNs into float 0s - df['median_value'] = pd.to_numeric(df['median_value'], errors='coerce').fillna(0).astype(float) + df["median_value"] = pd.to_numeric(df["median_value"], errors="coerce").fillna(0).astype(float) # get count matrix and add binned distance to each .obs X_df = adata.to_df() @@ -65,14 +66,14 @@ def var_embeddings( # aggregate the transposed count matrix by the distances and remove the distance row mth_row_values = X_df_T.iloc[-1] result = X_df_T.groupby(mth_row_values, axis=1).sum() - result.drop(result.tail(1).index,inplace=True) + result.drop(result.tail(1).index, inplace=True) # optionally include or remove variable values for distance 0 (anchro point) if not include_anchor: result = result.drop(result.columns[0], axis=1) reducer = umap.UMAP() - + # scale the data and reduce dimensionality scaled_exp = StandardScaler().fit_transform(result.values) scaled_exp_df = pd.DataFrame(scaled_exp, index=result.index, columns=result.columns) @@ -85,8 +86,8 @@ def var_embeddings( return -def calculate_median(interval): + +def calculate_median(interval: pd.Interval) -> Any: median = interval.mid return median - From 0b724941c56b289139b47d9d17b90d978dd00997 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 4 Mar 2024 23:00:55 +0000 Subject: [PATCH 04/22] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/squidpy/tl/_var_embeddings.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/squidpy/tl/_var_embeddings.py b/src/squidpy/tl/_var_embeddings.py index c575de4b9..60139f3f6 100644 --- a/src/squidpy/tl/_var_embeddings.py +++ b/src/squidpy/tl/_var_embeddings.py @@ -4,7 +4,6 @@ import pandas as pd import umap -import umap from anndata import AnnData from scanpy import logging as logg from sklearn.preprocessing import StandardScaler @@ -78,7 +77,6 @@ def var_embeddings( reducer = umap.UMAP() - # scale the data and reduce dimensionality scaled_exp = StandardScaler().fit_transform(result.values) scaled_exp_df = pd.DataFrame(scaled_exp, index=result.index, columns=result.columns) From edcca877dbf21b542b160abd62a91cf183d8bc94 Mon Sep 17 00:00:00 2001 From: LLehner Date: Tue, 5 Mar 2024 00:05:23 +0100 Subject: [PATCH 05/22] Update param name --- src/squidpy/tl/_var_embeddings.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/squidpy/tl/_var_embeddings.py b/src/squidpy/tl/_var_embeddings.py index c575de4b9..87c75d23a 100644 --- a/src/squidpy/tl/_var_embeddings.py +++ b/src/squidpy/tl/_var_embeddings.py @@ -17,7 +17,7 @@ @d.dedent def var_embeddings( adata: AnnData, - cluster_key: str, + group: str, design_matrix_key: str = "design_matrix", n_bins: int = 100, include_anchor: bool = False, @@ -28,7 +28,7 @@ def var_embeddings( Parameters ---------- %(adata)s - cluster_key + group Annotation column in `.obs` that is used as anchor. design_matrix_key Name of the design matrix saved to `.obsm`. @@ -49,7 +49,7 @@ def var_embeddings( df = adata.obsm[design_matrix_key].copy() # bin the data by distance - df["bins"] = pd.cut(df[cluster_key], bins=n_bins) + df["bins"] = pd.cut(df[group], bins=n_bins) # get median value of each interval df["median_value"] = df["bins"].apply(calculate_median) From 4be2529ef43bd6ecbea0d26464409b242d2e7899 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 4 Mar 2024 23:00:55 +0000 Subject: [PATCH 06/22] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/squidpy/tl/_var_embeddings.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/squidpy/tl/_var_embeddings.py b/src/squidpy/tl/_var_embeddings.py index 87c75d23a..91f1d1768 100644 --- a/src/squidpy/tl/_var_embeddings.py +++ b/src/squidpy/tl/_var_embeddings.py @@ -4,7 +4,6 @@ import pandas as pd import umap -import umap from anndata import AnnData from scanpy import logging as logg from sklearn.preprocessing import StandardScaler @@ -78,7 +77,6 @@ def var_embeddings( reducer = umap.UMAP() - # scale the data and reduce dimensionality scaled_exp = StandardScaler().fit_transform(result.values) scaled_exp_df = pd.DataFrame(scaled_exp, index=result.index, columns=result.columns) From cfe496cbddc5c2aa31aa034f71f155b72d26e784 Mon Sep 17 00:00:00 2001 From: LLehner Date: Mon, 22 Apr 2024 19:14:40 +0200 Subject: [PATCH 07/22] Remove duplicate code --- src/squidpy/tl/_var_embeddings.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/src/squidpy/tl/_var_embeddings.py b/src/squidpy/tl/_var_embeddings.py index 91f1d1768..7205c32e9 100644 --- a/src/squidpy/tl/_var_embeddings.py +++ b/src/squidpy/tl/_var_embeddings.py @@ -52,11 +52,9 @@ def var_embeddings( # get median value of each interval df["median_value"] = df["bins"].apply(calculate_median) - df["median_value"] = df["bins"].apply(calculate_median) # turn categorical NaNs into float 0s df["median_value"] = pd.to_numeric(df["median_value"], errors="coerce").fillna(0).astype(float) - df["median_value"] = pd.to_numeric(df["median_value"], errors="coerce").fillna(0).astype(float) # get count matrix and add binned distance to each .obs X_df = adata.to_df() @@ -69,23 +67,22 @@ def var_embeddings( mth_row_values = X_df_T.iloc[-1] result = X_df_T.groupby(mth_row_values, axis=1).sum() result.drop(result.tail(1).index, inplace=True) - result.drop(result.tail(1).index, inplace=True) - # optionally include or remove variable values for distance 0 (anchro point) + # optionally include or remove variable values for distance 0 (anchor point) if not include_anchor: result = result.drop(result.columns[0], axis=1) - reducer = umap.UMAP() + #reducer = umap.UMAP() # scale the data and reduce dimensionality - scaled_exp = StandardScaler().fit_transform(result.values) - scaled_exp_df = pd.DataFrame(scaled_exp, index=result.index, columns=result.columns) - embedding = reducer.fit_transform(scaled_exp_df) + #scaled_exp = StandardScaler().fit_transform(result.values) + #scaled_exp_df = pd.DataFrame(scaled_exp, index=result.index, columns=result.columns) + #embedding = reducer.fit_transform(scaled_exp_df) adata.varm[f"{n_bins}_bins_distance_aggregation"] = result - embedding_df = pd.DataFrame(embedding, index=result.index) - embedding_df["var"] = result.index - adata.uns[f"{n_bins}_bins_distance_embeddings"] = embedding_df + #embedding_df = pd.DataFrame(embedding, index=result.index) + #embedding_df["var"] = result.index + #adata.uns[f"{n_bins}_bins_distance_embeddings"] = embedding_df return From c4fca2920c82a9b17eab9c8b15471b0c4145e394 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 22 Apr 2024 17:15:01 +0000 Subject: [PATCH 08/22] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/squidpy/tl/_var_embeddings.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/squidpy/tl/_var_embeddings.py b/src/squidpy/tl/_var_embeddings.py index 7205c32e9..e9b289d59 100644 --- a/src/squidpy/tl/_var_embeddings.py +++ b/src/squidpy/tl/_var_embeddings.py @@ -72,17 +72,17 @@ def var_embeddings( if not include_anchor: result = result.drop(result.columns[0], axis=1) - #reducer = umap.UMAP() + # reducer = umap.UMAP() # scale the data and reduce dimensionality - #scaled_exp = StandardScaler().fit_transform(result.values) - #scaled_exp_df = pd.DataFrame(scaled_exp, index=result.index, columns=result.columns) - #embedding = reducer.fit_transform(scaled_exp_df) + # scaled_exp = StandardScaler().fit_transform(result.values) + # scaled_exp_df = pd.DataFrame(scaled_exp, index=result.index, columns=result.columns) + # embedding = reducer.fit_transform(scaled_exp_df) adata.varm[f"{n_bins}_bins_distance_aggregation"] = result - #embedding_df = pd.DataFrame(embedding, index=result.index) - #embedding_df["var"] = result.index - #adata.uns[f"{n_bins}_bins_distance_embeddings"] = embedding_df + # embedding_df = pd.DataFrame(embedding, index=result.index) + # embedding_df["var"] = result.index + # adata.uns[f"{n_bins}_bins_distance_embeddings"] = embedding_df return From 64e38dfb55b0c1b765a8476c09760f5170cd97b6 Mon Sep 17 00:00:00 2001 From: LLehner Date: Mon, 22 Apr 2024 23:50:10 +0200 Subject: [PATCH 09/22] Improve performance, Update output --- src/squidpy/tl/_var_embeddings.py | 50 ++++++++++++++----------------- 1 file changed, 22 insertions(+), 28 deletions(-) diff --git a/src/squidpy/tl/_var_embeddings.py b/src/squidpy/tl/_var_embeddings.py index 7205c32e9..67545b5f0 100644 --- a/src/squidpy/tl/_var_embeddings.py +++ b/src/squidpy/tl/_var_embeddings.py @@ -2,11 +2,11 @@ from typing import Any +import numpy as np import pandas as pd -import umap +import scanpy as sc from anndata import AnnData from scanpy import logging as logg -from sklearn.preprocessing import StandardScaler from squidpy._docs import d @@ -46,45 +46,39 @@ def var_embeddings( logg.info("Calculating embeddings for distance aggregations by gene.") df = adata.obsm[design_matrix_key].copy() - # bin the data by distance df["bins"] = pd.cut(df[group], bins=n_bins) - # get median value of each interval df["median_value"] = df["bins"].apply(calculate_median) - # turn categorical NaNs into float 0s df["median_value"] = pd.to_numeric(df["median_value"], errors="coerce").fillna(0).astype(float) - # get count matrix and add binned distance to each .obs X_df = adata.to_df() - X_df["distance"] = df["median_value"].copy() - + X_df["distance"] = df["median_value"] + # aggregate the count matrix by the bins + aggregated_df = X_df.groupby(["distance"]).sum() # transpose the count matrix - X_df_T = X_df.T - - # aggregate the transposed count matrix by the distances and remove the distance row - mth_row_values = X_df_T.iloc[-1] - result = X_df_T.groupby(mth_row_values, axis=1).sum() - result.drop(result.tail(1).index, inplace=True) + result = aggregated_df.T # optionally include or remove variable values for distance 0 (anchor point) + start_bin = 0 if not include_anchor: result = result.drop(result.columns[0], axis=1) - - #reducer = umap.UMAP() - - # scale the data and reduce dimensionality - #scaled_exp = StandardScaler().fit_transform(result.values) - #scaled_exp_df = pd.DataFrame(scaled_exp, index=result.index, columns=result.columns) - #embedding = reducer.fit_transform(scaled_exp_df) - - adata.varm[f"{n_bins}_bins_distance_aggregation"] = result - #embedding_df = pd.DataFrame(embedding, index=result.index) - #embedding_df["var"] = result.index - #adata.uns[f"{n_bins}_bins_distance_embeddings"] = embedding_df - - return + start_bin = 1 + + # set genes x bins to count matrix (required for embeddings and clustering) + var_by_bins = sc.AnnData(result) + # set genes x bins to .obs (required for plotting counts by distance) + var_by_bins.obs = result + # rename column names for plotting + var_by_bins.obs.columns = range(start_bin, 101) + # create genes x genes identity matrix + identity_df = pd.DataFrame(np.eye(len(var_by_bins.obs)), columns=var_by_bins.obs.index, dtype="category") + # append identity matrix to obs column wise (required for highlighting genes in plot) + identity_df.index = var_by_bins.obs.index + var_by_bins.obs = pd.concat([var_by_bins.obs, identity_df], axis=1) + + return var_by_bins def calculate_median(interval: pd.Interval) -> Any: From 9eabd0d196ce0bf6522a6d6d3a4c477b60d363b8 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 22 Apr 2024 21:53:23 +0000 Subject: [PATCH 10/22] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/squidpy/tl/_var_embeddings.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/squidpy/tl/_var_embeddings.py b/src/squidpy/tl/_var_embeddings.py index 233cb6d3d..a9055f69d 100644 --- a/src/squidpy/tl/_var_embeddings.py +++ b/src/squidpy/tl/_var_embeddings.py @@ -1,13 +1,13 @@ from __future__ import annotations +import time from typing import Any import numpy as np import pandas as pd -from anndata import AnnData import scanpy as sc +from anndata import AnnData from scanpy import logging as logg -import time from squidpy._docs import d @@ -72,7 +72,7 @@ def var_embeddings( # set genes x bins to .obs (required for plotting counts by distance) var_by_bins.obs = result # rename column names for plotting - var_by_bins.obs.columns = range(start_bin,101) + var_by_bins.obs.columns = range(start_bin, 101) # create genes x genes identity matrix identity_df = pd.DataFrame(np.eye(len(var_by_bins.obs)), columns=var_by_bins.obs.index, dtype="category") # append identity matrix to obs column wise (required for highlighting genes in plot) From a40a8cfcddafddd18402f2df4ad433edaf13dc4a Mon Sep 17 00:00:00 2001 From: LLehner Date: Mon, 22 Apr 2024 23:55:55 +0200 Subject: [PATCH 11/22] Remove import --- src/squidpy/tl/_var_embeddings.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/squidpy/tl/_var_embeddings.py b/src/squidpy/tl/_var_embeddings.py index 233cb6d3d..67545b5f0 100644 --- a/src/squidpy/tl/_var_embeddings.py +++ b/src/squidpy/tl/_var_embeddings.py @@ -4,10 +4,9 @@ import numpy as np import pandas as pd -from anndata import AnnData import scanpy as sc +from anndata import AnnData from scanpy import logging as logg -import time from squidpy._docs import d @@ -72,7 +71,7 @@ def var_embeddings( # set genes x bins to .obs (required for plotting counts by distance) var_by_bins.obs = result # rename column names for plotting - var_by_bins.obs.columns = range(start_bin,101) + var_by_bins.obs.columns = range(start_bin, 101) # create genes x genes identity matrix identity_df = pd.DataFrame(np.eye(len(var_by_bins.obs)), columns=var_by_bins.obs.index, dtype="category") # append identity matrix to obs column wise (required for highlighting genes in plot) From 09c72b08bcbeafd73e27b69553d75a0eb8ad104e Mon Sep 17 00:00:00 2001 From: LLehner <64135338+LLehner@users.noreply.github.com> Date: Tue, 23 Apr 2024 00:01:38 +0200 Subject: [PATCH 12/22] Remove import --- src/squidpy/tl/_var_embeddings.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/squidpy/tl/_var_embeddings.py b/src/squidpy/tl/_var_embeddings.py index a9055f69d..67545b5f0 100644 --- a/src/squidpy/tl/_var_embeddings.py +++ b/src/squidpy/tl/_var_embeddings.py @@ -1,6 +1,5 @@ from __future__ import annotations -import time from typing import Any import numpy as np From 3396146dcc8fb475e5e14910f7d4f61237edbde3 Mon Sep 17 00:00:00 2001 From: LLehner Date: Sun, 26 May 2024 23:12:21 +0200 Subject: [PATCH 13/22] Update return --- src/squidpy/tl/_var_embeddings.py | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/src/squidpy/tl/_var_embeddings.py b/src/squidpy/tl/_var_embeddings.py index a9055f69d..2ce5e4c6d 100644 --- a/src/squidpy/tl/_var_embeddings.py +++ b/src/squidpy/tl/_var_embeddings.py @@ -1,7 +1,6 @@ from __future__ import annotations -import time -from typing import Any +from typing import Any, Union import numpy as np import pandas as pd @@ -21,7 +20,8 @@ def var_embeddings( design_matrix_key: str = "design_matrix", n_bins: int = 100, include_anchor: bool = False, -) -> AnnData: + copy: bool = False, +) -> Union[AnnData, (pd.DataFrame, pd.DataFrame)]: """ Cluster variables by previously calculated distance to an anchor point. @@ -38,8 +38,8 @@ def var_embeddings( Whether to include the variable counts belonging to the anchor point in the aggregation. Returns ------- - If ``copy = True``, returns the design_matrix with the distances to an anchor point - Otherwise, stores design_matrix in `.obsm`. + If ``copy = True``, returns var by distance matrices. + Otherwise, stores var by distance bin matrices in `.obsm`. """ if design_matrix_key not in adata.obsm.keys(): raise ValueError(f"`.obsm['{design_matrix_key}']` does not exist. Aborting.") @@ -67,19 +67,17 @@ def var_embeddings( result = result.drop(result.columns[0], axis=1) start_bin = 1 - # set genes x bins to count matrix (required for embeddings and clustering) - var_by_bins = sc.AnnData(result) - # set genes x bins to .obs (required for plotting counts by distance) - var_by_bins.obs = result # rename column names for plotting - var_by_bins.obs.columns = range(start_bin, 101) + result.columns = range(start_bin, 101) # create genes x genes identity matrix - identity_df = pd.DataFrame(np.eye(len(var_by_bins.obs)), columns=var_by_bins.obs.index, dtype="category") + obs = pd.DataFrame(np.eye(len(result)), columns=result.index, dtype="category") # append identity matrix to obs column wise (required for highlighting genes in plot) - identity_df.index = var_by_bins.obs.index - var_by_bins.obs = pd.concat([var_by_bins.obs, identity_df], axis=1) + obs.index = result.index + adata.obsm["var_by_distance_X"] = result + adata.obsm["var_by_distance_obs"] = obs - return var_by_bins + if copy: + return (result, obs) def calculate_median(interval: pd.Interval) -> Any: From 67bdd5c146b16815d7cf4e7ca850770ca449b461 Mon Sep 17 00:00:00 2001 From: LLehner Date: Sun, 26 May 2024 23:16:00 +0200 Subject: [PATCH 14/22] Fix pre-commit --- src/squidpy/tl/_var_embeddings.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/squidpy/tl/_var_embeddings.py b/src/squidpy/tl/_var_embeddings.py index 2ce5e4c6d..d2e3aa026 100644 --- a/src/squidpy/tl/_var_embeddings.py +++ b/src/squidpy/tl/_var_embeddings.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Any, Union +from typing import Any, Union, Tuple import numpy as np import pandas as pd @@ -21,7 +21,7 @@ def var_embeddings( n_bins: int = 100, include_anchor: bool = False, copy: bool = False, -) -> Union[AnnData, (pd.DataFrame, pd.DataFrame)]: +) -> Union[AnnData, Tuple[pd.DataFrame, pd.DataFrame]]: """ Cluster variables by previously calculated distance to an anchor point. From 876c4edf26ab2cdf27c2f8121b5d9d1f4c1568e1 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 26 May 2024 21:16:24 +0000 Subject: [PATCH 15/22] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/squidpy/tl/_var_embeddings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/squidpy/tl/_var_embeddings.py b/src/squidpy/tl/_var_embeddings.py index d2e3aa026..e5b4bff2a 100644 --- a/src/squidpy/tl/_var_embeddings.py +++ b/src/squidpy/tl/_var_embeddings.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Any, Union, Tuple +from typing import Any, Tuple, Union import numpy as np import pandas as pd From 8ee07bafc0e02b86a9b7fd0d5d95f499574deca5 Mon Sep 17 00:00:00 2001 From: LLehner Date: Sun, 26 May 2024 23:57:17 +0200 Subject: [PATCH 16/22] Fix pre-commit --- src/squidpy/tl/_var_embeddings.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/squidpy/tl/_var_embeddings.py b/src/squidpy/tl/_var_embeddings.py index e5b4bff2a..ea9a4201a 100644 --- a/src/squidpy/tl/_var_embeddings.py +++ b/src/squidpy/tl/_var_embeddings.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Any, Tuple, Union +from typing import Any, Union import numpy as np import pandas as pd @@ -21,7 +21,7 @@ def var_embeddings( n_bins: int = 100, include_anchor: bool = False, copy: bool = False, -) -> Union[AnnData, Tuple[pd.DataFrame, pd.DataFrame]]: +) -> Union[AnnData, pd.DataFrame]: """ Cluster variables by previously calculated distance to an anchor point. From d3cefff90876ce64dcff83f7fa617363b21f942a Mon Sep 17 00:00:00 2001 From: LLehner Date: Mon, 27 May 2024 19:06:35 +0200 Subject: [PATCH 17/22] Fix pre-commit --- src/squidpy/tl/_var_embeddings.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/squidpy/tl/_var_embeddings.py b/src/squidpy/tl/_var_embeddings.py index ea9a4201a..398a847c9 100644 --- a/src/squidpy/tl/_var_embeddings.py +++ b/src/squidpy/tl/_var_embeddings.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Any, Union +from typing import Any import numpy as np import pandas as pd @@ -21,7 +21,7 @@ def var_embeddings( n_bins: int = 100, include_anchor: bool = False, copy: bool = False, -) -> Union[AnnData, pd.DataFrame]: +) -> AnnData | pd.DataFrame: """ Cluster variables by previously calculated distance to an anchor point. From 57296760d44ce609f684ac966a08e4dc2a595631 Mon Sep 17 00:00:00 2001 From: Laurens Lehner Date: Thu, 8 Aug 2024 13:11:41 +0200 Subject: [PATCH 18/22] Fix indices; Update return type --- src/squidpy/tl/_var_embeddings.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/src/squidpy/tl/_var_embeddings.py b/src/squidpy/tl/_var_embeddings.py index 398a847c9..636302546 100644 --- a/src/squidpy/tl/_var_embeddings.py +++ b/src/squidpy/tl/_var_embeddings.py @@ -20,7 +20,6 @@ def var_embeddings( design_matrix_key: str = "design_matrix", n_bins: int = 100, include_anchor: bool = False, - copy: bool = False, ) -> AnnData | pd.DataFrame: """ Cluster variables by previously calculated distance to an anchor point. @@ -60,7 +59,6 @@ def var_embeddings( aggregated_df = X_df.groupby(["distance"]).sum() # transpose the count matrix result = aggregated_df.T - # optionally include or remove variable values for distance 0 (anchor point) start_bin = 0 if not include_anchor: @@ -69,15 +67,15 @@ def var_embeddings( # rename column names for plotting result.columns = range(start_bin, 101) - # create genes x genes identity matrix - obs = pd.DataFrame(np.eye(len(result)), columns=result.index, dtype="category") - # append identity matrix to obs column wise (required for highlighting genes in plot) + # create genes x genes identity matrix (required for highlighting genes in plot) + obs = pd.DataFrame(np.eye(len(result)), columns=result.index) + obs.replace(1.0, pd.Series(obs.columns, obs.columns), inplace=True) + obs.replace(0.0, "other", inplace=True) + obs = obs.astype("category") obs.index = result.index - adata.obsm["var_by_distance_X"] = result - adata.obsm["var_by_distance_obs"] = obs + adata_new = AnnData(X=result, obs=obs, var=pd.DataFrame(index=result.columns)) - if copy: - return (result, obs) + return adata_new def calculate_median(interval: pd.Interval) -> Any: From 7dfa9332302621ab6ccb3291a764f0bbf771bf1e Mon Sep 17 00:00:00 2001 From: LLehner Date: Mon, 26 Aug 2024 19:03:02 +0200 Subject: [PATCH 19/22] Add spatialdata as input --- src/squidpy/tl/_var_embeddings.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/src/squidpy/tl/_var_embeddings.py b/src/squidpy/tl/_var_embeddings.py index 636302546..ac6ef2105 100644 --- a/src/squidpy/tl/_var_embeddings.py +++ b/src/squidpy/tl/_var_embeddings.py @@ -1,12 +1,13 @@ from __future__ import annotations -from typing import Any +from typing import Any, Optional import numpy as np import pandas as pd import scanpy as sc from anndata import AnnData from scanpy import logging as logg +from spatialdata import SpatialData from squidpy._docs import d @@ -15,20 +16,23 @@ @d.dedent def var_embeddings( - adata: AnnData, + sdata: SpatialData, + table: str, group: str, design_matrix_key: str = "design_matrix", n_bins: int = 100, include_anchor: bool = False, ) -> AnnData | pd.DataFrame: """ - Cluster variables by previously calculated distance to an anchor point. + Bin variables by previously calculated distance to an anchor point. Parameters ---------- %(adata)s + table + Name of the table in `SpatialData` object. group - Annotation column in `.obs` that is used as anchor. + Annotation column in design matrix, given by `design_matrix_key`, that is used as anchor. design_matrix_key Name of the design matrix saved to `.obsm`. n_bins @@ -40,6 +44,9 @@ def var_embeddings( If ``copy = True``, returns var by distance matrices. Otherwise, stores var by distance bin matrices in `.obsm`. """ + + adata = sdata.tables[table] + if design_matrix_key not in adata.obsm.keys(): raise ValueError(f"`.obsm['{design_matrix_key}']` does not exist. Aborting.") @@ -75,7 +82,7 @@ def var_embeddings( obs.index = result.index adata_new = AnnData(X=result, obs=obs, var=pd.DataFrame(index=result.columns)) - return adata_new + sdata.tables["var_by_dist_bins"] = adata_new def calculate_median(interval: pd.Interval) -> Any: From d6e5ecd59789010043f5c704a72b2c90ab44e586 Mon Sep 17 00:00:00 2001 From: LLehner Date: Tue, 27 Aug 2024 11:21:17 +0200 Subject: [PATCH 20/22] Update docstring --- src/squidpy/tl/_var_embeddings.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/squidpy/tl/_var_embeddings.py b/src/squidpy/tl/_var_embeddings.py index ac6ef2105..44ae3d2e6 100644 --- a/src/squidpy/tl/_var_embeddings.py +++ b/src/squidpy/tl/_var_embeddings.py @@ -41,8 +41,7 @@ def var_embeddings( Whether to include the variable counts belonging to the anchor point in the aggregation. Returns ------- - If ``copy = True``, returns var by distance matrices. - Otherwise, stores var by distance bin matrices in `.obsm`. + Stores binned count matrices in `sdata.tables["var_by_dist_bins"]`. """ adata = sdata.tables[table] From dc4028c6180bad528cbc785c61b0bd8b8f5e060c Mon Sep 17 00:00:00 2001 From: Tim Treis Date: Fri, 9 May 2025 22:32:01 +0200 Subject: [PATCH 21/22] stash --- src/squidpy/_utils.py | 14 ++ src/squidpy/pl/_var_by_distance.py | 238 +++++++++++++++++++++++++++-- src/squidpy/tl/_var_by_distance.py | 21 ++- src/squidpy/tl/_var_embeddings.py | 81 ++++++---- 4 files changed, 303 insertions(+), 51 deletions(-) diff --git a/src/squidpy/_utils.py b/src/squidpy/_utils.py index 5aa65c54d..6eee095e9 100644 --- a/src/squidpy/_utils.py +++ b/src/squidpy/_utils.py @@ -15,6 +15,8 @@ import joblib as jl import numpy as np +from anndata import AnnData +from spatialdata import SpatialData __all__ = ["singledispatchmethod", "Signal", "SigQueue", "NDArray", "NDArrayA"] @@ -337,3 +339,15 @@ def new_func2(*args: Any, **kwargs: Any) -> Any: else: raise TypeError(repr(type(reason))) + +def _get_adata_from_input(data: AnnData | SpatialData, table: str | None = None) -> None: + if isinstance(data, AnnData): + return data + elif isinstance(data, SpatialData): + if table is None: + raise ValueError("If using a SpatialData object, a table name must be provided with `table`.") + if table not in data.tables.keys(): + raise ValueError(f"Table `{table}` not found in `SpatialData` object.") + return data.tables[table] + else: + raise TypeError(f"Expected `data` to be of type `AnnData` or `SpatialData`, found `{type(data).__name__}`.") \ No newline at end of file diff --git a/src/squidpy/pl/_var_by_distance.py b/src/squidpy/pl/_var_by_distance.py index 35e113b19..4d47a10f6 100644 --- a/src/squidpy/pl/_var_by_distance.py +++ b/src/squidpy/pl/_var_by_distance.py @@ -17,8 +17,10 @@ from scanpy.plotting._tools.scatterplots import _panel_grid from scanpy.plotting._utils import _set_default_colors_for_categorical_obs from scipy.sparse import issparse +from spatialdata import SpatialData from squidpy._docs import d +from squidpy._utils import _get_adata_from_input from squidpy.pl._utils import save_fig __all__ = ["var_by_distance"] @@ -26,9 +28,10 @@ @d.dedent def var_by_distance( - adata: AnnData, + data: AnnData | SpatialData, var: str | list[str], anchor_key: str | list[str], + table: str | None = None, design_matrix_key: str = "design_matrix", color: str | None = None, covariate: str | None = None, @@ -94,6 +97,9 @@ def var_by_distance( regplot_kwargs = dict(regplot_kwargs) scatterplot_kwargs = dict(scatterplot_kwargs) + # potentially extract table from SpatialData object + adata = _get_adata_from_input(data, table) + # if several variables are plotted, make a panel grid if isinstance(var, list): fig, grid = _panel_grid( @@ -106,16 +112,16 @@ def var_by_distance( else: var = [var] - df = adata.obsm[design_matrix_key] # get design matrix + design_matrix = adata.obsm[design_matrix_key] # add var column to design matrix for name in var: if name in adata.var_names: - df[name] = ( + design_matrix[name] = ( np.array(adata[:, name].X.toarray()) if issparse(adata[:, name].X) else np.array(adata[:, name].X) ) elif name in adata.obs: - df[name] = adata.obs[name].values + design_matrix[name] = adata.obs[name].values else: raise ValueError(f"Variable {name} not found in `adata.var` or `adata.obs`.") @@ -131,7 +137,7 @@ def var_by_distance( # if no covariate is specified, 'sns.regplot' will take the values of all observations if covariate is None: sns.regplot( - data=df, + data=design_matrix, x=anchor_key, y=v, order=order, @@ -145,12 +151,12 @@ def var_by_distance( if isinstance(line_palette, str) or line_palette is None: _set_default_colors_for_categorical_obs(adata, covariate) line_palette = adata.uns[covariate + "_colors"] - covariate_instances = df[covariate].unique() + covariate_instances = design_matrix[covariate].unique() # iterate over all covariate values and make 'sns.regplot' for each for i, co in enumerate(covariate_instances): sns.regplot( - data=df.loc[df[covariate] == co], + data=design_matrix.loc[design_matrix[covariate] == co], x=anchor_key, y=v, order=order, @@ -165,15 +171,15 @@ def var_by_distance( # add scatter plot if specified if show_scatter: if color is None: - plt.scatter(data=df, x=anchor_key, y=v, color="grey", **scatterplot_kwargs) + plt.scatter(data=design_matrix, x=anchor_key, y=v, color="grey", **scatterplot_kwargs) # if variable to plot on color palette is categorical, make categorical color palette - elif df[color].dtype.name == "category": - unique_colors = df[color].unique() + elif design_matrix[color].dtype.name == "category": + unique_colors = design_matrix[color].unique() cNorm = colors.Normalize(vmin=0, vmax=len(unique_colors)) scalarMap = cm.ScalarMappable(norm=cNorm, cmap=scatter_palette) for i in range(len(unique_colors)): plt.scatter( - data=df.loc[df[color] == unique_colors[i]], + data=design_matrix.loc[design_matrix[color] == unique_colors[i]], x=anchor_key, y=v, color=scalarMap.to_rgba(i), @@ -182,7 +188,7 @@ def var_by_distance( # if variable to plot on color palette is not categorical else: plt.scatter( - data=df, + data=design_matrix, x=anchor_key, y=v, c=color, @@ -206,3 +212,211 @@ def var_by_distance( save_fig(fig, path=save, transparent=False, dpi=dpi) if return_ax: return axs + + + +# @d.dedent +# def var_by_distance( +# data: AnnData | SpatialData, +# var: str | list[str], +# table: str | None = None, +# color: str | None = None, +# covariate: str | None = None, +# order: int = 5, +# show_scatter: bool = True, +# line_palette: str | Sequence[str] | Cycler | None = None, +# scatter_palette: str | Sequence[str] | Cycler | None = "viridis", +# dpi: int | None = None, +# figsize: tuple[int, int] | None = None, +# save: str | Path | None = None, +# title: str | list[str] | None = None, +# axis_label: str | None = None, +# return_ax: bool | None = None, +# regplot_kwargs: Mapping[str, Any] = MappingProxyType({}), +# scatterplot_kwargs: Mapping[str, Any] = MappingProxyType({}), +# ) -> Axes | None: +# """ +# Plot variables using a smooth regression line with increasing distance to an anchor point. + +# Parameters +# ---------- +# data +# AnnData or SpatialData object returned by the `var_embeddings` function. +# var +# Variables (genes) to plot on y-axis. +# table +# Name of the table in `SpatialData` object. +# color +# Variable to color the scatter plot. +# covariate +# A covariate for which separate regression lines are plotted for each category. +# order +# Order of the polynomial fit for :func:`seaborn.regplot`. +# show_scatter +# Whether to show a scatter plot underlying the regression line. +# line_palette +# Categorical color palette used in case a covariate is specified. +# scatter_palette +# Color palette for the scatter plot underlying the :func:`seaborn.regplot`. +# dpi +# Dots per inch. +# figsize +# Size of the figure in inches. +# save +# Path to save the plot. +# title +# Panel titles. +# axis_label +# Panel axis labels. +# return_ax +# Whether to return :class:`matplotlib.axes.Axes` object(s). +# regplot_kwargs +# Additional keyword arguments for :func:`seaborn.regplot`. +# scatterplot_kwargs +# Additional keyword arguments for :func:`matplotlib.pyplot.scatter`. + +# Returns +# ------- +# Axes or None +# """ +# dpi = rcParams["figure.dpi"] if dpi is None else dpi +# regplot_kwargs = dict(regplot_kwargs) +# scatterplot_kwargs = dict(scatterplot_kwargs) + +# # Validate data type and extract AnnData object +# if isinstance(data, AnnData): +# adata = data +# elif isinstance(data, SpatialData): +# if table is None: +# raise ValueError("If using a SpatialData object, a table name must be provided with `table`.") +# if table not in data.tables: +# raise KeyError(f"Table '{table}' not found in SpatialData object. Available tables: {list(data.tables.keys())}") +# adata = data.tables[table] +# else: +# raise TypeError(f"Expected `data` to be of type `AnnData` or `SpatialData`, found '{type(data).__name__}'.") + +# if isinstance(var, str): +# var = [var] + +# # If multiple variables, set up a grid of plots +# if len(var) > 1: +# fig, grid = _panel_grid( +# hspace=0.25, +# wspace=0.75 / rcParams["figure.figsize"][0] + 0.02, +# ncols=4, +# num_panels=len(var), +# ) +# axs = [] +# else: +# fig, ax = plt.subplots(1, 1, figsize=figsize) +# axs = [ax] + +# # Create dataframe from adata +# df = adata.to_df() + +# # Ensure all values of `var` are in adata.obs and have float values +# for v in var: +# if v not in adata.obs_names: +# raise KeyError(f"Variable '{v}' not found in adata.obs_names") +# df.loc[v] = df.loc[v].astype(float) + +# # If color is specified and is in adata.obs or adata.var_names, add to df +# if color is not None: +# if color in adata.obs: +# df[color] = adata.obs[color].values +# elif color in adata.var_names: +# df[color] = adata[:, color].X.flatten() +# else: +# raise ValueError(f"Color variable '{color}' not found in adata.obs or adata.var_names.") + +# # If covariate is specified and is in adata.obs, add to df +# if covariate is not None: +# if covariate in adata.obs: +# df[covariate] = adata.obs[covariate].values +# else: +# raise ValueError(f"Covariate '{covariate}' not found in adata.obs.") + +# # Iterate over the variables to plot +# for i, v in enumerate(var): +# if len(var) > 1: +# ax = plt.subplot(grid[i]) +# axs.append(ax) +# else: +# ax = axs[0] + +# # if no covariate is specified, use seaborn regplot directly +# if covariate is None: +# sns.regplot( +# data=df, +# x='distance', +# y=v, +# order=order, +# color=line_palette, +# scatter=show_scatter, +# ax=ax, +# line_kws=regplot_kwargs, +# scatter_kws=scatterplot_kwargs, +# ) +# else: +# # Generate color palette if not provided +# if line_palette is None: +# _set_default_colors_for_categorical_obs(adata, covariate) +# line_palette = adata.uns[covariate + "_colors"] +# covariate_instances = df[covariate].unique() + +# # Iterate over each category in covariate +# for idx, category in enumerate(covariate_instances): +# sns.regplot( +# data=df[df[covariate] == category], +# x='distance', +# y=v, +# order=order, +# color=line_palette[idx % len(line_palette)], +# scatter=show_scatter, +# ax=ax, +# label=str(category), +# line_kws=regplot_kwargs, +# scatter_kws=scatterplot_kwargs, +# ) +# ax.legend(title=covariate) + +# # Add scatter plot if specified +# if show_scatter and color is not None: +# if df[color].dtype.name == "category": +# unique_colors = df[color].unique() +# palette = sns.color_palette(scatter_palette, len(unique_colors)) +# for idx, cat in enumerate(unique_colors): +# ax.scatter( +# x=df.loc[df[color] == cat, 'distance'], +# y=df.loc[df[color] == cat, v], +# color=palette[idx], +# label=str(cat), +# **scatterplot_kwargs, +# ) +# else: +# sc = ax.scatter( +# x=df['distance'], +# y=df[v], +# c=df[color], +# cmap=scatter_palette, +# **scatterplot_kwargs, +# ) +# fig.colorbar(sc, ax=ax) + +# if isinstance(title, list): +# ax.set_title(title[i]) +# elif title is not None: +# ax.set_title(title) +# if axis_label is None: +# ax.set_xlabel("Distance") +# ax.set_ylabel(v) +# else: +# ax.set_xlabel(axis_label) + +# if save is not None: +# save_fig(fig, path=save, transparent=False, dpi=dpi) + +# if return_ax: +# return axs if len(axs) > 1 else axs[0] +# else: +# plt.show() diff --git a/src/squidpy/tl/_var_by_distance.py b/src/squidpy/tl/_var_by_distance.py index 3f137c762..d48e48851 100644 --- a/src/squidpy/tl/_var_by_distance.py +++ b/src/squidpy/tl/_var_by_distance.py @@ -11,9 +11,10 @@ from sklearn.metrics import DistanceMetric from sklearn.neighbors import KDTree from sklearn.preprocessing import MinMaxScaler +from spatialdata import SpatialData from squidpy._docs import d -from squidpy._utils import NDArrayA +from squidpy._utils import NDArrayA, _get_adata_from_input from squidpy.gr._utils import _save_data __all__ = ["var_by_distance"] @@ -21,10 +22,11 @@ @d.dedent def var_by_distance( - adata: AnnData, + data: AnnData | SpatialData, groups: str | list[str] | NDArrayA, cluster_key: str, library_key: str | None = None, + table: str | None = None, design_matrix_key: str = "design_matrix", covariates: str | list[str] | None = None, metric: str = "euclidean", @@ -57,16 +59,19 @@ def var_by_distance( If ``copy = True``, returns the design_matrix with the distances to an anchor point Otherwise, stores design_matrix in `.obsm`. """ + # potentially extract table from SpatialData object + adata = _get_adata_from_input(data, table) + start = logg.info(f"Creating {design_matrix_key}") # list of columns which will be categorical later on - categorical_columns = [cluster_key] + # categorical_columns = [cluster_key] # save initial metadata to adata.uns if copy == False if not copy: adata.uns[design_matrix_key] = _add_metadata( cluster_key, groups, metric=metric, library_key=library_key, covariates=covariates ) - if isinstance(groups, str) or isinstance(groups, np.ndarray): + if isinstance(groups, str | np.ndarray): anchor: list[Any] = [groups] elif isinstance(groups, list): anchor = groups @@ -78,7 +83,7 @@ def var_by_distance( batch = [None] else: batch = adata.obs[library_key].unique() - categorical_columns.append(library_key) + # categorical_columns.append(library_key) batch_design_matrices = {} max_distances = {} @@ -188,8 +193,8 @@ def _add_metadata( metadata["anchor_raw"] = "custom_anchor_raw" elif isinstance(groups, list): for i, anchor in enumerate(groups): - metadata["anchor_scaled_" + str(i)] = anchor - metadata["anchor_raw_" + str(i)] = anchor + "_raw" + metadata[f"anchor_scaled_{str(i)}"] = anchor + metadata[f"anchor_raw_{str(i)}"] = anchor + "_raw" else: metadata["anchor_scaled"] = groups metadata["anchor_raw"] = groups + "_raw" @@ -205,7 +210,7 @@ def _add_metadata( if isinstance(covariates, str): covariates = [covariates] for i, covariate in enumerate(covariates): - metadata["covariate_" + str(i)] = covariate + metadata[f"covariate_{str(i)}"] = covariate return metadata diff --git a/src/squidpy/tl/_var_embeddings.py b/src/squidpy/tl/_var_embeddings.py index 44ae3d2e6..748b5c238 100644 --- a/src/squidpy/tl/_var_embeddings.py +++ b/src/squidpy/tl/_var_embeddings.py @@ -10,61 +10,82 @@ from spatialdata import SpatialData from squidpy._docs import d +from squidpy._utils import _get_adata_from_input __all__ = ["var_embeddings"] @d.dedent def var_embeddings( - sdata: SpatialData, - table: str, + data: AnnData | SpatialData, group: str, design_matrix_key: str = "design_matrix", + table: str | None = None, n_bins: int = 100, include_anchor: bool = False, -) -> AnnData | pd.DataFrame: + return_as_adata: bool = False, +) -> AnnData | None: """ Bin variables by previously calculated distance to an anchor point. Parameters ---------- - %(adata)s - table - Name of the table in `SpatialData` object. + data + AnnData or SpatialData object. group Annotation column in design matrix, given by `design_matrix_key`, that is used as anchor. design_matrix_key Name of the design matrix saved to `.obsm`. + table + Name of the table in `SpatialData` object. n_bins Number of bins to use for aggregation. include_anchor Whether to include the variable counts belonging to the anchor point in the aggregation. + return_as_adata + Only evaluated, if `data` is a SpatialData object. Whether to return the result or store it as a new table. + Returns ------- - Stores binned count matrices in `sdata.tables["var_by_dist_bins"]`. + AnnData or None + If `data` is an `AnnData` object or `return_as_adata` is True, returns the new `AnnData` object. + If `data` is a `SpatialData` object and `return_as_adata` is False, modifies `data` in place and returns None. """ - adata = sdata.tables[table] + adata = _get_adata_from_input(data, table) + + if design_matrix_key not in adata.obsm: + raise KeyError(f"Design matrix key '{design_matrix_key}' not found in .obsm. Available keys are: {list(adata.obsm.keys())}") + + design_matrix = adata.obsm[design_matrix_key].copy() + if group not in design_matrix.columns: + raise KeyError(f"Group column '{group}' not found in design matrix. Available columns: {list(design_matrix.columns)}") + if not pd.api.types.is_numeric_dtype(design_matrix[group]): + raise TypeError(f"The group column '{group}' must be numeric.") - if design_matrix_key not in adata.obsm.keys(): - raise ValueError(f"`.obsm['{design_matrix_key}']` does not exist. Aborting.") logg.info("Calculating embeddings for distance aggregations by gene.") - df = adata.obsm[design_matrix_key].copy() - # bin the data by distance - df["bins"] = pd.cut(df[group], bins=n_bins) - # get median value of each interval - df["median_value"] = df["bins"].apply(calculate_median) + # bin the data by distance and calculate the median distance for each bin + intervals = pd.cut(design_matrix[group], bins=n_bins) + + # Extract the interval bounds as tuples and midpoints in a single pass + design_matrix["bins"] = [(interval.left, interval.right) if pd.notnull(interval) else (0.0, 0.0) for interval in intervals] + design_matrix["median_value"] = [interval.mid if pd.notnull(interval) else 0.0 for interval in intervals] + + # turn categorical NaNs into float 0s - df["median_value"] = pd.to_numeric(df["median_value"], errors="coerce").fillna(0).astype(float) + design_matrix["median_value"] = pd.to_numeric(design_matrix["median_value"], errors="coerce").fillna(0).astype(float) + # get count matrix and add binned distance to each .obs X_df = adata.to_df() - X_df["distance"] = df["median_value"] + X_df["distance"] = design_matrix["median_value"] + # aggregate the count matrix by the bins aggregated_df = X_df.groupby(["distance"]).sum() - # transpose the count matrix + result = aggregated_df.T + # optionally include or remove variable values for distance 0 (anchor point) start_bin = 0 if not include_anchor: @@ -72,19 +93,17 @@ def var_embeddings( start_bin = 1 # rename column names for plotting - result.columns = range(start_bin, 101) - # create genes x genes identity matrix (required for highlighting genes in plot) - obs = pd.DataFrame(np.eye(len(result)), columns=result.index) - obs.replace(1.0, pd.Series(obs.columns, obs.columns), inplace=True) - obs.replace(0.0, "other", inplace=True) - obs = obs.astype("category") - obs.index = result.index - adata_new = AnnData(X=result, obs=obs, var=pd.DataFrame(index=result.columns)) - - sdata.tables["var_by_dist_bins"] = adata_new + result.columns = range(start_bin, n_bins + 1) + adata_new = AnnData(X=result) + adata_new.uns[design_matrix_key] = design_matrix -def calculate_median(interval: pd.Interval) -> Any: - median = interval.mid + if isinstance(data, AnnData): + return adata_new + elif isinstance(data, SpatialData): + if return_as_adata: + return adata_new + else: + data.tables["var_by_dist_bins"] = adata_new + return None - return median From f9149208d1aa2f84d69d25b1b6fd54b5ce0dc46f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 9 May 2025 20:32:43 +0000 Subject: [PATCH 22/22] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/squidpy/_utils.py | 3 ++- src/squidpy/tl/_var_embeddings.py | 19 ++++++++++++------- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/src/squidpy/_utils.py b/src/squidpy/_utils.py index c209c6bc4..b9d220d53 100644 --- a/src/squidpy/_utils.py +++ b/src/squidpy/_utils.py @@ -340,6 +340,7 @@ def new_func2(*args: Any, **kwargs: Any) -> Any: else: raise TypeError(repr(type(reason))) + def _get_adata_from_input(data: AnnData | SpatialData, table: str | None = None) -> None: if isinstance(data, AnnData): return data @@ -350,4 +351,4 @@ def _get_adata_from_input(data: AnnData | SpatialData, table: str | None = None) raise ValueError(f"Table `{table}` not found in `SpatialData` object.") return data.tables[table] else: - raise TypeError(f"Expected `data` to be of type `AnnData` or `SpatialData`, found `{type(data).__name__}`.") \ No newline at end of file + raise TypeError(f"Expected `data` to be of type `AnnData` or `SpatialData`, found `{type(data).__name__}`.") diff --git a/src/squidpy/tl/_var_embeddings.py b/src/squidpy/tl/_var_embeddings.py index 748b5c238..4e5681457 100644 --- a/src/squidpy/tl/_var_embeddings.py +++ b/src/squidpy/tl/_var_embeddings.py @@ -55,27 +55,33 @@ def var_embeddings( adata = _get_adata_from_input(data, table) if design_matrix_key not in adata.obsm: - raise KeyError(f"Design matrix key '{design_matrix_key}' not found in .obsm. Available keys are: {list(adata.obsm.keys())}") + raise KeyError( + f"Design matrix key '{design_matrix_key}' not found in .obsm. Available keys are: {list(adata.obsm.keys())}" + ) design_matrix = adata.obsm[design_matrix_key].copy() if group not in design_matrix.columns: - raise KeyError(f"Group column '{group}' not found in design matrix. Available columns: {list(design_matrix.columns)}") + raise KeyError( + f"Group column '{group}' not found in design matrix. Available columns: {list(design_matrix.columns)}" + ) if not pd.api.types.is_numeric_dtype(design_matrix[group]): raise TypeError(f"The group column '{group}' must be numeric.") - logg.info("Calculating embeddings for distance aggregations by gene.") # bin the data by distance and calculate the median distance for each bin intervals = pd.cut(design_matrix[group], bins=n_bins) # Extract the interval bounds as tuples and midpoints in a single pass - design_matrix["bins"] = [(interval.left, interval.right) if pd.notnull(interval) else (0.0, 0.0) for interval in intervals] + design_matrix["bins"] = [ + (interval.left, interval.right) if pd.notnull(interval) else (0.0, 0.0) for interval in intervals + ] design_matrix["median_value"] = [interval.mid if pd.notnull(interval) else 0.0 for interval in intervals] - # turn categorical NaNs into float 0s - design_matrix["median_value"] = pd.to_numeric(design_matrix["median_value"], errors="coerce").fillna(0).astype(float) + design_matrix["median_value"] = ( + pd.to_numeric(design_matrix["median_value"], errors="coerce").fillna(0).astype(float) + ) # get count matrix and add binned distance to each .obs X_df = adata.to_df() @@ -106,4 +112,3 @@ def var_embeddings( else: data.tables["var_by_dist_bins"] = adata_new return None -