quadbio
diff --git a/‎.gitignore‎
Lines changed: 2 additions & 1 deletion b/‎.gitignore‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎docs/notebooks/tutorials/spatial_mapping.ipynb‎
Lines changed: 49 additions & 47 deletions b/‎docs/notebooks/tutorials/spatial_mapping.ipynb‎
Lines changed: 49 additions & 47 deletions
diff --git a/‎docs/notebooks/tutorials/spatial_smoothing.ipynb‎
Lines changed: 176 additions & 126 deletions b/‎docs/notebooks/tutorials/spatial_smoothing.ipynb‎
Lines changed: 176 additions & 126 deletions
diff --git a/‎docs/references.bib‎
Lines changed: 24 additions & 0 deletions b/‎docs/references.bib‎
Lines changed: 24 additions & 0 deletions
diff --git a/‎pyproject.toml‎
Lines changed: 7 additions & 8 deletions b/‎pyproject.toml‎
Lines changed: 7 additions & 8 deletions
diff --git a/‎src/cellmapper/__init__.py‎
Lines changed: 2 additions & 2 deletions b/‎src/cellmapper/__init__.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎src/cellmapper/check.py‎
Lines changed: 2 additions & 0 deletions b/‎src/cellmapper/check.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎src/cellmapper/constants.py‎
Lines changed: 4 additions & 0 deletions b/‎src/cellmapper/constants.py‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎src/cellmapper/cellmapper.py‎ ‎src/cellmapper/model/cellmapper.py‎src/cellmapper/cellmapper.py renamed to src/cellmapper/model/cellmapper.py
Lines changed: 72 additions & 65 deletions b/‎src/cellmapper/cellmapper.py‎ ‎src/cellmapper/model/cellmapper.py‎src/cellmapper/cellmapper.py renamed to src/cellmapper/model/cellmapper.py
Lines changed: 72 additions & 65 deletions
@@ -20,5 +20,6 @@ __pycache__/
 /docs/_build/
 .ipynb_checkpoints/
 
-# datasets
+# datasets and models
 *.h5ad
+*.pt
@@ -232,3 +232,27 @@ @article{lopez2018deep
   publisher={Nature Publishing Group US New York},
   url={https://www.nature.com/articles/s41592-018-0229-2},
 }
+
+@article{stuart2019comprehensive,
+  title={Comprehensive integration of single-cell data},
+  author={Stuart, Tim and Butler, Andrew and Hoffman, Paul and Hafemeister, Christoph and Papalexi, Efthymia and Mauck III, William M and Hao, Yuhan and Stoeckius, Marlon and Smibert, Peter and Satija, Rahul},
+  journal={Cell},
+  volume={177},
+  number={7},
+  pages={1888--1902},
+  year={2019},
+  publisher={Elsevier},
+  url={https://www.sciencedirect.com/science/article/pii/S0092867419305598},
+}
+
+@article{xia2023spatial,
+  title={Spatial-linked alignment tool (SLAT) for aligning heterogenous slices},
+  author={Xia, Chen-Rui and Cao, Zhi-Jie and Tu, Xin-Ming and Gao, Ge},
+  journal={Nature Communications},
+  volume={14},
+  number={1},
+  pages={7236},
+  year={2023},
+  publisher={Nature Publishing Group UK London},
+  url={https://www.nature.com/articles/s41467-023-43105-5},
+}
@@ -23,13 +23,12 @@ classifiers = [
 ]
 dynamic = [ "version" ]
 dependencies = [
-  "anndata",
+  "anndata>=0.11",
   "numpy",
   "packaging",
   "pandas",
-  "pynndescent",
   "rich",
-  "scanpy",
+  "scanpy>=1.11",
   "scikit-learn",
   "scipy",
   # for debug logging (referenced from the issue template)
@@ -58,17 +57,17 @@ optional-dependencies.doc = [
 optional-dependencies.test = [
   "coverage",
   "pytest",
-  "squidpy",
+  "squidpy>=1.6",
 ]
 optional-dependencies.tutorials = [
-  "cellmapper",
   "harmony-pytorch",
+  "igraph",
   "netgraph",
   "python-louvain",
-  "scvi-tools",
+  "scvi-tools>=1.3",
   "seaborn",
-  "sopa",
-  "squidpy",
+  "sopa>=2",
+  "squidpy>=1.6",
 ]
 
 # https://docs.pypi.org/project_metadata/#project-urls
 
@@ -1,8 +1,8 @@
 from importlib.metadata import version
 
-from .cellmapper import CellMapper
-from .knn import Neighbors
 from .logging import logger
+from .model.cellmapper import CellMapper
+from .model.knn import Neighbors
 
 __all__ = ["logger", "CellMapper", "Neighbors"]
 
 
@@ -63,12 +63,14 @@ def check(self) -> None:
     "https://docs.rapids.ai/install/.",
     faiss="To speed up k-NN search on GPU, you may install faiss following the guide from "
     "https://github.com/facebookresearch/faiss/blob/main/INSTALL.md",
+    pynndescent="To use fast approximate k-NN search, install pynndescent: pip install pynndescent",
 )
 
 CHECKERS = {
     "cuml": Checker("cuml", vmin=None, install_hint=INSTALL_HINTS.cuml),
     "cupy": Checker("cupy", vmin=None, install_hint=INSTALL_HINTS.cupy),
     "faiss": Checker("faiss", package_name="faiss", vmin="1.7.0", install_hint=INSTALL_HINTS.faiss),
+    "pynndescent": Checker("pynndescent", vmin=None, install_hint=INSTALL_HINTS.pynndescent),
 }
 
 
 
@@ -0,0 +1,4 @@
+class PackageConstants:
+    """Constants used throughout the package."""
+
+    n_comps: int = 50
@@ -3,22 +3,21 @@
 import gc
 from typing import Any, Literal
 
-import anndata as ad
 import numpy as np
 import pandas as pd
 import scanpy as sc
 from anndata import AnnData
 from scipy.sparse import coo_matrix, csc_matrix, csr_matrix
 from sklearn.preprocessing import OneHotEncoder
 
-from cellmapper.evaluate import CellMapperEvaluationMixin
 from cellmapper.logging import logger
-from cellmapper.utils import create_imputed_anndata
+from cellmapper.model.embedding import EmbeddingMixin
+from cellmapper.model.evaluate import EvaluationMixin
+from cellmapper.model.knn import Neighbors
+from cellmapper.utils import create_imputed_anndata, get_n_comps
 
-from .knn import Neighbors
 
-
-class CellMapper(CellMapperEvaluationMixin):
+class CellMapper(EvaluationMixin, EmbeddingMixin):
     """Mapping of labels, embeddings, and expression values between reference and query datasets."""
 
     def __init__(self, query: AnnData, reference: AnnData | None = None) -> None:
@@ -137,79 +136,59 @@ def _validate_and_normalize_mapping_matrix(
 
         return mapping_matrix
 
-    def compute_joint_pca(self, n_components: int = 50, key_added: str = "pca_joint", **kwargs) -> None:
-        """
-        Compute a joint PCA on the normalized .X matrices of query and reference, using only overlapping genes.
-
-        Parameters
-        ----------
-        n_components
-            Number of principal components to compute.
-        key_added
-            Key under which to store the joint PCA embeddings in `.obsm` of both query and reference AnnData objects.
-        **kwargs
-            Additional keyword arguments to pass to scanpy's `pp.pca` function.
-
-        Notes
-        -----
-        This method performs an inner join on genes (variables) between the query and reference AnnData objects,
-        concatenates the normalized expression matrices, and computes a joint PCA using Scanpy. The resulting
-        PCA embeddings are stored in `.obsm[key_added]` for both objects. This is a fallback and not recommended
-        for most use cases. Please provide a biologically meaningful representation if possible.
-        """
-        logger.warning(
-            "No representation provided (use_rep=None). "
-            "Falling back to joint PCA on normalized .X of both datasets using only overlapping genes. "
-            "This is NOT recommended for most use cases! Please provide a biologically meaningful representation."
-        )
-        # Concatenate with inner join on genes
-        joint = ad.concat([self.reference, self.query], join="inner", label="batch", keys=["reference", "query"])
-
-        # Compute PCA using scanpy
-        sc.pp.pca(joint, n_comps=n_components, **kwargs)
-
-        # Assign PCA embeddings back to each object using the batch key
-        self.reference.obsm[key_added] = joint.obsm["X_pca"][joint.obs["batch"] == "reference"]
-        self.query.obsm[key_added] = joint.obsm["X_pca"][joint.obs["batch"] == "query"]
-        logger.info(
-            "Joint PCA computed and stored as '%s' in both reference.obsm and query.obsm. "
-            "Proceeding to use this as the representation for neighbor search.",
-            key_added,
-        )
-
     def compute_neighbors(
         self,
         n_neighbors: int = 30,
         use_rep: str | None = None,
+        n_comps: int | None = None,
         method: Literal["sklearn", "pynndescent", "rapids", "faiss"] = "sklearn",
         metric: str = "euclidean",
         only_yx: bool = False,
-        joint_pca_key: str = "pca_joint",
-        n_pca_components: int = 50,
-        pca_kwargs: dict[str, Any] | None = None,
+        fallback_representation: Literal["fast_cca", "joint_pca"] = "fast_cca",
+        fallback_kwargs: dict[str, Any] | None = None,
     ) -> None:
         """
         Compute nearest neighbors between reference and query datasets.
 
+        The method computes k-nearest neighbor graphs to enable mapping between
+        datasets. If no representation is provided (`use_rep=None`), a fallback
+        representation will be computed automatically using either fast CCA
+        (inspired by Seurat v3 :cite:`stuart2019comprehensive`), or joint PCA. In self-mapping mode,
+        a simple PCA will be computed on the query dataset.
+
         Parameters
         ----------
         n_neighbors
             Number of nearest neighbors.
         use_rep
-            Data representation based on which to find nearest neighbors. If None, a joint PCA will be computed.
+            Data representation based on which to find nearest neighbors. If None,
+            a fallback representation will be computed automatically.
+        n_comps
+            Number of components to use. If a pre-computed representation is provided via `use_rep`,
+            we will use the number of components from that representation. Otherwise, if `use_rep=None`,
+            we will compute the given number of components using the fallback representation method.
         method
-            Method to use for computing neighbors. "sklearn" and "pynndescent" run on CPU, "rapids" and "faiss" run on GPU. Note that all but "pynndescent" perform exact neighbor search. With GPU acceleration, "faiss" is usually fastest and more memory efficient than "rapids".
-            All methods return exactly `n_neighbors` neighbors, including the reference cell itself (in self-mapping mode). For faiss and sklearn, distances to self are very small positive numbers, for rapids and sklearn, they are exactly 0.
+            Method to use for computing neighbors. "sklearn" and "pynndescent" run on CPU,
+            "rapids" and "faiss" run on GPU. Note that all but "pynndescent" perform exact
+            neighbor search. With GPU acceleration, "faiss" is usually fastest and more
+            memory efficient than "rapids". All methods return exactly `n_neighbors` neighbors,
+            including the reference cell itself (in self-mapping mode). For faiss and sklearn,
+            distances to self are very small positive numbers, for rapids and sklearn, they are exactly 0.
         metric
             Distance metric to use for nearest neighbors.
         only_yx
-            If True, only compute the xy neighbors. This is faster, but not suitable for Jaccard or HNOCA methods.
-        joint_pca_key
-            Key under which to store the joint PCA embeddings if use_rep is None.
-        n_pca_components
-            Number of principal components to compute for joint PCA if use_rep is None.
-        pca_kwargs
-            Additional keyword arguments to pass to scanpy's `pp.pca` function if use_rep is None.
+            If True, only compute the xy neighbors. This is faster, but not suitable for
+            Jaccard or HNOCA methods.
+        fallback_representation
+            Method to use for computing a cross-dataset representation when `use_rep=None`. Options:
+
+            - "fast_cca": Fast canonical correlation analysis (inspired by Seurat v3 :cite:`stuart2019comprehensive` and
+              SLAT :cite:`xia2023spatial`).
+            - "joint_pca": Principal component analysis on concatenated datasets.
+        fallback_kwargs
+            Additional keyword arguments to pass to the fallback representation method.
+            For "fast_cca": see :meth:`~cellmapper.EmbeddingMixin.compute_fast_cca`.
+            For "joint_pca": see :meth:`~cellmapper.EmbeddingMixin.compute_joint_pca`.
 
         Returns
         -------
@@ -223,22 +202,50 @@ def compute_neighbors(
         - ``n_neighbors``: Number of nearest neighbors.
         - ``only_yx``: Whether only yx neighbors were computed.
         """
+        # Default to an empty dict so `key_added` can be popped and the rest unpacked below
+        if fallback_kwargs is None:
+            fallback_kwargs = {}
+
         self.only_yx = only_yx
+
         if use_rep is None:
-            if pca_kwargs is None:
-                pca_kwargs = {}
+            logger.warning(
+                "No representation provided (`use_rep=None`). Computing a joint representation automatically "
+                "using '%s'. For optimal results, consider pre-computing a representation and passing it to `use_rep`.",
+                fallback_representation,
+            )
+
             if self._is_self_mapping:
-                sc.pp.pca(self.query, n_comps=n_pca_components, **pca_kwargs)
-                use_rep = "X_pca"
+                logger.info("Self-mapping detected. Computing PCA on query dataset for representation.")
+                key_added = fallback_kwargs.pop("key_added", "X_pca")
+                sc.tl.pca(self.query, n_comps=n_comps, key_added=key_added, **fallback_kwargs)
             else:
-                self.compute_joint_pca(n_components=n_pca_components, key_added=joint_pca_key, **pca_kwargs)
-                use_rep = joint_pca_key
+                if fallback_representation == "fast_cca":
+                    key_added = fallback_kwargs.pop("key_added", "X_cca")
+                    self.compute_fast_cca(n_comps=n_comps, key_added=key_added, **fallback_kwargs)
+                elif fallback_representation == "joint_pca":
+                    key_added = fallback_kwargs.pop("key_added", "X_pca")
+                    self.compute_joint_pca(n_comps=n_comps, key_added=key_added, **fallback_kwargs)
+                else:
+                    raise ValueError(
+                        f"Unknown fallback_representation: {fallback_representation}. "
+                        "Supported options are 'fast_cca' and 'joint_pca'."
+                    )
+            use_rep = key_added
+
+        # Extract the representation from the query and reference datasets
         if use_rep == "X":
             xrep = self.reference.X
             yrep = self.query.X
         else:
             xrep = self.reference.obsm[use_rep]
             yrep = self.query.obsm[use_rep]
+
+        # handle the number of components
+        n_comps = get_n_comps(n_comps, n_vars=xrep.shape[1])
+        xrep = xrep[:, :n_comps]
+        yrep = yrep[:, :n_comps]
+
         self.knn = Neighbors(np.ascontiguousarray(xrep), np.ascontiguousarray(yrep))
         self.knn.compute_neighbors(n_neighbors=n_neighbors, method=method, metric=metric, only_yx=only_yx)
Original file line number	Diff line number	Diff line change
`@@ -63,12 +63,14 @@ def check(self) -> None:`
`63`	`63`	`"https://docs.rapids.ai/install/.",`
`64`	`64`	`faiss="To speed up k-NN search on GPU, you may install faiss following the guide from "`
`65`	`65`	`"https://github.com/facebookresearch/faiss/blob/main/INSTALL.md",`
	`66`	`+ pynndescent="To use fast approximate k-NN search, install pynndescent: pip install pynndescent",`
`66`	`67`	`)`
`67`	`68`
`68`	`69`	`CHECKERS = {`
`69`	`70`	`"cuml": Checker("cuml", vmin=None, install_hint=INSTALL_HINTS.cuml),`
`70`	`71`	`"cupy": Checker("cupy", vmin=None, install_hint=INSTALL_HINTS.cupy),`
`71`	`72`	`"faiss": Checker("faiss", package_name="faiss", vmin="1.7.0", install_hint=INSTALL_HINTS.faiss),`
	`73`	`+ "pynndescent": Checker("pynndescent", vmin=None, install_hint=INSTALL_HINTS.pynndescent),`
`72`	`74`	`}`
`73`	`75`
`74`	`76`