@@ -13,6 +13,8 @@
 from sklearn.neighbors import NearestNeighbors
 from sklearn.preprocessing import normalize
 from sklearn.utils.extmath import randomized_svd
+from sklearn.metrics.pairwise import euclidean_distances
+
 
 import numbers
 import numpy as np
@@ -82,7 +84,6 @@ def __init__(
         n_pca=None,
         **kwargs,
     ):
-
         if decay is not None:
             if thresh <= 0 and knn_max is None:
                 raise ValueError(
@@ -489,7 +490,9 @@ class LandmarkGraph(DataGraph):
     >>> X_full = G.interpolate(X_landmark)
     """
 
-    def __init__(self, data, n_landmark=2000, n_svd=100, **kwargs):
+    def __init__(
+        self, data, n_landmark=2000, n_svd=100, random_landmarking=False, **kwargs
+    ):
         """Initialize a landmark graph.
 
         Raises
@@ -508,6 +511,7 @@ def __init__(self, data, n_landmark=2000, n_svd=100, **kwargs):
                 "using kNNGraph or lower n_svd".format(n_svd, data.shape[0]),
                 RuntimeWarning,
             )
+        self.random_landmarking = random_landmarking
         self.n_landmark = n_landmark
         self.n_svd = n_svd
         super().__init__(data, **kwargs)
@@ -637,28 +641,48 @@ def _data_transitions(self):
     def build_landmark_op(self):
         """Build the landmark operator
 
         Calculates spectral clusters on the kernel, and calculates transition
         probabilities between cluster centers by using transition probabilities
         between samples assigned to each cluster.
+
+        random_landmarking:
+            If True, randomly select n_landmark points and assign each sample
+            to its nearest landmark under the graph's distance metric
+            (self.distance), instead of running spectral clustering.
+
         """
         with _logger.log_task("landmark operator"):
             is_sparse = sparse.issparse(self.kernel)
-            # spectral clustering
-            with _logger.log_task("SVD"):
-                _, _, VT = randomized_svd(
-                    self.diff_aff,
-                    n_components=self.n_svd,
-                    random_state=self.random_state,
-                )
-            with _logger.log_task("KMeans"):
-                kmeans = MiniBatchKMeans(
-                    self.n_landmark,
-                    init_size=3 * self.n_landmark,
-                    n_init=1,
-                    batch_size=10000,
-                    random_state=self.random_state,
-                )
-                self._clusters = kmeans.fit_predict(self.diff_op.dot(VT.T))
+
+            if self.random_landmarking:
+                # assign each sample to the nearest of n_landmark randomly
+                # chosen points
+                n_samples = self.data.shape[0]
+                rng = np.random.default_rng(self.random_state)
+                landmark_indices = rng.choice(n_samples, self.n_landmark, replace=False)
+                data = self.data if not hasattr(self, "data_nu") else self.data_nu
+                # Possible future optimization for the Euclidean case:
+                # sklearn's euclidean_distances is faster than cdist on large datasets.
+                # if n_samples > 5000 and self.distance == "euclidean":
+                #     distances = euclidean_distances(data, data[landmark_indices])
+                distances = cdist(data, data[landmark_indices], metric=self.distance)
+                self._clusters = np.argmin(distances, axis=1)
+            else:
+                # spectral clustering
+                with _logger.log_task("SVD"):
+                    _, _, VT = randomized_svd(
+                        self.diff_aff,
+                        n_components=self.n_svd,
+                        random_state=self.random_state,
+                    )
+                with _logger.log_task("KMeans"):
+                    kmeans = MiniBatchKMeans(
+                        self.n_landmark,
+                        init_size=3 * self.n_landmark,
+                        n_init=1,
+                        batch_size=10000,
+                        random_state=self.random_state,
+                    )
+                    self._clusters = kmeans.fit_predict(self.diff_op.dot(VT.T))
 
             # transition matrices
             pmn = self._landmarks_to_data()
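For reference, a minimal usage sketch of the new flag. This is an illustrative example, not part of the PR; it assumes random_landmarking is forwarded through graphtools.Graph's kwargs like the other LandmarkGraph options, and the data and parameter values are arbitrary.

import numpy as np
import graphtools

# Toy data: 5000 samples, 50 features (illustrative values).
X = np.random.default_rng(42).normal(size=(5000, 50))

# Passing n_landmark selects the landmark code path; random_landmarking=True
# takes the new branch: nearest-random-landmark assignment instead of the
# SVD + MiniBatchKMeans spectral clustering.
G = graphtools.Graph(X, n_landmark=500, random_landmarking=True, random_state=42)

print(G.landmark_op.shape)  # (500, 500): transition matrix between landmarks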