Commit 5d9c9d0
Author: Mohamed-Elyes Kanoun (committed)
Message: Adding random landmarking -
1 parent e169ec5, commit 5d9c9d0

File tree: 1 file changed (+10, -21 lines)

graphtools/graphs.py

Lines changed: 10 additions & 21 deletions
@@ -638,26 +638,15 @@ def _data_transitions(self):
     def build_landmark_op(self):
         """Build the landmark operator
 
-        Randomly selects n_landmark points as landmarks, then assigns each point to its nearest landmark.
-
-        with _logger.log_task("landmark operator"):
-            is_sparse = sparse.issparse(self.kernel)
-            # spectral clustering
-            with _logger.log_task("SVD"):
-                _, _, VT = randomized_svd(
-                    self.diff_aff,
-                    n_components=self.n_svd,
-                    random_state=self.random_state,
-                )
-            with _logger.log_task("KMeans"):
-                kmeans = MiniBatchKMeans(
-                    self.n_landmark,
-                    init_size=3 * self.n_landmark,
-                    n_init=1,
-                    batch_size=10000,
-                    random_state=self.random_state,
-                )
-                self._clusters = kmeans.fit_predict(self.diff_op.dot(VT.T))
+
+        Calculates spectral clusters on the kernel, and calculates transition
+        probabilities between cluster centers by using transition probabilities
+        between samples assigned to each cluster.
+
+        random_landmarking:
+        This method randomly selects n_landmark points and assigns each sample to its nearest landmark
+        using Euclidean distance.
+
 
         """
         if self.random_landmarking :
@@ -666,7 +655,7 @@ def build_landmark_op(self):
             n_samples = self.data.shape[0]
             rng = np.random.default_rng(self.random_state)
             landmark_indices = rng.choice(n_samples, self.n_landmark, replace=False)
-            data = self.data if not hasattr(self, 'data_nu') else self.data_nu
+            data = self.data if not hasattr(self, 'data_nu') else self.data_nu  # because of the scaling to review
             distances = cdist(data, data[landmark_indices], metric="euclidean")
             if n_samples > 5000:  # sklearn.euclidean_distances is faster than cdist for big dataset
                 distances = euclidean_distances(data, data[landmark_indices])
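
For context, the random-landmarking step added by this commit can be exercised on its own. The sketch below is illustrative and not part of graphtools: the helper name assign_random_landmarks, the argmin assignment step, and the toy data are assumptions; only the uniform random choice of n_landmark rows and the cdist / euclidean_distances computation mirror the code in the diff above.

import numpy as np
from scipy.spatial.distance import cdist
from sklearn.metrics.pairwise import euclidean_distances


def assign_random_landmarks(data, n_landmark, random_state=None):
    """Hypothetical standalone helper: pick n_landmark rows at random and
    assign every sample to its nearest landmark by Euclidean distance."""
    n_samples = data.shape[0]
    rng = np.random.default_rng(random_state)
    landmark_indices = rng.choice(n_samples, n_landmark, replace=False)

    # The commit switches to sklearn's euclidean_distances above 5000 samples,
    # where it tends to be faster than scipy's cdist.
    if n_samples > 5000:
        distances = euclidean_distances(data, data[landmark_indices])
    else:
        distances = cdist(data, data[landmark_indices], metric="euclidean")

    # Nearest landmark per sample; this assignment step is assumed here,
    # since the diff does not show how `distances` is consumed afterwards.
    clusters = distances.argmin(axis=1)
    return landmark_indices, clusters


# Toy usage: 2000 samples, 100 random landmarks.
X = np.random.default_rng(0).normal(size=(2000, 10))
landmarks, clusters = assign_random_landmarks(X, n_landmark=100, random_state=42)
print(landmarks.shape, clusters.shape)  # (100,), (2000,)

Compared with the removed spectral path (randomized_svd followed by MiniBatchKMeans on the diffusion operator), this approach skips clustering entirely: landmarks are chosen uniformly at random and assignment reduces to one n_samples x n_landmark distance matrix plus an argmin, which is simpler and typically cheaper to build, at the cost of landmarks that are not informed by the diffusion geometry.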
