Skip to content

Commit 2d65034

Browse files
committed
added random_landmarking support for precomputed distance/affinity
1 parent 785c0c1 commit 2d65034

File tree

2 files changed

+78
-9
lines changed

2 files changed

+78
-9
lines changed

graphtools/graphs.py

Lines changed: 38 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1201,16 +1201,45 @@ def build_landmark_op(self):
12011201
n_samples = self.data.shape[0]
12021202
rng = np.random.default_rng(self.random_state)
12031203
landmark_indices = rng.choice(n_samples, self.n_landmark, replace=False)
1204-
data = (
1205-
self.data if not hasattr(self, "data_nu") else self.data_nu
1206-
) # because of the scaling to review
1207-
if (
1208-
n_samples > 5000 and self.distance == "euclidean"
1209-
): # sklearn.euclidean_distances is faster than cdist for big dataset
1210-
distances = euclidean_distances(data, data[landmark_indices])
1204+
precomputed = getattr(self, "precomputed", None)
1205+
1206+
if precomputed is not None:
1207+
# Use the precomputed affinities/distances directly to avoid Euclidean fallback
1208+
landmark_affinities = self.kernel[:, landmark_indices]
1209+
1210+
if sparse.issparse(landmark_affinities):
1211+
landmark_affinities = landmark_affinities.tocsr()
1212+
cluster_assignments = np.asarray(
1213+
landmark_affinities.argmax(axis=1)
1214+
).reshape(-1)
1215+
row_max = matrix.to_array(
1216+
landmark_affinities.max(axis=1)
1217+
).reshape(-1)
1218+
else:
1219+
landmark_affinities = np.asarray(landmark_affinities)
1220+
cluster_assignments = np.argmax(landmark_affinities, axis=1)
1221+
row_max = np.max(landmark_affinities, axis=1)
1222+
1223+
if np.any(row_max == 0):
1224+
warnings.warn(
1225+
"Some samples have zero affinity to all randomly selected landmarks; "
1226+
"increase n_landmark or ensure the affinity matrix connects all points.",
1227+
RuntimeWarning,
1228+
)
1229+
self._clusters = cluster_assignments
12111230
else:
1212-
distances = cdist(data, data[landmark_indices], metric=self.distance)
1213-
self._clusters = np.argmin(distances, axis=1)
1231+
data = (
1232+
self.data if not hasattr(self, "data_nu") else self.data_nu
1233+
) # because of the scaling to review
1234+
if (
1235+
n_samples > 5000 and self.distance == "euclidean"
1236+
): # sklearn.euclidean_distances is faster than cdist for big dataset
1237+
distances = euclidean_distances(data, data[landmark_indices])
1238+
else:
1239+
distances = cdist(
1240+
data, data[landmark_indices], metric=self.distance
1241+
)
1242+
self._clusters = np.argmin(distances, axis=1)
12141243

12151244
else:
12161245
with _logger.log_task("SVD"):

test/test_random_landmarking.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -405,6 +405,46 @@ def test_random_landmarking_distance_parameter_consistency():
405405
assert len(G.clusters) == small_data.shape[0]
406406

407407

408+
def test_random_landmarking_with_precomputed_affinity():
    """Precomputed affinity input must be honored by random landmarking.

    Builds a small symmetric affinity matrix, constructs a graph with
    random landmarking enabled, then checks that every sample is assigned
    to the landmark with which it shares the largest affinity.
    """
    n_landmark = 3
    seed = 42
    affinity = np.array(
        [
            [1.0, 0.8, 0.1, 0.0, 0.0, 0.0],
            [0.8, 1.0, 0.2, 0.0, 0.0, 0.0],
            [0.1, 0.2, 1.0, 0.9, 0.4, 0.0],
            [0.0, 0.0, 0.9, 1.0, 0.5, 0.2],
            [0.0, 0.0, 0.4, 0.5, 1.0, 0.9],
            [0.0, 0.0, 0.0, 0.2, 0.9, 1.0],
        ]
    )
    # Symmetrize explicitly so the precomputed input is a valid kernel.
    affinity = (affinity + affinity.T) / 2

    G = graphtools.Graph(
        affinity,
        precomputed="affinity",
        n_landmark=n_landmark,
        random_landmarking=True,
        random_state=seed,
        knn=3,
        thresh=0,
    )

    # Landmark construction is lazy; access the operator to force it.
    _ = G.landmark_op

    # Re-derive the landmark sample with an identical RNG stream, then
    # compute the expected argmax-based cluster assignment per sample.
    landmarks = np.random.default_rng(seed).choice(
        affinity.shape[0], n_landmark, replace=False
    )
    expected = np.asarray(G.kernel[:, landmarks].argmax(axis=1)).reshape(-1)

    assert np.array_equal(G.clusters, expected)
    assert G.transitions.shape == (affinity.shape[0], n_landmark)
    assert G.landmark_op.shape == (n_landmark, n_landmark)
446+
447+
408448
#############
409449
# Test API
410450
#############

0 commit comments

Comments (0)