Skip to content

Commit e169ec5

Browse files
author
Mohamed-Elyes Kanoun
committed
Adding random landmarking
1 parent ec5e9b1 commit e169ec5

File tree

1 file changed

+43
-5
lines changed

1 file changed

+43
-5
lines changed

graphtools/graphs.py

Lines changed: 43 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
from sklearn.neighbors import NearestNeighbors
1414
from sklearn.preprocessing import normalize
1515
from sklearn.utils.extmath import randomized_svd
16+
from sklearn.metrics.pairwise import euclidean_distances
17+
1618

1719
import numbers
1820
import numpy as np
@@ -487,7 +489,7 @@ class LandmarkGraph(DataGraph):
487489
>>> X_full = G.interpolate(X_landmark)
488490
"""
489491

490-
def __init__(self, data, n_landmark=2000, n_svd=100, **kwargs):
492+
def __init__(self, data, n_landmark=2000, n_svd=100, random_landmarking = False, **kwargs):
491493
"""Initialize a landmark graph.
492494
493495
Raises
@@ -506,6 +508,7 @@ def __init__(self, data, n_landmark=2000, n_svd=100, **kwargs):
506508
"using kNNGraph or lower n_svd".format(n_svd, data.shape[0]),
507509
RuntimeWarning,
508510
)
511+
self.random_landmarking = random_landmarking
509512
self.n_landmark = n_landmark
510513
self.n_svd = n_svd
511514
super().__init__(data, **kwargs)
@@ -635,10 +638,8 @@ def _data_transitions(self):
635638
def build_landmark_op(self):
636639
"""Build the landmark operator
637640
638-
Calculates spectral clusters on the kernel, and calculates transition
639-
probabilities between cluster centers by using transition probabilities
640-
between samples assigned to each cluster.
641-
"""
641+
Randomly selects n_landmark points as landmarks, then assigns each point to its nearest landmark.
642+
642643
with _logger.log_task("landmark operator"):
643644
is_sparse = sparse.issparse(self.kernel)
644645
# spectral clustering
@@ -657,6 +658,43 @@ def build_landmark_op(self):
657658
random_state=self.random_state,
658659
)
659660
self._clusters = kmeans.fit_predict(self.diff_op.dot(VT.T))
661+
662+
"""
663+
if self.random_landmarking :
664+
with _logger.log_task("landmark operator"):
665+
is_sparse = sparse.issparse(self.kernel)
666+
n_samples = self.data.shape[0]
667+
rng = np.random.default_rng(self.random_state)
668+
landmark_indices = rng.choice(n_samples, self.n_landmark, replace=False)
669+
data = self.data if not hasattr(self, 'data_nu') else self.data_nu
670+
distances = cdist(data, data[landmark_indices], metric="euclidean")
671+
if n_samples > 5000:  # sklearn's euclidean_distances is faster than cdist for big datasets
672+
distances = euclidean_distances(data, data[landmark_indices])
673+
else:
674+
distances = cdist(data, data[landmark_indices], metric="euclidean")
675+
self._clusters = np.argmin(distances, axis=1)
676+
677+
else:
678+
with _logger.log_task("landmark operator"):
679+
is_sparse = sparse.issparse(self.kernel)
680+
# spectral clustering
681+
with _logger.log_task("SVD"):
682+
_, _, VT = randomized_svd(
683+
self.diff_aff,
684+
n_components=self.n_svd,
685+
random_state=self.random_state,
686+
)
687+
with _logger.log_task("KMeans"):
688+
kmeans = MiniBatchKMeans(
689+
self.n_landmark,
690+
init_size=3 * self.n_landmark,
691+
n_init=1,
692+
batch_size=10000,
693+
random_state=self.random_state,
694+
)
695+
self._clusters = kmeans.fit_predict(self.diff_op.dot(VT.T))
696+
697+
660698

661699
# transition matrices
662700
pmn = self._landmarks_to_data()

0 commit comments

Comments (0)