1313from sklearn .neighbors import NearestNeighbors
1414from sklearn .preprocessing import normalize
1515from sklearn .utils .extmath import randomized_svd
16+ from sklearn .metrics .pairwise import euclidean_distances
17+
1618
1719import numbers
1820import numpy as np
@@ -487,7 +489,7 @@ class LandmarkGraph(DataGraph):
487489 >>> X_full = G.interpolate(X_landmark)
488490 """
489491
490- def __init__ (self , data , n_landmark = 2000 , n_svd = 100 , ** kwargs ):
492+ def __init__ (self , data , n_landmark = 2000 , n_svd = 100 , random_landmarking = False , ** kwargs ):
491493 """Initialize a landmark graph.
492494
493495 Raises
@@ -506,6 +508,7 @@ def __init__(self, data, n_landmark=2000, n_svd=100, **kwargs):
506508 "using kNNGraph or lower n_svd" .format (n_svd , data .shape [0 ]),
507509 RuntimeWarning ,
508510 )
511+ self .random_landmarking = random_landmarking
509512 self .n_landmark = n_landmark
510513 self .n_svd = n_svd
511514 super ().__init__ (data , ** kwargs )
@@ -635,10 +638,8 @@ def _data_transitions(self):
635638 def build_landmark_op (self ):
636639 """Build the landmark operator
637640
638- Calculates spectral clusters on the kernel, and calculates transition
639- probabilities between cluster centers by using transition probabilities
640- between samples assigned to each cluster.
641- """
641+ If random_landmarking is set, randomly selects n_landmark points as landmarks and assigns each point to its nearest landmark; otherwise calculates spectral clusters on the kernel and computes transition probabilities between cluster centers from transitions between samples assigned to each cluster.
642+
642643 with _logger.log_task("landmark operator"):
643644 is_sparse = sparse.issparse(self.kernel)
644645 # spectral clustering
@@ -657,6 +658,43 @@ def build_landmark_op(self):
657658 random_state=self.random_state,
658659 )
659660 self._clusters = kmeans.fit_predict(self.diff_op.dot(VT.T))
661+
662+ """
663+ if self .random_landmarking :
664+ with _logger .log_task ("landmark operator" ):
665+ is_sparse = sparse .issparse (self .kernel )
666+ n_samples = self .data .shape [0 ]
667+ rng = np .random .default_rng (self .random_state )
668+ landmark_indices = rng .choice (n_samples , self .n_landmark , replace = False )
669+ data = self .data if not hasattr (self , 'data_nu' ) else self .data_nu
670+ distances = cdist (data , data [landmark_indices ], metric = "euclidean" )
671+ if n_samples > 5000 : # sklearn.euclidean_distances is faster than cdist for big dataset
672+ distances = euclidean_distances (data , data [landmark_indices ])
673+ else :
674+ distances = cdist (data , data [landmark_indices ], metric = "euclidean" )
675+ self ._clusters = np .argmin (distances , axis = 1 )
676+
677+ else :
678+ with _logger .log_task ("landmark operator" ):
679+ is_sparse = sparse .issparse (self .kernel )
680+ # spectral clustering
681+ with _logger .log_task ("SVD" ):
682+ _ , _ , VT = randomized_svd (
683+ self .diff_aff ,
684+ n_components = self .n_svd ,
685+ random_state = self .random_state ,
686+ )
687+ with _logger .log_task ("KMeans" ):
688+ kmeans = MiniBatchKMeans (
689+ self .n_landmark ,
690+ init_size = 3 * self .n_landmark ,
691+ n_init = 1 ,
692+ batch_size = 10000 ,
693+ random_state = self .random_state ,
694+ )
695+ self ._clusters = kmeans .fit_predict (self .diff_op .dot (VT .T ))
696+
697+
660698
661699 # transition matrices
662700 pmn = self ._landmarks_to_data ()
0 commit comments