@@ -1192,7 +1192,7 @@ def build_landmark_op(self):
11921192
11931193 random_landmarking:
11941194 This method randomly selects n_landmark points and assigns each sample to its nearest landmark
1195- using Euclidean distance .
1195+ using Euclidean distance (TODO: use the graph's own metric, self.distance, for consistency — the code below still hard-codes euclidean).
11961196 """
11971197 with _logger .log_task ("landmark operator" ):
11981198 is_sparse = sparse .issparse (self .kernel )
@@ -1204,13 +1204,20 @@ def build_landmark_op(self):
12041204 data = (
12051205 self .data if not hasattr (self , "data_nu" ) else self .data_nu
12061206 ) # because of the scaling to review
1207- if (
1208- n_samples > 5000
1209- ): # sklearn.euclidean_distances is faster than cdist for big dataset
1210- distances = euclidean_distances (data , data [landmark_indices ])
1211- else :
1212- distances = cdist (data , data [landmark_indices ], metric = "euclidean" )
1213- self ._clusters = np .argmin (distances , axis = 1 )
1207+
1208+ # Store landmark indices for fast path optimization
1209+ self ._landmark_indices = landmark_indices
1210+
1211+ with _logger .log_task ("distance computation" ):
1212+ if (
1213+ n_samples > 5000
1214+ ): # sklearn.euclidean_distances is faster than cdist for big dataset
1215+ distances = euclidean_distances (data , data [landmark_indices ])
1216+ else :
1217+ distances = cdist (data , data [landmark_indices ], metric = "euclidean" )
1218+
1219+ with _logger .log_task ("cluster assignment" ):
1220+ self ._clusters = np .argmin (distances , axis = 1 )
12141221
12151222 else :
12161223 with _logger .log_task ("SVD" ):
@@ -1229,18 +1236,110 @@ def build_landmark_op(self):
12291236 )
12301237 self ._clusters = kmeans .fit_predict (self .diff_op .dot (VT .T ))
12311238
1232- # transition matrices
1233- pmn = self ._landmarks_to_data ()
1239+ # Compute landmark operator and transitions
1240+ if self .random_landmarking :
1241+ # FAST PATH for random landmarking: landmarks are real samples
1242+ # No aggregation needed - extract submatrices directly from kernel
1243+
1244+ L = np .asarray (self ._landmark_indices )
1245+
1246+ # ============================================================
1247+ # APPROACH TOGGLE: One-hop vs Two-hop
1248+ # ============================================================
1249+ # Set to False for two-hop (default, maintains connectivity)
1250+ # Set to True for one-hop (faster but causes sparsity issues)
1251+ USE_ONE_HOP = False
1252+
1253+ if USE_ONE_HOP :
1254+ # ONE-HOP APPROACH (CURRENTLY DISABLED - see explanation below)
1255+ # ============================================================
1256+ # Direct landmark→landmark transitions from normalized kernel
1257+ #
1258+ # Pros:
1259+ # - Very fast (no matrix multiplication)
1260+ # - Minimal memory usage
1261+ #
1262+ # Cons:
1263+ # - Results in very low sparsity (0.02 vs 0.36 for two-hop)
1264+ # - Insufficient connectivity for small knn values
1265+ # - Causes downstream failures (e.g., PHATE MDS: "Input matrices
1266+ # must contain >1 unique points")
1267+ #
1268+ # Why it fails:
1269+ # With kNN graphs (e.g., knn=500, n_landmark=5000), most landmark
1270+ # pairs are NOT direct neighbors. So P[L,L] is extremely sparse,
1271+ # making the landmark operator too degenerate for algorithms that
1272+ # expect reasonable connectivity.
1273+ #
1274+ # When this might work:
1275+ # - Very large knn values where landmarks likely to be neighbors
1276+ # - Dense graphs (not kNN)
1277+ # - Algorithms that handle very sparse operators
1278+ # ============================================================
1279+
1280+ P = normalize (self .kernel , norm = "l1" , axis = 1 )
1281+
1282+ if is_sparse :
1283+ landmark_op = P [L , :][:, L ].toarray ()
1284+ landmark_op = normalize (landmark_op , norm = "l1" , axis = 1 )
1285+ pnm = P [:, L ]
1286+ else :
1287+ landmark_op = P [L , :][:, L ]
1288+ landmark_op = normalize (landmark_op , norm = "l1" , axis = 1 )
1289+ pnm = P [:, L ]
1290+
1291+ else :
1292+ # TWO-HOP APPROACH (ACTIVE)
1293+ # ============================================================
1294+ # Compute landmark→data→landmark transitions via sparse matrix multiplication
1295+ #
1296+ # This maintains proper connectivity (sparsity ~0.36) by allowing
1297+ # landmarks to connect through intermediate data points.
1298+ #
1299+ # For kNN graphs, both pmn and pnm are sparse, so the .dot()
1300+ # operation uses efficient sparse matrix multiplication.
1301+ # ============================================================
1302+
1303+ with _logger .log_task ("extract landmark submatrices" ):
1304+ # Extract kernel rows for landmarks directly (no aggregation)
1305+ # For random landmarking, each cluster = one sample, so no summing needed
1306+ pmn = self .kernel [L , :] # Shape: (n_landmark, n_samples)
1307+
1308+ with _logger .log_task ("normalization" ):
1309+ # Match exact normalization order from original working code
1310+ pnm = pmn .transpose () # Shape: (n_samples, n_landmark)
1311+ pmn = normalize (pmn , norm = "l1" , axis = 1 ) # Row normalize: landmark→data transitions
1312+ pnm = normalize (pnm , norm = "l1" , axis = 1 ) # Row normalize: data→landmark transitions
1313+
1314+ with _logger .log_task ("landmark operator multiplication" ):
1315+ # Compute two-hop transitions: landmark → data → landmark
1316+ # Uses sparse matrix multiplication if kernel is sparse
1317+ landmark_op = pmn .dot (pnm )
1318+
1319+ if is_sparse :
1320+ # Convert to dense for landmark operator (small matrix)
1321+ landmark_op = landmark_op .toarray ()
1322+
1323+ else :
1324+ # ORIGINAL PATH: For spectral clustering, need to aggregate multiple points per cluster
1325+
1326+ with _logger .log_task ("landmarks to data" ):
1327+ # Aggregate kernel rows by cluster
1328+ pmn = self ._landmarks_to_data ()
1329+
1330+ with _logger .log_task ("normalization" ):
1331+ # Normalize transitions
1332+ pnm = pmn .transpose ()
1333+ pmn = normalize (pmn , norm = "l1" , axis = 1 )
1334+ pnm = normalize (pnm , norm = "l1" , axis = 1 )
1335+
1336+ with _logger .log_task ("landmark operator multiplication" ):
1337+ # Full matrix multiplication needed for aggregated clusters
1338+ landmark_op = pmn .dot (pnm )
1339+ if is_sparse :
1340+ # no need to have a sparse landmark operator
1341+ landmark_op = landmark_op .toarray ()
12341342
1235- # row normalize
1236- pnm = pmn .transpose ()
1237- pmn = normalize (pmn , norm = "l1" , axis = 1 )
1238- pnm = normalize (pnm , norm = "l1" , axis = 1 )
1239- # sparsity agnostic matrix multiplication
1240- landmark_op = pmn .dot (pnm )
1241- if is_sparse :
1242- # no need to have a sparse landmark operator
1243- landmark_op = landmark_op .toarray ()
12441343 # store output
12451344 self ._landmark_op = landmark_op
12461345 self ._transitions = pnm
0 commit comments