Skip to content

Commit 792d07f

Browse files
committed
WIP removed unnecessary ops
1 parent dcd3b83 commit 792d07f

File tree

1 file changed

+118
-19
lines changed

1 file changed

+118
-19
lines changed

graphtools/graphs.py

Lines changed: 118 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1192,7 +1192,7 @@ def build_landmark_op(self):
11921192
11931193
random_landmarking:
11941194
This method randomly selects n_landmark points and assigns each sample to its nearest landmark
1195-
using Euclidean distance .
1195+
using the same distance metric as the graph (self.distance).
11961196
"""
11971197
with _logger.log_task("landmark operator"):
11981198
is_sparse = sparse.issparse(self.kernel)
@@ -1204,13 +1204,20 @@ def build_landmark_op(self):
12041204
data = (
12051205
self.data if not hasattr(self, "data_nu") else self.data_nu
12061206
) # because of the scaling to review
1207-
if (
1208-
n_samples > 5000
1209-
): # sklearn.euclidean_distances is faster than cdist for big dataset
1210-
distances = euclidean_distances(data, data[landmark_indices])
1211-
else:
1212-
distances = cdist(data, data[landmark_indices], metric="euclidean")
1213-
self._clusters = np.argmin(distances, axis=1)
1207+
1208+
# Store landmark indices for fast path optimization
1209+
self._landmark_indices = landmark_indices
1210+
1211+
with _logger.log_task("distance computation"):
1212+
if (
1213+
n_samples > 5000
1214+
): # sklearn.euclidean_distances is faster than cdist for big dataset
1215+
distances = euclidean_distances(data, data[landmark_indices])
1216+
else:
1217+
distances = cdist(data, data[landmark_indices], metric="euclidean")
1218+
1219+
with _logger.log_task("cluster assignment"):
1220+
self._clusters = np.argmin(distances, axis=1)
12141221

12151222
else:
12161223
with _logger.log_task("SVD"):
@@ -1229,18 +1236,110 @@ def build_landmark_op(self):
12291236
)
12301237
self._clusters = kmeans.fit_predict(self.diff_op.dot(VT.T))
12311238

1232-
# transition matrices
1233-
pmn = self._landmarks_to_data()
1239+
# Compute landmark operator and transitions
1240+
if self.random_landmarking:
1241+
# FAST PATH for random landmarking: landmarks are real samples
1242+
# No aggregation needed - extract submatrices directly from kernel
1243+
1244+
L = np.asarray(self._landmark_indices)
1245+
1246+
# ============================================================
1247+
# APPROACH TOGGLE: One-hop vs Two-hop
1248+
# ============================================================
1249+
# Set to False for two-hop (default, maintains connectivity)
1250+
# Set to True for one-hop (faster but causes sparsity issues)
1251+
USE_ONE_HOP = False
1252+
1253+
if USE_ONE_HOP:
1254+
# ONE-HOP APPROACH (CURRENTLY DISABLED - see explanation below)
1255+
# ============================================================
1256+
# Direct landmark→landmark transitions from normalized kernel
1257+
#
1258+
# Pros:
1259+
# - Very fast (no matrix multiplication)
1260+
# - Minimal memory usage
1261+
#
1262+
# Cons:
1263+
# - Results in very low sparsity (0.02 vs 0.36 for two-hop)
1264+
# - Insufficient connectivity for small knn values
1265+
# - Causes downstream failures (e.g., PHATE MDS: "Input matrices
1266+
# must contain >1 unique points")
1267+
#
1268+
# Why it fails:
1269+
# With kNN graphs (e.g., knn=500, n_landmark=5000), most landmark
1270+
# pairs are NOT direct neighbors. So P[L,L] is extremely sparse,
1271+
# making the landmark operator too degenerate for algorithms that
1272+
# expect reasonable connectivity.
1273+
#
1274+
# When this might work:
1275+
# - Very large knn values where landmarks are likely to be direct neighbors
1276+
# - Dense graphs (not kNN)
1277+
# - Algorithms that handle very sparse operators
1278+
# ============================================================
1279+
1280+
P = normalize(self.kernel, norm="l1", axis=1)
1281+
1282+
if is_sparse:
1283+
landmark_op = P[L, :][:, L].toarray()
1284+
landmark_op = normalize(landmark_op, norm="l1", axis=1)
1285+
pnm = P[:, L]
1286+
else:
1287+
landmark_op = P[L, :][:, L]
1288+
landmark_op = normalize(landmark_op, norm="l1", axis=1)
1289+
pnm = P[:, L]
1290+
1291+
else:
1292+
# TWO-HOP APPROACH (ACTIVE)
1293+
# ============================================================
1294+
# Compute landmark→data→landmark transitions via sparse matrix multiplication
1295+
#
1296+
# This maintains proper connectivity (sparsity ~0.36) by allowing
1297+
# landmarks to connect through intermediate data points.
1298+
#
1299+
# For kNN graphs, both pmn and pnm are sparse, so the .dot()
1300+
# operation uses efficient sparse matrix multiplication.
1301+
# ============================================================
1302+
1303+
with _logger.log_task("extract landmark submatrices"):
1304+
# Extract kernel rows for landmarks directly (no aggregation)
1305+
# For random landmarking, each cluster = one sample, so no summing needed
1306+
pmn = self.kernel[L, :] # Shape: (n_landmark, n_samples)
1307+
1308+
with _logger.log_task("normalization"):
1309+
# Match exact normalization order from original working code
1310+
pnm = pmn.transpose() # Shape: (n_samples, n_landmark)
1311+
pmn = normalize(pmn, norm="l1", axis=1) # Row normalize: landmark→data transitions
1312+
pnm = normalize(pnm, norm="l1", axis=1) # Row normalize: data→landmark transitions
1313+
1314+
with _logger.log_task("landmark operator multiplication"):
1315+
# Compute two-hop transitions: landmark → data → landmark
1316+
# Uses sparse matrix multiplication if kernel is sparse
1317+
landmark_op = pmn.dot(pnm)
1318+
1319+
if is_sparse:
1320+
# Convert to dense for landmark operator (small matrix)
1321+
landmark_op = landmark_op.toarray()
1322+
1323+
else:
1324+
# ORIGINAL PATH: For spectral clustering, need to aggregate multiple points per cluster
1325+
1326+
with _logger.log_task("landmarks to data"):
1327+
# Aggregate kernel rows by cluster
1328+
pmn = self._landmarks_to_data()
1329+
1330+
with _logger.log_task("normalization"):
1331+
# Normalize transitions
1332+
pnm = pmn.transpose()
1333+
pmn = normalize(pmn, norm="l1", axis=1)
1334+
pnm = normalize(pnm, norm="l1", axis=1)
1335+
1336+
with _logger.log_task("landmark operator multiplication"):
1337+
# Full matrix multiplication needed for aggregated clusters
1338+
landmark_op = pmn.dot(pnm)
1339+
if is_sparse:
1340+
# no need to have a sparse landmark operator
1341+
landmark_op = landmark_op.toarray()
12341342

1235-
# row normalize
1236-
pnm = pmn.transpose()
1237-
pmn = normalize(pmn, norm="l1", axis=1)
1238-
pnm = normalize(pnm, norm="l1", axis=1)
1239-
# sparsity agnostic matrix multiplication
1240-
landmark_op = pmn.dot(pnm)
1241-
if is_sparse:
1242-
# no need to have a sparse landmark operator
1243-
landmark_op = landmark_op.toarray()
12441343
# store output
12451344
self._landmark_op = landmark_op
12461345
self._transitions = pnm

0 commit comments

Comments
 (0)