Merge pull request #166 from MattScicluna/add_disconnection_warning

bjoaofelipe · web-flow · commit d5de92046926 · 2025-10-27T11:19:37.000-04:00
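Add disconnection warning: emit a RuntimeWarning in `fit` when the newly built graph is not connected (requires graphtools >= 2.0.0)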
diff --git a/.gitignore b/.gitignore
@@ -53,6 +53,7 @@ Python/doc/build/
 
 # Jupyter
 .ipynb_checkpoints/
+Python/tutorial/cache
 
 # Mac
 .DS_Store
diff --git a/Python/phate/mds.py b/Python/phate/mds.py
@@ -218,6 +218,7 @@ def embed_MDS(
     # which is much faster than scipy's pdist + squareform
     if distance_metric == "euclidean" and X.shape[0] > 1000:
         from sklearn.metrics.pairwise import euclidean_distances
+
         X_dist = euclidean_distances(X, X)
     else:
         X_dist = squareform(pdist(X, distance_metric))
@@ -235,7 +236,7 @@ def embed_MDS(
             n_components=ndim,
             random_state=seed,
             init=Y_classic,
-            verbose=verbose
+            verbose=verbose,
         )
     elif solver == "smacof":
         Y = smacof(
diff --git a/Python/phate/phate.py b/Python/phate/phate.py
@@ -33,9 +33,15 @@
 
 _logger = tasklogger.get_tasklogger("graphtools")
 
-# Check graphtools version for random_landmarking support
-def _graphtools_supports_random_landmarking():
-    """Check if installed graphtools version supports random_landmarking parameter."""
+
+# Check graphtools version
+def _graphtools_version_is_at_least_2_0():
+    """Check if installed graphtools version is >= 2.0.0.
+
+    Version 2.0.0+ includes support for:
+    - random_landmarking parameter
+    - is_connected property and connectivity checks
+    """
     try:
         return version.parse(graphtools.__version__) >= version.parse("2.0.0")
     except AttributeError:
@@ -127,9 +133,9 @@ class PHATE(BaseEstimator):
         If an integer is given, it fixes the seed
         Defaults to the global `numpy` random number generator
 
-    random_landmarking : bool, optional, default: False   
-        Whether to use random sampling for landmarking. If True, landmarks 
-        are selected randomly. If False, landmarks are selected deterministically 
+    random_landmarking : bool, optional, default: False
+        Whether to use random sampling for landmarking. If True, landmarks
+        are selected randomly. If False, landmarks are selected deterministically
         using spectral clustering.
         Defaults to False.
 
@@ -226,18 +232,18 @@ def __init__(
                 "Landmarking is disabled when n_landmark=None. "
                 "To use random landmarking, please set n_landmark to a positive integer "
                 "(e.g., n_landmark=2000).",
-                UserWarning
+                UserWarning,
             )
             # Disable random_landmarking since it has no effect
             random_landmarking = False
         # Check graphtools version if random_landmarking is still requested
-        elif random_landmarking and not _graphtools_supports_random_landmarking():
+        elif random_landmarking and not _graphtools_version_is_at_least_2_0():
             warnings.warn(
                 "random_landmarking is not available in graphtools version < 2.0.0. "
                 "Please update graphtools to use this feature: "
                 "https://pypi.org/project/graphtools/2.0.0/. "
                 "Falling back to spectral clustering for landmark selection.",
-                UserWarning
+                UserWarning,
             )
             # Disable random_landmarking since it's not supported
             random_landmarking = False
@@ -527,9 +533,9 @@ def set_params(self, **params):
             If an integer is given, it fixes the seed
             Defaults to the global `numpy` random number generator
 
-        random_landmarking : bool, optional, default: False   
-            Whether to use random sampling for landmarking. If True, landmarks 
-            are selected randomly. If False, landmarks are selected deterministically 
+        random_landmarking : bool, optional, default: False
+            Whether to use random sampling for landmarking. If True, landmarks
+            are selected randomly. If False, landmarks are selected deterministically
             using spectral clustering.
             Defaults to False.
 
@@ -814,21 +820,21 @@ def _update_graph(self, X, precomputed, n_pca, n_landmark, random_landmarking):
             try:
                 # Prepare graph params
                 graph_params = {
-                    'decay': self.decay,
-                    'knn': self.knn,
-                    'knn_max': self.knn_max,
-                    'distance': self.knn_dist,
-                    'precomputed': precomputed,
-                    'n_jobs': self.n_jobs,
-                    'verbose': self.verbose,
-                    'n_pca': n_pca,
-                    'n_landmark': n_landmark,
-                    'random_state': self.random_state,
+                    "decay": self.decay,
+                    "knn": self.knn,
+                    "knn_max": self.knn_max,
+                    "distance": self.knn_dist,
+                    "precomputed": precomputed,
+                    "n_jobs": self.n_jobs,
+                    "verbose": self.verbose,
+                    "n_pca": n_pca,
+                    "n_landmark": n_landmark,
+                    "random_state": self.random_state,
                 }
 
                 # Only add random_landmarking if graphtools supports it
-                if _graphtools_supports_random_landmarking():
-                    graph_params['random_landmarking'] = random_landmarking
+                if _graphtools_version_is_at_least_2_0():
+                    graph_params["random_landmarking"] = random_landmarking
 
                 self.graph.set_params(**graph_params)
                 _logger.log_info("Using precomputed graph and diffusion operator...")
@@ -875,36 +881,50 @@ def fit(self, X):
             n_landmark = self.n_landmark
 
         if self.graph is not None and update_graph:
-            self._update_graph(X, precomputed, n_pca, n_landmark, self.random_landmarking)
+            self._update_graph(
+                X, precomputed, n_pca, n_landmark, self.random_landmarking
+            )
 
         self.X = X
 
         if self.graph is None:
             with _logger.log_task("graph and diffusion operator"):
                 # Prepare graph params
                 graph_params = {
-                    'n_pca': n_pca,
-                    'n_landmark': n_landmark,
-                    'distance': self.knn_dist,
-                    'precomputed': precomputed,
-                    'knn': self.knn,
-                    'knn_max': self.knn_max,
-                    'decay': self.decay,
-                    'thresh': 1e-4,
-                    'n_jobs': self.n_jobs,
-                    'verbose': self.verbose,
-                    'random_state': self.random_state,
+                    "n_pca": n_pca,
+                    "n_landmark": n_landmark,
+                    "distance": self.knn_dist,
+                    "precomputed": precomputed,
+                    "knn": self.knn,
+                    "knn_max": self.knn_max,
+                    "decay": self.decay,
+                    "thresh": 1e-4,
+                    "n_jobs": self.n_jobs,
+                    "verbose": self.verbose,
+                    "random_state": self.random_state,
                 }
 
                 # Only add random_landmarking if graphtools supports it
-                if _graphtools_supports_random_landmarking():
-                    graph_params['random_landmarking'] = self.random_landmarking
+                if _graphtools_version_is_at_least_2_0():
+                    graph_params["random_landmarking"] = self.random_landmarking
 
                 # Merge with any additional kwargs
                 graph_params.update(self.kwargs)
 
                 self.graph = graphtools.Graph(X, **graph_params)
 
+                # Check for graph connectivity (requires graphtools >= 2.0.0)
+                if _graphtools_version_is_at_least_2_0():
+                    if not self.graph.is_connected:
+                        warnings.warn(
+                            f"Graph is disconnected with {self.graph.n_connected_components} "
+                            f"connected components. This may indicate that your knn parameter "
+                            f"(currently {self.knn}) is too small, or that your data contains "
+                            f"distinct clusters. PHATE may not accurately represent relationships "
+                            f"between disconnected components.",
+                            RuntimeWarning,
+                        )
+
         # landmark op doesn't build unless forced
         self.diff_op
         return self
diff --git a/Python/phate/sgd_mds.py b/Python/phate/sgd_mds.py
@@ -159,7 +159,7 @@ def sgd_mds(
         Y = Y - lr * gradients
 
         # Compute stress for convergence checking
-        stress = np.sum(errors ** 2) / len(errors)  # Normalized by number of samples
+        stress = np.sum(errors**2) / len(errors)  # Normalized by number of samples
         stress_history.append(stress)
 
         if verbose > 0 and iteration % 100 == 0:
@@ -189,7 +189,9 @@ def sgd_mds(
         last_10pct = max(1, len(stress_history) // 10)
         recent_stress = stress_history[-last_10pct:]
         if len(recent_stress) > 1:
-            stress_trend = (recent_stress[-1] - recent_stress[0]) / (recent_stress[0] + 1e-10)
+            stress_trend = (recent_stress[-1] - recent_stress[0]) / (
+                recent_stress[0] + 1e-10
+            )
             if abs(stress_trend) > 0.01:  # Still changing by more than 1%
                 _logger.log_warning(
                     f"SGD-MDS may not have converged: stress changed by {stress_trend*100:.1f}% "
diff --git a/Python/test/test_simple.py b/Python/test/test_simple.py
@@ -126,7 +126,7 @@ def test_tree():
     np.testing.assert_allclose(
         phate_precomputed_D, phate_precomputed_distance, atol=5e-4
     )
-    
+
     return None
 
 

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -126,7 +126,7 @@ def test_tree():` |
| `126` | `126` | `np.testing.assert_allclose(` |
| `127` | `127` | `phate_precomputed_D, phate_precomputed_distance, atol=5e-4` |
| `128` | `128` | `)` |
| `129` | | `-` |
| | `129` | `+` |
| `130` | `130` | `return None` |
| `131` | `131` | |
| `132` | `132` | |