12
12
import numpy as np
13
13
from sklearn .metrics import pairwise_distances
14
14
from sklearn .base import BaseEstimator , ClusterMixin
15
- from sklearn .utils .validation import check_array
15
+ from sklearn .utils .validation import check_array , validate_data , check_random_state
16
16
17
17
from radius_clustering .utils ._emos import py_emos_main
18
18
from radius_clustering .utils ._mds_approx import solve_mds
19
19
20
20
DIR_PATH = os .path .dirname (os .path .realpath (__file__ ))
21
21
22
22
23
- class RadiusClustering (BaseEstimator , ClusterMixin ):
23
+ class RadiusClustering (ClusterMixin , BaseEstimator ):
24
24
"""
25
25
Radius Clustering algorithm.
26
26
@@ -42,29 +42,56 @@ class RadiusClustering(BaseEstimator, ClusterMixin):
42
42
The indices of the cluster centers.
43
43
labels\_ : array-like, shape (n_samples,)
44
44
The cluster labels for each point in the input data.
45
- effective_radius : float
45
+ effective_radius\_ : float
46
46
The maximum distance between any point and its assigned cluster center.
47
+ random_state\_ : int | None
48
+ The random state used for reproducibility. If None, no random state is set.
49
+
50
+ .. note::
51
+ The `random_state_` attribute is not used when the `manner` is set to "exact".
52
+
53
+ .. versionadded:: 1.3.0
54
+ The *random_state* parameter was added to allow reproducibility in the approximate method.
55
+
56
+ .. versionchanged:: 1.3.0
57
+ All publicly accessible attributes are now suffixed with an underscore (e.g., `centers_`, `labels_`).
58
+ This is particularly useful for compatibility with scikit-learn's API.
47
59
"""
48
60
49
    # Tag consumed by scikit-learn tooling to identify this estimator as a clusterer.
    _estimator_type = "clusterer"

    def __init__(self, manner: str = "approx", threshold: float = 0.5, random_state: int | None = None) -> None:
        """Initialize the Radius Clustering estimator.

        Parameters
        ----------
        manner : str, default="approx"
            Resolution method for the MDS problem: ``"exact"`` uses the EMOS
            solver, anything else falls through to the approximate solver
            (see ``_clustering``).
        threshold : float, default=0.5
            Radius threshold: two points are considered connected when their
            pairwise distance is less than or equal to this value.
        random_state : int or None, default=None
            Seed used by the approximate solver for reproducibility.
            Ignored when ``manner == "exact"``.

        Notes
        -----
        Per scikit-learn convention, ``__init__`` only stores the parameters
        verbatim; validation and computation happen in :meth:`fit`.
        """
        self.manner = manner
        self.threshold = threshold
        self.random_state = random_state
- def _check_symmetric (self , a , tol = 1e-8 ):
68
+ def _check_symmetric (self , a : np . ndarray , tol : float = 1e-8 ) -> bool :
54
69
if a .ndim != 2 :
55
70
raise ValueError ("Input must be a 2D array." )
56
71
if a .shape [0 ] != a .shape [1 ]:
57
72
return False
58
73
return np .allclose (a , a .T , atol = tol )
59
74
60
- def fit (self , X , y = None ):
75
+ def fit (self , X : np . ndarray , y : None = None ) -> "RadiusClustering" :
61
76
"""
62
77
Fit the MDS clustering model to the input data.
63
78
79
+ This method computes the distance matrix if the input is a feature matrix,
80
+ or uses the provided distance matrix directly if the input is already a distance matrix.
81
+
82
+ .. note::
83
+ If the input is a distance matrix, it should be symmetric and square.
84
+ If the input is a feature matrix, the distance matrix will be computed using Euclidean distance.
85
+
86
+ .. tip::
87
+ Next version will support providing different metrics or even custom callables to compute the distance matrix.
88
+
64
89
Parameters:
65
90
-----------
66
91
X : array-like, shape (n_samples, n_features)
67
- The input data to cluster.
92
+ The input data to cluster. X should be a 2D array-like structure. It can either be :
93
+ - A distance matrix (symmetric, square) with shape (n_samples, n_samples).
94
+ - A feature matrix with shape (n_samples, n_features) where the distance matrix will be computed.
68
95
y : Ignored
69
96
Not used, present here for API consistency by convention.
70
97
@@ -91,38 +118,43 @@ def fit(self, X, y=None):
91
118
For examples on common datasets and differences with kmeans,
92
119
see :ref:`sphx_glr_auto_examples_plot_iris_example.py`
93
120
"""
94
- self .X = check_array ( X )
121
+ self .X_checked_ = validate_data ( self , X )
95
122
96
123
# Create dist and adj matrices
97
- if not self ._check_symmetric (self .X ):
98
- dist_mat = pairwise_distances (self .X , metric = "euclidean" )
124
+ if not self ._check_symmetric (self .X_checked_ ):
125
+ dist_mat = pairwise_distances (self .X_checked_ , metric = "euclidean" )
99
126
else :
100
- dist_mat = self .X
127
+ dist_mat = self .X_checked_
101
128
adj_mask = np .triu ((dist_mat <= self .threshold ), k = 1 )
102
- self .nb_edges = np .sum (adj_mask )
103
- if self .nb_edges == 0 :
104
- self .centers_ = list (range (self .X .shape [0 ]))
105
- self .labels_ = self .centers_
106
- self .effective_radius = 0
107
- self ._mds_exec_time = 0
129
+ self .nb_edges_ = np .sum (adj_mask )
130
+ if self .nb_edges_ == 0 :
131
+ self .centers_ = list (range (self .X_checked_ .shape [0 ]))
132
+ self .labels_ = np . array ( self .centers_ )
133
+ self .effective_radius_ = 0
134
+ self .mds_exec_time_ = 0
108
135
return self
109
- self .edges = np .argwhere (adj_mask ).astype (np .uint32 ) #TODO: changer en uint32
110
- self .dist_mat = dist_mat
136
+ self .edges_ = np .argwhere (adj_mask ).astype (np .uint32 ) # Edges in the adjacency matrix
137
+ # uint32 is used to use less memory. Max number of features is 2^32-1
138
+ self .dist_mat_ = dist_mat
111
139
112
140
self ._clustering ()
113
141
self ._compute_effective_radius ()
114
142
self ._compute_labels ()
115
143
116
144
return self
117
145
118
- def fit_predict (self , X , y = None ):
146
+ def fit_predict (self , X : np . ndarray , y : None = None ) -> np . ndarray :
119
147
"""
120
148
Fit the model and return the cluster labels.
121
149
150
+ This method is a convenience function that combines `fit` and `predict`.
151
+
122
152
Parameters:
123
153
-----------
124
154
X : array-like, shape (n_samples, n_features)
125
- The input data to cluster.
155
+ The input data to cluster. X should be a 2D array-like structure. It can either be :
156
+ - A distance matrix (symmetric, square) with shape (n_samples, n_samples).
157
+ - A feature matrix with shape (n_samples, n_features) where the distance matrix will be computed.
126
158
y : Ignored
127
159
Not used, present here for API consistency by convention.
128
160
@@ -138,13 +170,13 @@ def _clustering(self):
138
170
"""
139
171
Perform the clustering using either the exact or approximate MDS method.
140
172
"""
141
- n = self .X .shape [0 ]
173
+ n = self .X_checked_ .shape [0 ]
142
174
if self .manner == "exact" :
143
175
self ._clustering_exact (n )
144
176
else :
145
177
self ._clustering_approx (n )
146
178
147
- def _clustering_exact (self , n ) :
179
+ def _clustering_exact (self , n : int ) -> None :
148
180
"""
149
181
Perform exact MDS clustering.
150
182
@@ -158,13 +190,26 @@ def _clustering_exact(self, n):
158
190
This function uses the EMOS algorithm to solve the MDS problem.
159
191
See: [jiang]_ for more details.
160
192
"""
161
- self .centers_ , self ._mds_exec_time = py_emos_main (
162
- self .edges .flatten (), n , self .nb_edges
193
+ self .centers_ , self .mds_exec_time_ = py_emos_main (
194
+ self .edges_ .flatten (), n , self .nb_edges_
163
195
)
164
196
165
- def _clustering_approx (self , n ) :
197
+ def _clustering_approx (self , n : int ) -> None :
166
198
"""
167
- Perform approximate MDS clustering.
199
+ Perform approximate MDS clustering. This method uses a pretty trick to set the seed for the random state of the C++ code of the MDS solver.
200
+
201
+ .. tip::
202
+ The random state is used to ensure reproducibility of the results when using the approximate method.
203
+ If `random_state` is None, a default value of 42 is used.
204
+
205
+ .. important::
206
+ :collapsible: closed
207
+ The trick to set the random state is :
208
+ 1. Use the `check_random_state` function to get a `RandomState`singleton instance, set up with the provided `random_state`.
209
+ 2. Use the `randint` method of the `RandomState` instance to generate a random integer.
210
+ 3. Use this random integer as the seed for the C++ code of the MDS solver.
211
+
212
+ This ensures that the seed passed to the C++ code is always an integer, which is required by the MDS solver, and allows for reproducibility of the results.
168
213
169
214
Parameters:
170
215
-----------
@@ -176,9 +221,13 @@ def _clustering_approx(self, n):
176
221
This function uses the approximation method to solve the MDS problem.
177
222
See [casado]_ for more details.
178
223
"""
179
- result = solve_mds (n , self .edges .flatten ().astype (np .int32 ), self .nb_edges , "test" )
224
+ if self .random_state is None :
225
+ self .random_state = 42
226
+ self .random_state_ = check_random_state (self .random_state )
227
+ seed = self .random_state_ .randint (np .iinfo (np .int32 ).max )
228
+ result = solve_mds (n , self .edges_ .flatten ().astype (np .int32 ), self .nb_edges_ , seed )
180
229
self .centers_ = [x for x in result ["solution_set" ]]
181
- self ._mds_exec_time = result ["Time" ]
230
+ self .mds_exec_time_ = result ["Time" ]
182
231
183
232
def _compute_effective_radius (self ):
184
233
"""
@@ -187,13 +236,13 @@ def _compute_effective_radius(self):
187
236
The effective radius is the maximum radius among all clusters.
188
237
That means EffRad = max(R(C_i)) for all i.
189
238
"""
190
- self .effective_radius = np .min (self .dist_mat [:, self .centers_ ], axis = 1 ).max ()
239
+ self .effective_radius_ = np .min (self .dist_mat_ [:, self .centers_ ], axis = 1 ).max ()
191
240
192
241
def _compute_labels (self ):
193
242
"""
194
243
Compute the cluster labels for each point in the dataset.
195
244
"""
196
- distances = self .dist_mat [:, self .centers_ ]
245
+ distances = self .dist_mat_ [:, self .centers_ ]
197
246
self .labels_ = np .argmin (distances , axis = 1 )
198
247
199
248
min_dist = np .min (distances , axis = 1 )
0 commit comments