refactoring algorithms + doc enhancement

quentinhaenn · quentinhaenn · commit 8c605e18c612 · 2025-06-18T15:28:13.000+02:00
diff --git a/.coverage b/.coverage
diff --git a/docs/source/api.rst b/docs/source/api.rst
@@ -1,7 +1,19 @@
 API Reference
 =============
 
-.. automodule:: radius_clustering
+This page documents the implementation details of the `radius_clustering` package.
+
+RadiusClustering Class
+----------------------
+
+.. autoclass:: radius_clustering.RadiusClustering
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+Algorithms Module
+-----------------
+.. automodule:: radius_clustering.algorithms
    :members:
    :undoc-members:
    :show-inheritance:
diff --git a/src/radius_clustering/algorithms.py b/src/radius_clustering/algorithms.py
@@ -0,0 +1,86 @@
+"""
+This module contains the implementation of the clustering algorithms.
+It provides two main functions: `clustering_approx` and `clustering_exact`.
+
+These functions can be replaced in the `RadiusClustering` class
+to perform clustering using another algorithm.
+
+.. versionadded:: 2.0.0
+    Refactoring the structure of the code to separate the clustering algorithms.
+    This allows for easier maintenance and extensibility of the codebase.
+    Plus, this allows for the addition of new clustering algorithms
+    such as `Curgraph` added in this version.
+"""
+from __future__ import annotations
+
+import numpy as np
+
+from .utils._mds_approx import solve_mds
+from .utils._emos import py_emos_main
+
+def clustering_approx(
+          n: int, edges: np.ndarray, nb_edges: int,
+          random_state: int | None = None) -> tuple:
+    """
+    Perform approximate MDS clustering; returns `(sorted centers, solver "Time")`.
+
+    .. tip::
+        The random state is used to ensure reproducibility of the results
+        when using the approximate method.
+        If `random_state` is None, a default value of 42 is used.
+
+    .. important::
+        `RadiusClustering` builds the seed it passes here with a pretty trick :
+
+        1. Use the `check_random_state` function to get a `RandomState` singleton
+        instance, set up with the provided `random_state`.
+
+        2. Use the `randint` method of the `RandomState` instance to generate a
+        random integer.
+
+        3. Use this random integer as the seed for the C++ code of the MDS solver.
+
+        This ensures that the seed passed to the C++ code is always an integer,
+        which the MDS solver requires, and allows for reproducible results.
+
+    Parameters:
+    -----------
+    n : int
+        The number of points in the dataset.
+    edges : np.ndarray
+        Edge array; flattened and cast to int32 before the solver call.
+    nb_edges : int
+        The number of edges, forwarded to the solver.
+    random_state : int | None
+        Seed handed to the C++ solver; None is replaced by 42.
+
+    Notes:
+    ------
+    Approximation method for the MDS problem; see [casado]_ for more details.
+    """
+    if random_state is None:
+        random_state = 42  # keep the documented default instead of passing None
+    result = solve_mds(n, edges.flatten().astype(np.int32), nb_edges, random_state)
+    return sorted(result["solution_set"]), result["Time"]
+
+def clustering_exact(n: int, edges: np.ndarray, nb_edges: int) -> tuple:
+    """
+    Perform exact MDS clustering with EMOS; returns `(sorted centers, exec time)`.
+
+    .. important::
+        The EMOS algorithm is an exact algorithm for solving the MDS problem.
+        It is a branch and bound algorithm that uses graph theory tricks
+        to efficiently cut the search space. See [jiang]_ for more details.
+
+    Parameters:
+    -----------
+    n : int
+        The number of points in the dataset.
+    edges : np.ndarray
+        Edge array; flattened before being handed to `py_emos_main`.
+    nb_edges : int
+        The number of edges, forwarded to `py_emos_main`.
+    """
+    centers, mds_exec_time = py_emos_main(edges.flatten(), n, nb_edges)
+    centers.sort()  # in-place, so callers always see the centers in order
+    return centers, mds_exec_time
diff --git a/src/radius_clustering/radius_clustering.py b/src/radius_clustering/radius_clustering.py
@@ -18,8 +18,7 @@
 from sklearn.metrics import pairwise_distances
 from sklearn.utils.validation import check_random_state, validate_data
 
-from radius_clustering.utils._emos import py_emos_main
-from radius_clustering.utils._mds_approx import solve_mds
+from .algorithms import clustering_approx, clustering_exact
 
 DIR_PATH = os.path.dirname(os.path.realpath(__file__))
 
@@ -53,20 +52,23 @@ class RadiusClustering(ClusterMixin, BaseEstimator):
 
     .. note::
         The `random_state_` attribute is not used when the `manner` is set to "exact".
+    
+    .. versionchanged:: 2.0.0
+        The `RadiusClustering` class has been refactored.
+        Clustering algorithms are now separated into their own module
+        (`algorithms.py`) to improve maintainability and extensibility.
 
     .. versionadded:: 1.3.0
-        The *random_state* parameter was added to allow reproducibility in
-        the approximate method.
+
+        - The *random_state* parameter was added to allow reproducibility in the approximate method.
+
+        - The `radius` parameter replaces the `threshold` parameter for setting the dissimilarity threshold for better clarity and consistency.
 
     .. versionchanged:: 1.3.0
         All publicly accessible attributes are now suffixed with an underscore
         (e.g., `centers_`, `labels_`).
         This is particularly useful for compatibility with scikit-learn's API.
 
-    .. versionadded:: 1.3.0
-        The `radius` parameter replaces the `threshold` parameter for setting
-        the dissimilarity threshold for better clarity and consistency.
-
     .. deprecated:: 1.3.0
         The `threshold` parameter is deprecated. Use `radius` instead.
         Will be removed in a future version.
@@ -243,7 +245,7 @@ def fit_predict(self, X: np.ndarray, y: None = None, metric: str | callable = "e
         labels : array, shape (n_samples,)
             The cluster labels for each point in X.
         """
-        self.fit(X)
+        self.fit(X, metric=metric)
         return self.labels_
 
     def _clustering(self):
@@ -252,75 +254,15 @@ def _clustering(self):
         """
         n = self.X_checked_.shape[0]
         if self.manner != "exact" and self.manner != "approx":
-            print(f"Invalid manner: {self.manner}. Defaulting to 'approx'.")
             raise ValueError("Invalid manner. Choose either 'exact' or 'approx'.")
         if self.manner == "exact":
-            self._clustering_exact(n)
+            self.centers_, self.mds_exec_time_ = clustering_exact(n, self.edges_, self.nb_edges_)
         else:
-            self._clustering_approx(n)
-
-    def _clustering_exact(self, n: int) -> None:
-        """
-        Perform exact MDS clustering.
-
-        Parameters:
-        -----------
-        n : int
-            The number of points in the dataset.
-
-        Notes:
-        ------
-        This function uses the EMOS algorithm to solve the MDS problem.
-        See: [jiang]_ for more details.
-        """
-        self.centers_, self.mds_exec_time_ = py_emos_main(
-            self.edges_.flatten(), n, self.nb_edges_
-        )
-        self.centers_.sort()  # Sort the centers to ensure consistent order
-
-    def _clustering_approx(self, n: int) -> None:
-        """
-        Perform approximate MDS clustering.
-        This method uses a pretty trick to set the seed for
-        the random state of the C++ code of the MDS solver.
-
-        .. tip::
-            The random state is used to ensure reproducibility of the results
-            when using the approximate method.
-            If `random_state` is None, a default value of 42 is used.
-
-        .. important::
-            :collapsible: closed
-            The trick to set the random state is :
-            1. Use the `check_random_state` function to get a `RandomState`singleton
-            instance, set up with the provided `random_state`.
-            2. Use the `randint` method of the `RandomState` instance to generate a
-            random integer.
-            3. Use this random integer as the seed for the C++ code of the MDS solver.
-
-            This ensures that the seed passed to the C++ code is always an integer,
-            which is required by the MDS solver, and allows for
-            reproducibility of the results.
-
-        Parameters:
-        -----------
-        n : int
-            The number of points in the dataset.
-
-        Notes:
-        ------
-        This function uses the approximation method to solve the MDS problem.
-        See [casado]_ for more details.
-        """
-        if self.random_state is None:
-            self.random_state = 42
-        self.random_state_ = check_random_state(self.random_state)
-        seed = self.random_state_.randint(np.iinfo(np.int32).max)
-        result = solve_mds(
-            n, self.edges_.flatten().astype(np.int32), self.nb_edges_, seed
-        )
-        self.centers_ = sorted([x for x in result["solution_set"]])
-        self.mds_exec_time_ = result["Time"]
+            if self.random_state is None:
+                self.random_state = 42
+            self.random_state_ = check_random_state(self.random_state)
+            seed = self.random_state_.randint(np.iinfo(np.int32).max)
+            self.centers_, self.mds_exec_time_ = clustering_approx(n, self.edges_, self.nb_edges_, seed)
 
     def _compute_effective_radius(self):
         """