diff --git a/.coverage b/.coverage new file mode 100644 index 0000000..f32d8f4 Binary files /dev/null and b/.coverage differ diff --git a/.gitignore b/.gitignore index 4477aa1..9888adb 100644 --- a/.gitignore +++ b/.gitignore @@ -3,7 +3,7 @@ docs/build/ # env and caches -mdsenv/ +mds-env/ **/__pycache__/ .pytest_cache/ .ruff_cache/ @@ -25,5 +25,8 @@ docs/source/modules/generated/ **/emos.c **/mds.cpp +# macOS files +.DS_Store + # Reportings reporting/ \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..4a0e668 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,19 @@ +# Changelog + +## [1.3.0] - 2025-06-18 + +### Added + +- Full test coverage for the entire codebase. +- Badge for test coverage in the README. +- Added `radius` parameter to the `RadiusClustering` class, allowing users to specify the radius for clustering. + +### Deprecated + +- Deprecated the `threshold` parameter in the `RadiusClustering` class. Use `radius` instead. + +### Changed + +- Updated all the attributes in the `RadiusClustering` class to fit `scikit-learn` standards and conventions. +- Updated the test cases to reflect the changes in the `RadiusClustering` class. +- Updated README and documentation to reflect the new `radius` parameter and the deprecation of `threshold`.
diff --git a/docs/source/conf.py b/docs/source/conf.py index dc3bbb2..b23d6b7 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -16,7 +16,7 @@ project = "Radius Clustering" copyright = "2024, Haenn Quentin, Chardin Brice, Baron Mickaël" author = "Haenn Quentin, Chardin Brice, Baron Mickaël" -release = "1.0" +release = "1.3.0" # -- General configuration --------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration diff --git a/docs/source/usage.rst b/docs/source/usage.rst index 6da5cf6..1826840 100644 --- a/docs/source/usage.rst +++ b/docs/source/usage.rst @@ -12,7 +12,9 @@ Here's a basic example of how to use Radius Clustering: X = np.random.rand(100, 2) # Create an instance of MdsClustering - rad = RadiusClustering(manner="approx", threshold=0.5) + rad = RadiusClustering(manner="approx", radius=0.5) + # Note: the 'threshold' parameter is deprecated since version 1.3.0 + # and will be removed in a future version. Use 'radius' instead. # Fit the model to the data rad.fit(X) diff --git a/examples/plot_iris_example.py b/examples/plot_iris_example.py index 7204348..c31d9a5 100644 --- a/examples/plot_iris_example.py +++ b/examples/plot_iris_example.py @@ -82,7 +82,7 @@ # We create an instance of the `RadiusClustering` class and fit it to the Iris dataset.
import time -rad = RadiusClustering(manner="exact", threshold=1.43) +rad = RadiusClustering(manner="exact", radius=1.43) t0 = time.time() rad.fit(X) t_rad = time.time() - t0 @@ -242,7 +242,7 @@ def get_order_labels(kmeans, rad, data): # Compute clustering with MDS -rad = RadiusClustering(manner="exact", threshold=232.09) +rad = RadiusClustering(manner="exact", radius=232.09) t0 = time.time() rad.fit(X) t_rad = time.time() - t0 diff --git a/pyproject.toml b/pyproject.toml index 175e565..0ee2d8d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,6 +52,7 @@ documentation = "https://lias-laboratory.github.io/radius_clustering/" [project.optional-dependencies] dev = [ "pytest>=8.3.3", + "pytest-cov>=5.0.0", "pandas", "cython>=3.0", "setuptools>= 61.0", @@ -80,8 +81,22 @@ pythonpath = "src" testpaths = ["tests"] addopts = [ "--import-mode=importlib", + "--cov=src/radius_clustering", + "--cov-report=term-missing", + "--cov-report=html:coverage_html_report", ] +[tool.coverage.run] +source = ["src/radius_clustering"] +branch = true + +[tool.coverage.report] +show_missing = true + +[tool.coverage.html] +directory = "coverage_html_report" +title = "Coverage Report" + [tool.ruff] # Exclude a variety of commonly ignored directories. exclude = [ @@ -105,14 +120,14 @@ exclude = [ # Same as Black. line-length = 88 -indent-width = 4 +target-version = "py310" [tool.ruff.lint] # Enable Pyflakes (`F`) and a subset of the pycodestyle (`E`) codes by default. # Unlike Flake8, Ruff doesn't enable pycodestyle warnings (`W`) or # McCabe complexity (`C901`) by default. -select = ["E", "F"] -ignore = [] +select = ["E", "F", "W", "I"] +ignore = ["E203", "E731", "E741"] # Allow fix for all enabled rules (when `--fix`) is provided. 
fixable = ["ALL"] diff --git a/setup.py b/setup.py index bcab66c..909e82a 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,8 @@ import platform -from setuptools import setup, Extension -from Cython.Build import cythonize + import numpy as np +from Cython.Build import cythonize +from setuptools import Extension, setup SYSTEM = platform.system() CPU = platform.processor() @@ -21,7 +22,10 @@ extensions = [ Extension( "radius_clustering.utils._emos", - ["src/radius_clustering/utils/emos.pyx", "src/radius_clustering/utils/main-emos.c"], + [ + "src/radius_clustering/utils/emos.pyx", + "src/radius_clustering/utils/main-emos.c" + ], include_dirs=[np.get_include(), "src/radius_clustering/utils"], extra_compile_args=C_COMPILE_ARGS, ), diff --git a/src/radius_clustering/__init__.py b/src/radius_clustering/__init__.py index 3ebc26e..9609e48 100644 --- a/src/radius_clustering/__init__.py +++ b/src/radius_clustering/__init__.py @@ -2,6 +2,4 @@ from .radius_clustering import RadiusClustering __all__ = ["RadiusClustering"] - -# Optionally, you can set a version number for your package -__version__ = "1.2.2" +__version__ = "1.3.0" diff --git a/src/radius_clustering/radius_clustering.py b/src/radius_clustering/radius_clustering.py index db49a56..33d42c1 100644 --- a/src/radius_clustering/radius_clustering.py +++ b/src/radius_clustering/radius_clustering.py @@ -8,11 +8,15 @@ This module serves as the main interface for the Radius clustering library. 
""" +from __future__ import annotations + import os +import warnings + import numpy as np -from sklearn.metrics import pairwise_distances from sklearn.base import BaseEstimator, ClusterMixin -from sklearn.utils.validation import check_array +from sklearn.metrics import pairwise_distances +from sklearn.utils.validation import check_random_state, validate_data from radius_clustering.utils._emos import py_emos_main from radius_clustering.utils._mds_approx import solve_mds @@ -20,8 +24,8 @@ DIR_PATH = os.path.dirname(os.path.realpath(__file__)) -class RadiusClustering(BaseEstimator, ClusterMixin): - """ +class RadiusClustering(ClusterMixin, BaseEstimator): + r""" Radius Clustering algorithm. This class implements clustering based on the Minimum Dominating Set (MDS) problem. @@ -31,7 +35,7 @@ class RadiusClustering(BaseEstimator, ClusterMixin): ----------- manner : str, optional (default="approx") The method to use for solving the MDS problem. Can be "exact" or "approx". - threshold : float, optional (default=0.5) + radius : float, optional (default=0.5) The dissimilarity threshold to act as radius constraint for the clustering. Attributes: @@ -42,29 +46,87 @@ class RadiusClustering(BaseEstimator, ClusterMixin): The indices of the cluster centers. labels\_ : array-like, shape (n_samples,) The cluster labels for each point in the input data. - effective_radius : float + effective_radius\_ : float The maximum distance between any point and its assigned cluster center. + random_state\_ : int | None + The random state used for reproducibility. If None, no random state is set. + + .. note:: + The `random_state_` attribute is not used when the `manner` is set to "exact". + + .. versionadded:: 1.3.0 + The *random_state* parameter was added to allow reproducibility in + the approximate method. + + .. versionchanged:: 1.3.0 + All publicly accessible attributes are now suffixed with an underscore + (e.g., `centers_`, `labels_`). 
+ This is particularly useful for compatibility with scikit-learn's API. + + .. versionadded:: 1.3.0 + The `radius` parameter replaces the `threshold` parameter for setting + the dissimilarity threshold for better clarity and consistency. + + .. deprecated:: 1.3.0 + The `threshold` parameter is deprecated. Use `radius` instead. + Will be removed in a future version. """ - def __init__(self, manner="approx", threshold=0.5): + _estimator_type = "clusterer" + + def __init__( + self, + manner: str = "approx", + radius: float = 0.5, + threshold=None, + random_state: int | None = None, + ) -> None: + if threshold is not None: + warnings.warn( + "The 'threshold' parameter is deprecated and" + " will be removed in a future version." + "Please use 'radius' instead.", + DeprecationWarning, + stacklevel=2, + ) + radius = threshold + self.threshold = threshold # For backward compatibility self.manner = manner - self.threshold = threshold + self.radius = radius + self.random_state = random_state - def _check_symmetric(self, a, tol=1e-8): + def _check_symmetric(self, a: np.ndarray, tol: float = 1e-8) -> bool: if a.ndim != 2: raise ValueError("Input must be a 2D array.") if a.shape[0] != a.shape[1]: return False return np.allclose(a, a.T, atol=tol) - def fit(self, X, y=None): + def fit(self, X: np.ndarray, y: None = None) -> "RadiusClustering": """ Fit the MDS clustering model to the input data. + This method computes the distance matrix if the input is a feature matrix, + or uses the provided distance matrix directly if the input is already + a distance matrix. + + .. note:: + If the input is a distance matrix, it should be symmetric and square. + If the input is a feature matrix, the distance matrix + will be computed using Euclidean distance. + + .. tip:: + Next version will support providing different metrics or + even custom callables to compute the distance matrix. + Parameters: ----------- X : array-like, shape (n_samples, n_features) - The input data to cluster. 
+ The input data to cluster. X should be a 2D array-like structure. + It can either be : + - A distance matrix (symmetric, square) with shape (n_samples, n_samples). + - A feature matrix with shape (n_samples, n_features) + where the distance matrix will be computed. y : Ignored Not used, present here for API consistency by convention. @@ -91,23 +153,31 @@ def fit(self, X, y=None): For examples on common datasets and differences with kmeans, see :ref:`sphx_glr_auto_examples_plot_iris_example.py` """ - self.X = check_array(X) + self.X_checked_ = validate_data(self, X) # Create dist and adj matrices - if not self._check_symmetric(self.X): - dist_mat = pairwise_distances(self.X, metric="euclidean") + if not self._check_symmetric(self.X_checked_): + dist_mat = pairwise_distances(self.X_checked_, metric="euclidean") else: - dist_mat = self.X - adj_mask = np.triu((dist_mat <= self.threshold), k=1) - self.nb_edges = np.sum(adj_mask) - if self.nb_edges == 0: - self.centers_ = list(range(self.X.shape[0])) - self.labels_ = self.centers_ - self.effective_radius = 0 - self._mds_exec_time = 0 + dist_mat = self.X_checked_ + + if not isinstance(self.radius, (float, int)): + raise ValueError("Radius must be a positive float.") + if self.radius <= 0: + raise ValueError("Radius must be a positive float.") + adj_mask = np.triu((dist_mat <= self.radius), k=1) + self.nb_edges_ = np.sum(adj_mask) + if self.nb_edges_ == 0: + self.centers_ = list(range(self.X_checked_.shape[0])) + self.labels_ = np.array(self.centers_) + self.effective_radius_ = 0 + self.mds_exec_time_ = 0 return self - self.edges = np.argwhere(adj_mask).astype(np.uint32) #TODO: changer en uint32 - self.dist_mat = dist_mat + self.edges_ = np.argwhere(adj_mask).astype( + np.uint32 + ) # Edges in the adjacency matrix + # uint32 is used to use less memory. 
Max sample index is 2^32-1 + self.dist_mat_ = dist_mat self._clustering() self._compute_effective_radius() @@ -115,14 +185,20 @@ def fit(self, X, y=None): return self - def fit_predict(self, X, y=None): + def fit_predict(self, X: np.ndarray, y: None = None) -> np.ndarray: """ Fit the model and return the cluster labels. + This method is a convenience function that combines `fit` and `predict`. + Parameters: ----------- X : array-like, shape (n_samples, n_features) - The input data to cluster. + The input data to cluster. X should be a 2D array-like structure. + It can be either: + - A distance matrix (symmetric, square) with shape (n_samples, n_samples). + - A feature matrix with shape (n_samples, n_features) where + the distance matrix will be computed. y : Ignored Not used, present here for API consistency by convention. @@ -138,13 +214,16 @@ def _clustering(self): """ Perform the clustering using either the exact or approximate MDS method. """ - n = self.X.shape[0] + n = self.X_checked_.shape[0] + if self.manner != "exact" and self.manner != "approx": + print(f"Invalid manner: {self.manner}. Defaulting to 'approx'.") + raise ValueError("Invalid manner. Choose either 'exact' or 'approx'.") if self.manner == "exact": self._clustering_exact(n) else: self._clustering_approx(n) - def _clustering_exact(self, n): + def _clustering_exact(self, n: int) -> None: """ Perform exact MDS clustering. @@ -158,13 +237,34 @@ def _clustering_exact(self, n): This function uses the EMOS algorithm to solve the MDS problem. See: [jiang]_ for more details. """ - self.centers_, self._mds_exec_time = py_emos_main( - self.edges.flatten(), n, self.nb_edges + self.centers_, self.mds_exec_time_ = py_emos_main( + self.edges_.flatten(), n, self.nb_edges_ ) + self.centers_.sort() # Sort the centers to ensure consistent order - def _clustering_approx(self, n): + def _clustering_approx(self, n: int) -> None: """ Perform approximate MDS clustering.
+ This method uses a small trick to set the seed for + the random state of the C++ code of the MDS solver. + + .. tip:: + The random state is used to ensure reproducibility of the results + when using the approximate method. + If `random_state` is None, a default value of 42 is used. + + .. important:: + :collapsible: closed + The trick to set the random state is: + 1. Use the `check_random_state` function to get a `RandomState` + instance, seeded with the provided `random_state`. + 2. Use the `randint` method of the `RandomState` instance to generate a + random integer. + 3. Use this random integer as the seed for the C++ code of the MDS solver. + + This ensures that the seed passed to the C++ code is always an integer, + which is required by the MDS solver, and allows for + reproducibility of the results. Parameters: ----------- @@ -176,9 +276,15 @@ def _clustering_approx(self, n): This function uses the approximation method to solve the MDS problem. See [casado]_ for more details. """ - result = solve_mds(n, self.edges.flatten().astype(np.int32), self.nb_edges, "test") - self.centers_ = [x for x in result["solution_set"]] - self._mds_exec_time = result["Time"] + if self.random_state is None: + self.random_state = 42 + self.random_state_ = check_random_state(self.random_state) + seed = self.random_state_.randint(np.iinfo(np.int32).max) + result = solve_mds( + n, self.edges_.flatten().astype(np.int32), self.nb_edges_, seed + ) + self.centers_ = sorted([x for x in result["solution_set"]]) + self.mds_exec_time_ = result["Time"] def _compute_effective_radius(self): """ @@ -187,14 +293,14 @@ def _compute_effective_radius(self): The effective radius is the maximum radius among all clusters. That means EffRad = max(R(C_i)) for all i.
""" - self.effective_radius = np.min(self.dist_mat[:, self.centers_], axis=1).max() + self.effective_radius_ = np.min(self.dist_mat_[:, self.centers_], axis=1).max() def _compute_labels(self): """ Compute the cluster labels for each point in the dataset. """ - distances = self.dist_mat[:, self.centers_] + distances = self.dist_mat_[:, self.centers_] self.labels_ = np.argmin(distances, axis=1) min_dist = np.min(distances, axis=1) - self.labels_[min_dist > self.threshold] = -1 + self.labels_[min_dist > self.radius] = -1 diff --git a/src/radius_clustering/utils/mds.pyx b/src/radius_clustering/utils/mds.pyx index 488ae90..2be8f77 100644 --- a/src/radius_clustering/utils/mds.pyx +++ b/src/radius_clustering/utils/mds.pyx @@ -37,9 +37,9 @@ cdef extern from "mds_core.cpp": cpp_unordered_set[int] getSolutionSet() void setSolutionSet(cpp_unordered_set[int] solutionSet) - cdef Result iterated_greedy_wrapper(int numNodes, const vector[int]& edges_list, int nb_edges, string name) nogil + cdef Result iterated_greedy_wrapper(int numNodes, const vector[int]& edges_list, int nb_edges, long seed) nogil -def solve_mds(int num_nodes, np.ndarray[int, ndim=1, mode="c"] edges not None, int nb_edges, str name): +def solve_mds(int num_nodes, np.ndarray[int, ndim=1, mode="c"] edges not None, int nb_edges, int seed): """ Solve the Minimum Dominating Set problem for a given graph. 
@@ -64,15 +64,12 @@ def solve_mds(int num_nodes, np.ndarray[int, ndim=1, mode="c"] edges not None, i # Cast the NumPy array to a C++ vector cpp_edge_list.assign(&edges[0], &edges[0] + edges.shape[0]) - cdef string instanceName = name.encode('utf-8') - cdef Result result with nogil: - result = iterated_greedy_wrapper(num_nodes, cpp_edge_list, nb_edges, instanceName) + result = iterated_greedy_wrapper(num_nodes, cpp_edge_list, nb_edges, seed) # Convert the C++ Result to a Python dictionary py_result = { - "instance_name": result.getInstanceName().decode('utf-8'), "solution_set": set(result.getSolutionSet()), } diff --git a/src/radius_clustering/utils/mds_core.cpp b/src/radius_clustering/utils/mds_core.cpp index 039888e..9e44945 100644 --- a/src/radius_clustering/utils/mds_core.cpp +++ b/src/radius_clustering/utils/mds_core.cpp @@ -449,18 +449,18 @@ class Main { public: Main() : algorithm(constructive, localSearch) {} - Result execute(int numNodes, const std::vector& edges_list, int nb_edges, std::string name) { - Instance instance(numNodes, edges_list, nb_edges, name); - RandomManager::setSeed(13); + Result execute(int numNodes, const std::vector& edges_list, int nb_edges, long seed) { + Instance instance(numNodes, edges_list, nb_edges, "name"); + RandomManager::setSeed(seed); signal(SIGINT, signal_handler); return algorithm.execute(instance); } }; extern "C" { - inline Result iterated_greedy_wrapper(int numNodes, const std::vector& edges_list, int nb_edges, std::string name) { + inline Result iterated_greedy_wrapper(int numNodes, const std::vector& edges_list, int nb_edges, long seed) { static Main main; // Create a single static instance - return main.execute(numNodes, edges_list, nb_edges, name); + return main.execute(numNodes, edges_list, nb_edges, seed); } } \ No newline at end of file diff --git a/tests/test_integration.py b/tests/test_integration.py new file mode 100644 index 0000000..d6d7cee --- /dev/null +++ b/tests/test_integration.py @@ -0,0 +1,140 @@ 
+import pytest + +from radius_clustering import RadiusClustering +from sklearn import datasets + +X = datasets.fetch_openml(name="iris", version=1, parser="auto")["data"] + +def test_radius_clustering_approx(): + """ + Test the approximate method of the RadiusClustering class. + """ + clusterer = RadiusClustering(manner="approx", radius=1.43) + + assert clusterer.manner == "approx", "The manner should be 'approx'." + assert clusterer.radius == 1.43, "The radius should be 1.43." + assert clusterer.random_state is None, "The random state should be None by default." + assert clusterer._estimator_type == "clusterer", "The estimator type should be 'clusterer'." + assert clusterer._check_symmetric(X) is False, "The input should not be a symmetric distance matrix." + + clusterer.fit(X) + + assert clusterer.X_checked_ is not None, "X_checked_ should not be None after fitting." + assert clusterer.dist_mat_ is not None, "dist_mat_ should not be None after fitting." + assert clusterer.nb_edges_ > 0, "There should be edges in the graph." + assert clusterer.labels_ is not None, "Labels should not be None after fitting." + assert clusterer.centers_ is not None, "Centers should not be None after fitting." + assert clusterer.effective_radius_ > 0, "Effective radius should be greater than 0." + assert clusterer.mds_exec_time_ >= 0, "MDS execution time should be non-negative." + assert clusterer.edges_ is not None, "Edges should not be None after fitting." + assert clusterer.random_state == 42, "Random state should be set to 42 after fitting." + + results = clusterer.labels_ + assert len(results) == X.shape[0], "The number of labels should match the number of samples." + assert len(set(results)) <= X.shape[0], "The number of unique labels should not exceed the number of samples." + + +def test_radius_clustering_exact(): + """ + Test the exact method of the RadiusClustering class. 
+ """ + clusterer = RadiusClustering(manner="exact", radius=1.43) + + assert clusterer.manner == "exact", "The manner should be 'exact'." + assert clusterer.radius == 1.43, "The radius should be 1.43." + assert clusterer.random_state is None, "The random state should be None by default." + assert clusterer._estimator_type == "clusterer", "The estimator type should be 'clusterer'." + assert clusterer._check_symmetric(X) is False, "The input should not be a symmetric distance matrix." + + clusterer.fit(X) + + assert clusterer.X_checked_ is not None, "X_checked_ should not be None after fitting." + assert clusterer.dist_mat_ is not None, "dist_mat_ should not be None after fitting." + assert clusterer.nb_edges_ > 0, "There should be edges in the graph." + assert clusterer.labels_ is not None, "Labels should not be None after fitting." + assert clusterer.centers_ is not None, "Centers should not be None after fitting." + assert clusterer.effective_radius_ > 0, "Effective radius should be greater than 0." + assert clusterer.mds_exec_time_ >= 0, "MDS execution time should be non-negative." + assert clusterer.edges_ is not None, "Edges should not be None after fitting." + assert clusterer.random_state is None, "Random state should remain None." + + results = clusterer.labels_ + assert len(results) == X.shape[0], "The number of labels should match the number of samples." + assert len(set(results)) <= X.shape[0], "The number of unique labels should not exceed the number of samples." + +def test_radius_clustering_fit_predict(): + """ + Test the fit_predict method of the RadiusClustering class. + """ + clusterer = RadiusClustering(manner="approx", radius=1.43) + + assert clusterer.manner == "approx", "The manner should be 'approx'." + assert clusterer.radius == 1.43, "The radius should be 1.43." + assert clusterer.random_state is None, "The random state should be None by default." + assert clusterer._estimator_type == "clusterer", "The estimator type should be 'clusterer'." 
+ + labels = clusterer.fit_predict(X) + + assert labels is not None, "Labels should not be None after fit_predict." + assert len(labels) == X.shape[0], "The number of labels should match the number of samples." + assert len(set(labels)) <= X.shape[0], "The number of unique labels should not exceed the number of samples." + +def test_radius_clustering_fit_predict_exact(): + """ + Test the fit_predict method of the RadiusClustering class with exact method. + """ + clusterer = RadiusClustering(manner="exact", radius=1.43) + + assert clusterer.manner == "exact", "The manner should be 'exact'." + assert clusterer.radius == 1.43, "The radius should be 1.43." + assert clusterer.random_state is None, "The random state should be None by default." + assert clusterer._estimator_type == "clusterer", "The estimator type should be 'clusterer'." + + labels = clusterer.fit_predict(X) + + assert labels is not None, "Labels should not be None after fit_predict." + assert len(labels) == X.shape[0], "The number of labels should match the number of samples." + assert len(set(labels)) <= X.shape[0], "The number of unique labels should not exceed the number of samples." + +def test_radius_clustering_random_state(): + """ + Test the random state functionality of the RadiusClustering class. + """ + clusterer = RadiusClustering(manner="approx", radius=1.43, random_state=123) + + assert clusterer.random_state == 123, "The random state should be set to 123." + + # Fit the model + clusterer.fit(X) + + # Check that the random state is preserved + assert clusterer.random_state == 123, "The random state should remain 123 after fitting." 
+ + # Check that the results are consistent with the random state + labels1 = clusterer.labels_ + + # Re-initialize and fit again with the same random state + clusterer2 = RadiusClustering(manner="approx", radius=1.43, random_state=123) + clusterer2.fit(X) + + labels2 = clusterer2.labels_ + + assert (labels1 == labels2).all(), "Labels should be consistent across runs with the same random state." + +def test_deterministic_behavior(): + """ + Test the deterministic behavior of the RadiusClustering class with a fixed random state. + """ + clusterer1 = RadiusClustering(manner="approx", radius=1.43, random_state=42) + clusterer2 = RadiusClustering(manner="approx", radius=1.43, random_state=42) + + labels1 = clusterer1.fit_predict(X) + labels2 = clusterer2.fit_predict(X) + + assert (labels1 == labels2).all(), "Labels should be the same for two instances with the same random state." + + clusterer1 = RadiusClustering(manner="exact", radius=1.43) + clusterer2 = RadiusClustering(manner="exact", radius=1.43) + labels1 = clusterer1.fit_predict(X) + labels2 = clusterer2.fit_predict(X) + assert (labels1 == labels2).all(), "Labels should be the same for two exact instances." 
diff --git a/tests/test_rad.py b/tests/test_rad.py deleted file mode 100644 index c245068..0000000 --- a/tests/test_rad.py +++ /dev/null @@ -1,32 +0,0 @@ -def test_imports(): - import radius_clustering as rad - - -def test_from_import(): - from radius_clustering import RadiusClustering - - -def test_radius_clustering_approx(): - from radius_clustering import RadiusClustering - from sklearn import datasets - - # Load the Iris dataset - iris = datasets.fetch_openml(name="iris", version=1, parser="auto") - X = iris["data"] # Use dictionary-style access instead of attribute access - - graph_mds_api_consistent = RadiusClustering(manner="approx", threshold=1.43) - - result_api_consistent = graph_mds_api_consistent.fit_predict(X) - - -def test_radius_clustering_exact(): - from radius_clustering import RadiusClustering - from sklearn import datasets - - # Load the Iris dataset - iris = datasets.fetch_openml(name="iris", version=1, parser="auto") - X = iris["data"] # Use dictionary-style access instead of attribute access - - graph_mds_api_consistent = RadiusClustering(manner="exact", threshold=1.43) - - result_api_consistent = graph_mds_api_consistent.fit_predict(X) diff --git a/tests/test_regression.py b/tests/test_regression.py new file mode 100644 index 0000000..4f6c5dc --- /dev/null +++ b/tests/test_regression.py @@ -0,0 +1,64 @@ +import pytest +import numpy as np +from radius_clustering import RadiusClustering +from sklearn.datasets import load_iris + +@pytest.fixture +def iris_data(): + """Fixture to load the Iris dataset.""" + data = load_iris() + return data.data + +@pytest.fixture +def approx_results(): + """Fixture to store results for approximate clustering.""" + results = { + 'labels': [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,2,2,2,2,1,2,2,2,2, + 
2,2,1,1,2,2,2,2,1,2,1,2,1,2,2,1,1,2,2,2,2,2,1,2,2,2,2,1,2,2,2,1,2,2,2,1,2, + 2,1], + "centers": [0,96,125], + "time" : 0.0280, + "effective_radius": 1.4282856857085722 + } + return results + +@pytest.fixture +def exact_results(): + """Fixture to store results for exact clustering.""" + results = { + 'labels':[ + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,2,2,2,2,1,2,2,2,2, + 2,2,1,1,2,2,2,2,1,2,1,2,1,2,2,1,1,2,2,2,2,2,1,2,2,2,2,1,2,2,2,1,2,2,2,1,2, + 2,1 + ], + "centers": [0, 96, 102], + "time": 0.0004, + "effective_radius": 1.4282856857085722 + } + return results + +def assert_results_exact(results, expected): + """Helper function to assert clustering results.""" + assert_results(results, expected) + assert set(results.labels_) == set(expected['labels']), "Labels do not match expected" + assert results.centers_ == expected['centers'], "Centers do not match expected" + assert np.sum(results.labels_ - expected['labels']) == 0, "Labels do not match expected" + +def assert_results(results, expected): + assert len(results.labels_) == len(expected['labels']), "Labels length mismatch" + assert abs(results.mds_exec_time_ - expected['time']) < 0.1, "Execution time mismatch by more than 10%" + assert abs(results.effective_radius_ - expected['effective_radius'])/results.effective_radius_ < 0.1, "Effective radius mismatch" + +def test_exact(iris_data, exact_results): + """Test the RadiusClustering with exact""" + clustering = RadiusClustering(radius=1.43, manner='exact').fit(iris_data) + assert_results_exact(clustering, exact_results) + +def test_approx(iris_data, approx_results): + """Test the RadiusClustering with approx.""" + clustering = RadiusClustering(radius=1.43, manner='approx').fit(iris_data) + assert_results(clustering, approx_results) diff --git a/tests/test_structural.py b/tests/test_structural.py 
new file mode 100644 index 0000000..1401eac --- /dev/null +++ b/tests/test_structural.py @@ -0,0 +1,18 @@ +from logging import getLogger + +logger = getLogger(__name__) +logger.setLevel("INFO") + +def test_import(): + import radius_clustering as rad + + +def test_from_import(): + from radius_clustering import RadiusClustering + +def test_check_estimator_api_consistency(): + from radius_clustering import RadiusClustering + from sklearn.utils.estimator_checks import check_estimator + + # Check the API consistency of the RadiusClustering estimator + check_estimator(RadiusClustering()) diff --git a/tests/test_unit.py b/tests/test_unit.py new file mode 100644 index 0000000..52e874f --- /dev/null +++ b/tests/test_unit.py @@ -0,0 +1,93 @@ +from radius_clustering import RadiusClustering +import pytest + +def test_symmetric(): + """ + Test that the RadiusClustering class can handle symmetric distance matrices. + """ + import numpy as np + + # Check 1D array input + + X = np.array([0,1]) + with pytest.raises(ValueError): + RadiusClustering(manner="exact", radius=1.5)._check_symmetric(X) + + # Check a symmetric distance matrix + X = np.array([[0, 1, 2], + [1, 0, 1], + [2, 1, 0]]) + + clustering = RadiusClustering(manner="exact", radius=1.5) + assert clustering._check_symmetric(X), "The matrix should be symmetric." + + # Check a non-symmetric distance matrix + X_assym = np.array([[0, 1, 2], + [1, 0, 1], + [2, 2, 3]]) # This is not symmetric + assert not clustering._check_symmetric(X_assym), "The matrix should not be symmetric." + + # check a non-square matrix + X_non_square = np.array([[0, 1], + [1, 0], + [2, 1]]) # This is not square + + assert not clustering._check_symmetric(X_non_square), "The matrix should not be symmetric." + + +def test_fit(): + """ + Test that the RadiusClustering class can fit to a distance matrix and to a feature matrix. + This test checks both the exact and approximate methods of clustering. 
+ """ + import numpy as np + + # Create a symmetric distance matrix + X = np.array([[0, 1, 2], + [1, 0, 1], + [2, 1, 0]]) + + clustering = RadiusClustering(manner="exact", radius=1.5) + clustering.fit(X) + + # Check that the labels are assigned correctly + assert len(clustering.labels_) == X.shape[0], "Labels length should match number of samples." + assert clustering.nb_edges_ > 0, "There should be edges in the graph." + assert np.array_equal(clustering.X_checked_, clustering.dist_mat_), "X_checked_ should be equal to dist_mat_ because X is a distance matrix." + + # Create a feature matrix + X_features = np.array([[0, 1], + [1, 0], + [2, 1]]) + + clustering = RadiusClustering(manner="approx", radius=1.5) + clustering.fit(X_features) + + # Check that the labels are assigned correctly + assert len(clustering.labels_) == X_features.shape[0], "Labels length should match number of samples." + assert clustering.nb_edges_ > 0, "There should be edges in the graph." + assert clustering._check_symmetric(clustering.dist_mat_), "Distance matrix should be symmetric after computed from features." + +def test_radius_clustering_invalid_manner(): + """ + Test that an error is raised when an invalid manner is provided. + """ + with pytest.raises(ValueError, match="Invalid manner. Choose either 'exact' or 'approx'."): + RadiusClustering(manner="invalid", radius=1.43).fit([[0, 1], [1, 0], [2, 1]]) + + with pytest.raises(ValueError, match="Invalid manner. Choose either 'exact' or 'approx'."): + RadiusClustering(manner="", radius=1.43).fit([[0, 1], [1, 0], [2, 1]]) + + +def test_radius_clustering_invalid_radius(): + """ + Test that an error is raised when an invalid radius is provided. 
+ """ + with pytest.raises(ValueError, match="Radius must be a positive float."): + RadiusClustering(manner="exact", radius=-1.0).fit([[0, 1], [1, 0], [2, 1]]) + + with pytest.raises(ValueError, match="Radius must be a positive float."): + RadiusClustering(manner="approx", radius=0.0).fit([[0, 1], [1, 0], [2, 1]]) + + with pytest.raises(ValueError, match="Radius must be a positive float."): + RadiusClustering(manner="exact", radius="invalid").fit([[0, 1], [1, 0], [2, 1]]) \ No newline at end of file