From 2cc363d7885d5d2ed0060db2d58c47c9991a2faf Mon Sep 17 00:00:00 2001 From: Quentin Date: Tue, 17 Jun 2025 16:15:40 +0200 Subject: [PATCH 1/6] API modifications for scikit-learn --- .gitignore | 2 +- src/radius_clustering/radius_clustering.py | 111 +++++++++++++++------ src/radius_clustering/utils/mds.pyx | 9 +- src/radius_clustering/utils/mds_core.cpp | 10 +- tests/test_rad.py | 11 ++ 5 files changed, 100 insertions(+), 43 deletions(-) diff --git a/.gitignore b/.gitignore index 4477aa1..4cf08a2 100644 --- a/.gitignore +++ b/.gitignore @@ -3,7 +3,7 @@ docs/build/ # env and caches -mdsenv/ +mds-env/ **/__pycache__/ .pytest_cache/ .ruff_cache/ diff --git a/src/radius_clustering/radius_clustering.py b/src/radius_clustering/radius_clustering.py index db49a56..7353661 100644 --- a/src/radius_clustering/radius_clustering.py +++ b/src/radius_clustering/radius_clustering.py @@ -12,7 +12,7 @@ import numpy as np from sklearn.metrics import pairwise_distances from sklearn.base import BaseEstimator, ClusterMixin -from sklearn.utils.validation import check_array +from sklearn.utils.validation import check_array, validate_data, check_random_state from radius_clustering.utils._emos import py_emos_main from radius_clustering.utils._mds_approx import solve_mds @@ -20,7 +20,7 @@ DIR_PATH = os.path.dirname(os.path.realpath(__file__)) -class RadiusClustering(BaseEstimator, ClusterMixin): +class RadiusClustering(ClusterMixin, BaseEstimator): """ Radius Clustering algorithm. @@ -42,29 +42,56 @@ class RadiusClustering(BaseEstimator, ClusterMixin): The indices of the cluster centers. labels\_ : array-like, shape (n_samples,) The cluster labels for each point in the input data. - effective_radius : float + effective_radius\_ : float The maximum distance between any point and its assigned cluster center. + random_state\_ : int | None + The random state used for reproducibility. If None, no random state is set. + + .. 
note:: + The `random_state_` attribute is not used when the `manner` is set to "exact". + + .. versionadded:: 1.3.0 + The *random_state* parameter was added to allow reproducibility in the approximate method. + + .. versionchanged:: 1.3.0 + All publicly accessible attributes are now suffixed with an underscore (e.g., `centers_`, `labels_`). + This is particularly useful for compatibility with scikit-learn's API. """ - def __init__(self, manner="approx", threshold=0.5): + _estimator_type = "clusterer" + + def __init__(self, manner: str ="approx", threshold: float =0.5, random_state: int | None = None) -> None: self.manner = manner self.threshold = threshold + self.random_state = random_state - def _check_symmetric(self, a, tol=1e-8): + def _check_symmetric(self, a: np.ndarray, tol: float =1e-8) -> bool: if a.ndim != 2: raise ValueError("Input must be a 2D array.") if a.shape[0] != a.shape[1]: return False return np.allclose(a, a.T, atol=tol) - def fit(self, X, y=None): + def fit(self, X: np.ndarray, y: None = None) -> "RadiusClustering": """ Fit the MDS clustering model to the input data. + This method computes the distance matrix if the input is a feature matrix, + or uses the provided distance matrix directly if the input is already a distance matrix. + + .. note:: + If the input is a distance matrix, it should be symmetric and square. + If the input is a feature matrix, the distance matrix will be computed using Euclidean distance. + + .. tip:: + Next version will support providing different metrics or even custom callables to compute the distance matrix. + Parameters: ----------- X : array-like, shape (n_samples, n_features) - The input data to cluster. + The input data to cluster. X should be a 2D array-like structure. It can either be : + - A distance matrix (symmetric, square) with shape (n_samples, n_samples). + - A feature matrix with shape (n_samples, n_features) where the distance matrix will be computed. 
y : Ignored Not used, present here for API consistency by convention. @@ -91,23 +118,24 @@ def fit(self, X, y=None): For examples on common datasets and differences with kmeans, see :ref:`sphx_glr_auto_examples_plot_iris_example.py` """ - self.X = check_array(X) + self.X_checked_ = validate_data(self, X) # Create dist and adj matrices - if not self._check_symmetric(self.X): - dist_mat = pairwise_distances(self.X, metric="euclidean") + if not self._check_symmetric(self.X_checked_): + dist_mat = pairwise_distances(self.X_checked_, metric="euclidean") else: - dist_mat = self.X + dist_mat = self.X_checked_ adj_mask = np.triu((dist_mat <= self.threshold), k=1) - self.nb_edges = np.sum(adj_mask) - if self.nb_edges == 0: - self.centers_ = list(range(self.X.shape[0])) - self.labels_ = self.centers_ - self.effective_radius = 0 - self._mds_exec_time = 0 + self.nb_edges_ = np.sum(adj_mask) + if self.nb_edges_ == 0: + self.centers_ = list(range(self.X_checked_.shape[0])) + self.labels_ = np.array(self.centers_) + self.effective_radius_ = 0 + self.mds_exec_time_ = 0 return self - self.edges = np.argwhere(adj_mask).astype(np.uint32) #TODO: changer en uint32 - self.dist_mat = dist_mat + self.edges_ = np.argwhere(adj_mask).astype(np.uint32) # Edges in the adjacency matrix + # uint32 is used to use less memory. Max number of features is 2^32-1 + self.dist_mat_ = dist_mat self._clustering() self._compute_effective_radius() @@ -115,14 +143,18 @@ def fit(self, X, y=None): return self - def fit_predict(self, X, y=None): + def fit_predict(self, X: np.ndarray, y: None = None) -> np.ndarray: """ Fit the model and return the cluster labels. + This method is a convenience function that combines `fit` and `predict`. + Parameters: ----------- X : array-like, shape (n_samples, n_features) - The input data to cluster. + The input data to cluster. X should be a 2D array-like structure. It can either be : + - A distance matrix (symmetric, square) with shape (n_samples, n_samples). 
+ - A feature matrix with shape (n_samples, n_features) where the distance matrix will be computed. y : Ignored Not used, present here for API consistency by convention. @@ -138,13 +170,13 @@ def _clustering(self): """ Perform the clustering using either the exact or approximate MDS method. """ - n = self.X.shape[0] + n = self.X_checked_.shape[0] if self.manner == "exact": self._clustering_exact(n) else: self._clustering_approx(n) - def _clustering_exact(self, n): + def _clustering_exact(self, n: int) -> None: """ Perform exact MDS clustering. @@ -158,13 +190,26 @@ def _clustering_exact(self, n): This function uses the EMOS algorithm to solve the MDS problem. See: [jiang]_ for more details. """ - self.centers_, self._mds_exec_time = py_emos_main( - self.edges.flatten(), n, self.nb_edges + self.centers_, self.mds_exec_time_ = py_emos_main( + self.edges_.flatten(), n, self.nb_edges_ ) - def _clustering_approx(self, n): + def _clustering_approx(self, n: int) -> None: """ - Perform approximate MDS clustering. + Perform approximate MDS clustering. This method uses a pretty trick to set the seed for the random state of the C++ code of the MDS solver. + + .. tip:: + The random state is used to ensure reproducibility of the results when using the approximate method. + If `random_state` is None, a default value of 42 is used. + + .. important:: + :collapsible: closed + The trick to set the random state is : + 1. Use the `check_random_state` function to get a `RandomState`singleton instance, set up with the provided `random_state`. + 2. Use the `randint` method of the `RandomState` instance to generate a random integer. + 3. Use this random integer as the seed for the C++ code of the MDS solver. + + This ensures that the seed passed to the C++ code is always an integer, which is required by the MDS solver, and allows for reproducibility of the results. 
Parameters: ----------- @@ -176,9 +221,13 @@ def _clustering_approx(self, n): This function uses the approximation method to solve the MDS problem. See [casado]_ for more details. """ - result = solve_mds(n, self.edges.flatten().astype(np.int32), self.nb_edges, "test") + if self.random_state is None: + self.random_state = 42 + self.random_state_ = check_random_state(self.random_state) + seed = self.random_state_.randint(np.iinfo(np.int32).max) + result = solve_mds(n, self.edges_.flatten().astype(np.int32), self.nb_edges_, seed) self.centers_ = [x for x in result["solution_set"]] - self._mds_exec_time = result["Time"] + self.mds_exec_time_ = result["Time"] def _compute_effective_radius(self): """ @@ -187,13 +236,13 @@ def _compute_effective_radius(self): The effective radius is the maximum radius among all clusters. That means EffRad = max(R(C_i)) for all i. """ - self.effective_radius = np.min(self.dist_mat[:, self.centers_], axis=1).max() + self.effective_radius_ = np.min(self.dist_mat_[:, self.centers_], axis=1).max() def _compute_labels(self): """ Compute the cluster labels for each point in the dataset. 
""" - distances = self.dist_mat[:, self.centers_] + distances = self.dist_mat_[:, self.centers_] self.labels_ = np.argmin(distances, axis=1) min_dist = np.min(distances, axis=1) diff --git a/src/radius_clustering/utils/mds.pyx b/src/radius_clustering/utils/mds.pyx index 488ae90..2be8f77 100644 --- a/src/radius_clustering/utils/mds.pyx +++ b/src/radius_clustering/utils/mds.pyx @@ -37,9 +37,9 @@ cdef extern from "mds_core.cpp": cpp_unordered_set[int] getSolutionSet() void setSolutionSet(cpp_unordered_set[int] solutionSet) - cdef Result iterated_greedy_wrapper(int numNodes, const vector[int]& edges_list, int nb_edges, string name) nogil + cdef Result iterated_greedy_wrapper(int numNodes, const vector[int]& edges_list, int nb_edges, long seed) nogil -def solve_mds(int num_nodes, np.ndarray[int, ndim=1, mode="c"] edges not None, int nb_edges, str name): +def solve_mds(int num_nodes, np.ndarray[int, ndim=1, mode="c"] edges not None, int nb_edges, int seed): """ Solve the Minimum Dominating Set problem for a given graph. 
@@ -64,15 +64,12 @@ def solve_mds(int num_nodes, np.ndarray[int, ndim=1, mode="c"] edges not None, i # Cast the NumPy array to a C++ vector cpp_edge_list.assign(&edges[0], &edges[0] + edges.shape[0]) - cdef string instanceName = name.encode('utf-8') - cdef Result result with nogil: - result = iterated_greedy_wrapper(num_nodes, cpp_edge_list, nb_edges, instanceName) + result = iterated_greedy_wrapper(num_nodes, cpp_edge_list, nb_edges, seed) # Convert the C++ Result to a Python dictionary py_result = { - "instance_name": result.getInstanceName().decode('utf-8'), "solution_set": set(result.getSolutionSet()), } diff --git a/src/radius_clustering/utils/mds_core.cpp b/src/radius_clustering/utils/mds_core.cpp index 039888e..9e44945 100644 --- a/src/radius_clustering/utils/mds_core.cpp +++ b/src/radius_clustering/utils/mds_core.cpp @@ -449,18 +449,18 @@ class Main { public: Main() : algorithm(constructive, localSearch) {} - Result execute(int numNodes, const std::vector& edges_list, int nb_edges, std::string name) { - Instance instance(numNodes, edges_list, nb_edges, name); - RandomManager::setSeed(13); + Result execute(int numNodes, const std::vector& edges_list, int nb_edges, long seed) { + Instance instance(numNodes, edges_list, nb_edges, "name"); + RandomManager::setSeed(seed); signal(SIGINT, signal_handler); return algorithm.execute(instance); } }; extern "C" { - inline Result iterated_greedy_wrapper(int numNodes, const std::vector& edges_list, int nb_edges, std::string name) { + inline Result iterated_greedy_wrapper(int numNodes, const std::vector& edges_list, int nb_edges, long seed) { static Main main; // Create a single static instance - return main.execute(numNodes, edges_list, nb_edges, name); + return main.execute(numNodes, edges_list, nb_edges, seed); } } \ No newline at end of file diff --git a/tests/test_rad.py b/tests/test_rad.py index c245068..80f4834 100644 --- a/tests/test_rad.py +++ b/tests/test_rad.py @@ -1,3 +1,8 @@ +from logging import getLogger + 
+logger = getLogger(__name__) +logger.setLevel("INFO") + def test_imports(): import radius_clustering as rad @@ -5,6 +10,12 @@ def test_imports(): def test_from_import(): from radius_clustering import RadiusClustering +def test_check_estimator_api_consistency(): + from radius_clustering import RadiusClustering + from sklearn.utils.estimator_checks import check_estimator + + # Check the API consistency of the RadiusClustering estimator + stats = check_estimator(RadiusClustering()) def test_radius_clustering_approx(): from radius_clustering import RadiusClustering From 6e33df4ba3580336da5fcef5833e903c811a3024 Mon Sep 17 00:00:00 2001 From: Quentin Date: Wed, 18 Jun 2025 07:33:06 +0200 Subject: [PATCH 2/6] tests --- .coverage | Bin 0 -> 53248 bytes .gitignore | 3 + pyproject.toml | 15 +++ src/radius_clustering/__init__.py | 4 +- src/radius_clustering/radius_clustering.py | 26 +++- tests/test_integration.py | 140 +++++++++++++++++++++ tests/test_rad.py | 43 ------- tests/test_regression.py | 61 +++++++++ tests/test_structural.py | 18 +++ tests/test_unit.py | 93 ++++++++++++++ 10 files changed, 351 insertions(+), 52 deletions(-) create mode 100644 .coverage create mode 100644 tests/test_integration.py delete mode 100644 tests/test_rad.py create mode 100644 tests/test_regression.py create mode 100644 tests/test_structural.py create mode 100644 tests/test_unit.py diff --git a/.coverage b/.coverage new file mode 100644 index 0000000000000000000000000000000000000000..f87d0eb6bd52cf48126498f1ce7bcd7807f26716 GIT binary patch literal 53248 zcmeI5dyo{x9mjiikK5UvM{~RuL}Yxxac~dM2M8+OQ|^xAeSqL&+1s0gg}q(a-4jp} zJl>)L8ng&WtcsN~espQcy7@My<`Sce?xbp`if2O-z0a4k086FAH9B3xFt+ z0eor&dMFtrm3zu8wt|?&j}dmCwTM-iKewhDZ<$@e{l?RQNH7XJa6u9Z)Lcu7xT#rea-@)K+JM zyR|%@K^!gPZpnBFP>CB`o8UQ|X`*)TV2M0A~-E_WiThtVXob4$kSZ7SJSl}S4BSR&ue zL^_sRj#f^4n+>SvzSk9rz4d4~_2I^YFJG90tDUMY@r@!zZB}H~oxDy(eNJ^~z z1^J<&aK6on?yhz~zP`I_BR@G%!aHy#IaJA^%4U+Hv)}sM400~}`#a!tO62^EdZoG( 
zu}$5sI{D>Jf64NEHxkKmhj#)-z!al6G<&p0U{tzW(wpM&eJdR4&Ls137*2k;8s5xZ zC3+w5Q+}XuJmL0OCapwpz@SmdLku0hf9P?;XUV`p!e>!D?j$?&Q-+szypsc? zxp2-@Nf?n7pJ+}}WQ3M}ucM7O7TVyn$CK>?$69{Gq+OUGuolQ{o!yBjpDJnIhiLMC zHx*gyR*JpevhaNmBp5o?!QxQkJC27xu;y`HPL~TbfKa9lL5+8v9Lr9*< zL6%XutILg~yHj!@`KCoumgrw4zqs5&Xb|xr6Y<%w~igE;k*gqYW`ll^KM_h$$KZ{ z>36}R&tLulP39^6O`h*=t++oDht*_bB$k3H3u~}<1#o!AjtFFqv?seW`SHuc{P5)0 zp8ocF74*lFJ>)9OXNT#%IGIkz;IhguTj@D6)hS&UN%K`GUFx(alW{ka@U9I|qB#x6 zjHR7?8!o*n-O*o1dMmGE?fs?sHpz<%U*_9j(#iP+qN^*C=n&T{ei`;^g|(kgPJSG3 z5rKkEDNM>VOwq5^DJvCw`HCg$uiOYsc&%NL4sW{ilTuLQ=5_8R)p=K=hDgTE#JXIE zXUR7vKd?4O(oQtx!cMrBmX-RyX&CUc6|Uy*+$HP$t6a}yEfAV9L+j}llhIpe!fCju zcSU5gMd%waNWQkSf4nS$1sA^U6pjXEj^M~0%Y&8-< z0!RP}AOR$R1dsp{Kmter2_OL^APJcMNt*l>z(@TPgW^X3e*fQZ!{7fOW6R(JHza@r zkN^@u0!RP}AOR$R1dsp{Kmx;!zzTm6>8Y+-o`%N^RKe3+67ayFs)l5=y9;ExDi!I7 zb*ID8cy~JErr>EjRq0f8kksx>ES|0khhvFYCLD%G5Ulb+y|XU6-b;=TRWH_{>M569 zbwOkK{eQpxC1J-|8+_n~1dsp{Kmter2_OL^fCP{L5FkN^@u0!RP}AOR$R1dsp{Kmtf$5CZ)EKl=Ycz+e&*Kmter2_OL^fCP{L5Ta1$NtX#!cMa{*emP>_FMK0JHigIUF-q2k#(|r*izQS=CYgE zbmp*e%w#_M-}X8CZ}y+mx7nL)*Is2Wuxss`?J|3kU1E>0 zKDR!y-n0H_y>9hcS?gKrsCB^FX>GCM)*5TM)nd)HZm_0WS6H^?GcTI|FyA%bHh*uv zWF9l0F`qK`n?E*FW~aH@TxiyrGtKGdRpuzO*!Yie&NySdW4vMX8Cm05o*=$b!A*Mso+d|Du<(U{Q1 zT7a$-x=1UgYlZe}MZr$4H6O{HrLLEp(=)e`eo1y zg+57MrVEtrqpd>sk`r{k&?iVQZ4vrKNN*PU1@azt&joK6S&n3b zw+VSTOK%l9o+3x+9HE~iPtn;zA0|)ITZDds91hOn8jg7|OJ|DYL*xm1v(N{~L3)$W zkC6lPMxh@ikI@^1euO+qtA*ZA9--GOeTY^G{V>^2D}~-i9;OvS?;(3>xzhXS454?C z-E_LrduW-^JIJo!G|-yQx1H>ur6TzuNS-Q_AJV6Asrj~L_3MP%u3zh^)?WP@Pqnn_ zS9>bEE$DFN^DQJt^~qjhbFY4tr<$7eNuHYDt6%AD~kKF(A14faj@D&BLyvX_Z{r@;U!~Oq9?0xnpW2Uj#c-wT$*R5;ppV?=@>#t;|*sJU% z_#Xm(#eNFE2<&8ASQo2h?W~o}WHZ=UM(v#anf*KaT{~-^wh!5_*}Ls$?WFyH9kpBR z7544+wf6PaI6G*Mv#9l@bsl~NIAgtLowSZwzqIyQhph*#CM#h@tTyX5tIGV!{MdZQ zd>MTIaq~%Yj=9f#(2SS~v(21mR+yuW&%yV9X#CZ9)qLMLVf@B;+IZa91)hJsaj#Kt zOgAPOBMm>jNYBs@zpJGl>qP=c00|%gB!C2v0226LB@irzZ^PV+$GOtMP3Jb4pum*p zZTWf?fG>}!0F3!M6@W8eYw$8)OTe0U${pOByE5SByh8>#aOY7O5U}U%Dgb{TQ2`kA 
zH3h+iunNGU-=_lb==Z7sO!_@40GEEZ3c#kXmI3$at7JgHsNbamaOx{n09Jj43c#x` zSHYX)RGSRATVEyv0(O0=3c#-~Q2`kC#VP>DzDNaN*%uZBeG60oroB}K;M(V_0Bn1U z3c$BF7X&ApQ~=I>o(#Bkzf%STynCYxz`Qr80Ni`M3c$YCsq?!xTdVZ$Y>mvs9sC_K zAYkF=s$561w<~=(dz;DxF8)>(fQ_G{0`T#(RRBi*7FGUWc9z-!R(_@mz{}sP0x?Xe#BI^4@-mlH0G7vbFbGDO*~vkutmOYFPysd`BiX_fD3wsrf1? z=l4#MvSt32QZ}_rl(M(!3MuC;oFHXm-*_n-8beanH;j|AuYRnQb@RqZSyMM!%1~~U zl;dkkq&yoMDP?FplX6VRmU7e>OUjZ_rj)G2kkV#UO3MyP3C3SaaQ=K@r+PyP4%_#1!z|9=6s%dOi0 literal 0 HcmV?d00001 diff --git a/.gitignore b/.gitignore index 4cf08a2..9888adb 100644 --- a/.gitignore +++ b/.gitignore @@ -25,5 +25,8 @@ docs/source/modules/generated/ **/emos.c **/mds.cpp +# MAC OS files +.DS_Store + # Reportings reporting/ \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 175e565..4be7195 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,6 +52,7 @@ documentation = "https://lias-laboratory.github.io/radius_clustering/" [project.optional-dependencies] dev = [ "pytest>=8.3.3", + "pytest-cov>=5.0.0", "pandas", "cython>=3.0", "setuptools>= 61.0", @@ -80,8 +81,22 @@ pythonpath = "src" testpaths = ["tests"] addopts = [ "--import-mode=importlib", + "--cov=src/radius_clustering", + "--cov-report=term-missing", + "--cov-report=html:coverage_html_report", ] +[tool.coverage.run] +source = ["src/radius_clustering"] +branch = true + +[tool.coverage.report] +show_missing = true + +[tool.coverage.html] +directory = "coverage_html_report" +title = "Coverage Report" + [tool.ruff] # Exclude a variety of commonly ignored directories. 
exclude = [ diff --git a/src/radius_clustering/__init__.py b/src/radius_clustering/__init__.py index 3ebc26e..6d528e4 100644 --- a/src/radius_clustering/__init__.py +++ b/src/radius_clustering/__init__.py @@ -2,6 +2,4 @@ from .radius_clustering import RadiusClustering __all__ = ["RadiusClustering"] - -# Optionally, you can set a version number for your package -__version__ = "1.2.2" +__version__ = "1.2.3" diff --git a/src/radius_clustering/radius_clustering.py b/src/radius_clustering/radius_clustering.py index 7353661..8753e92 100644 --- a/src/radius_clustering/radius_clustering.py +++ b/src/radius_clustering/radius_clustering.py @@ -31,7 +31,7 @@ class RadiusClustering(ClusterMixin, BaseEstimator): ----------- manner : str, optional (default="approx") The method to use for solving the MDS problem. Can be "exact" or "approx". - threshold : float, optional (default=0.5) + radius : float, optional (default=0.5) The dissimilarity threshold to act as radius constraint for the clustering. Attributes: @@ -56,13 +56,16 @@ class RadiusClustering(ClusterMixin, BaseEstimator): .. versionchanged:: 1.3.0 All publicly accessible attributes are now suffixed with an underscore (e.g., `centers_`, `labels_`). This is particularly useful for compatibility with scikit-learn's API. + + .. versionchanged:: 1.3.0 + The `threshold` parameter was renamed to `radius` to better reflect its purpose. 
""" _estimator_type = "clusterer" - def __init__(self, manner: str ="approx", threshold: float =0.5, random_state: int | None = None) -> None: + def __init__(self, manner: str ="approx", radius: float =0.5, random_state: int | None = None) -> None: self.manner = manner - self.threshold = threshold + self.radius = radius self.random_state = random_state def _check_symmetric(self, a: np.ndarray, tol: float =1e-8) -> bool: @@ -125,7 +128,12 @@ def fit(self, X: np.ndarray, y: None = None) -> "RadiusClustering": dist_mat = pairwise_distances(self.X_checked_, metric="euclidean") else: dist_mat = self.X_checked_ - adj_mask = np.triu((dist_mat <= self.threshold), k=1) + + if not isinstance(self.radius, (float, int)): + raise ValueError("Radius must be a positive float.") + if self.radius <= 0: + raise ValueError("Radius must be a positive float.") + adj_mask = np.triu((dist_mat <= self.radius), k=1) self.nb_edges_ = np.sum(adj_mask) if self.nb_edges_ == 0: self.centers_ = list(range(self.X_checked_.shape[0])) @@ -171,6 +179,11 @@ def _clustering(self): Perform the clustering using either the exact or approximate MDS method. """ n = self.X_checked_.shape[0] + if self.manner != "exact" and self.manner != "approx": + print(f"Invalid manner: {self.manner}. Defaulting to 'approx'.") + raise ValueError( + "Invalid manner. Choose either 'exact' or 'approx'." 
+ ) if self.manner == "exact": self._clustering_exact(n) else: @@ -193,6 +206,7 @@ def _clustering_exact(self, n: int) -> None: self.centers_, self.mds_exec_time_ = py_emos_main( self.edges_.flatten(), n, self.nb_edges_ ) + self.centers_.sort() # Sort the centers to ensure consistent order def _clustering_approx(self, n: int) -> None: """ @@ -226,7 +240,7 @@ def _clustering_approx(self, n: int) -> None: self.random_state_ = check_random_state(self.random_state) seed = self.random_state_.randint(np.iinfo(np.int32).max) result = solve_mds(n, self.edges_.flatten().astype(np.int32), self.nb_edges_, seed) - self.centers_ = [x for x in result["solution_set"]] + self.centers_ = sorted([x for x in result["solution_set"]]) self.mds_exec_time_ = result["Time"] def _compute_effective_radius(self): @@ -246,4 +260,4 @@ def _compute_labels(self): self.labels_ = np.argmin(distances, axis=1) min_dist = np.min(distances, axis=1) - self.labels_[min_dist > self.threshold] = -1 + self.labels_[min_dist > self.radius] = -1 diff --git a/tests/test_integration.py b/tests/test_integration.py new file mode 100644 index 0000000..d6d7cee --- /dev/null +++ b/tests/test_integration.py @@ -0,0 +1,140 @@ +import pytest + +from radius_clustering import RadiusClustering +from sklearn import datasets + +X = datasets.fetch_openml(name="iris", version=1, parser="auto")["data"] + +def test_radius_clustering_approx(): + """ + Test the approximate method of the RadiusClustering class. + """ + clusterer = RadiusClustering(manner="approx", radius=1.43) + + assert clusterer.manner == "approx", "The manner should be 'approx'." + assert clusterer.radius == 1.43, "The radius should be 1.43." + assert clusterer.random_state is None, "The random state should be None by default." + assert clusterer._estimator_type == "clusterer", "The estimator type should be 'clusterer'." + assert clusterer._check_symmetric(X) is False, "The input should not be a symmetric distance matrix." 
+ + clusterer.fit(X) + + assert clusterer.X_checked_ is not None, "X_checked_ should not be None after fitting." + assert clusterer.dist_mat_ is not None, "dist_mat_ should not be None after fitting." + assert clusterer.nb_edges_ > 0, "There should be edges in the graph." + assert clusterer.labels_ is not None, "Labels should not be None after fitting." + assert clusterer.centers_ is not None, "Centers should not be None after fitting." + assert clusterer.effective_radius_ > 0, "Effective radius should be greater than 0." + assert clusterer.mds_exec_time_ >= 0, "MDS execution time should be non-negative." + assert clusterer.edges_ is not None, "Edges should not be None after fitting." + assert clusterer.random_state == 42, "Random state should be set to 42 after fitting." + + results = clusterer.labels_ + assert len(results) == X.shape[0], "The number of labels should match the number of samples." + assert len(set(results)) <= X.shape[0], "The number of unique labels should not exceed the number of samples." + + +def test_radius_clustering_exact(): + """ + Test the exact method of the RadiusClustering class. + """ + clusterer = RadiusClustering(manner="exact", radius=1.43) + + assert clusterer.manner == "exact", "The manner should be 'exact'." + assert clusterer.radius == 1.43, "The radius should be 1.43." + assert clusterer.random_state is None, "The random state should be None by default." + assert clusterer._estimator_type == "clusterer", "The estimator type should be 'clusterer'." + assert clusterer._check_symmetric(X) is False, "The input should not be a symmetric distance matrix." + + clusterer.fit(X) + + assert clusterer.X_checked_ is not None, "X_checked_ should not be None after fitting." + assert clusterer.dist_mat_ is not None, "dist_mat_ should not be None after fitting." + assert clusterer.nb_edges_ > 0, "There should be edges in the graph." + assert clusterer.labels_ is not None, "Labels should not be None after fitting." 
+ assert clusterer.centers_ is not None, "Centers should not be None after fitting." + assert clusterer.effective_radius_ > 0, "Effective radius should be greater than 0." + assert clusterer.mds_exec_time_ >= 0, "MDS execution time should be non-negative." + assert clusterer.edges_ is not None, "Edges should not be None after fitting." + assert clusterer.random_state is None, "Random state should remain None." + + results = clusterer.labels_ + assert len(results) == X.shape[0], "The number of labels should match the number of samples." + assert len(set(results)) <= X.shape[0], "The number of unique labels should not exceed the number of samples." + +def test_radius_clustering_fit_predict(): + """ + Test the fit_predict method of the RadiusClustering class. + """ + clusterer = RadiusClustering(manner="approx", radius=1.43) + + assert clusterer.manner == "approx", "The manner should be 'approx'." + assert clusterer.radius == 1.43, "The radius should be 1.43." + assert clusterer.random_state is None, "The random state should be None by default." + assert clusterer._estimator_type == "clusterer", "The estimator type should be 'clusterer'." + + labels = clusterer.fit_predict(X) + + assert labels is not None, "Labels should not be None after fit_predict." + assert len(labels) == X.shape[0], "The number of labels should match the number of samples." + assert len(set(labels)) <= X.shape[0], "The number of unique labels should not exceed the number of samples." + +def test_radius_clustering_fit_predict_exact(): + """ + Test the fit_predict method of the RadiusClustering class with exact method. + """ + clusterer = RadiusClustering(manner="exact", radius=1.43) + + assert clusterer.manner == "exact", "The manner should be 'exact'." + assert clusterer.radius == 1.43, "The radius should be 1.43." + assert clusterer.random_state is None, "The random state should be None by default." + assert clusterer._estimator_type == "clusterer", "The estimator type should be 'clusterer'." 
+ + labels = clusterer.fit_predict(X) + + assert labels is not None, "Labels should not be None after fit_predict." + assert len(labels) == X.shape[0], "The number of labels should match the number of samples." + assert len(set(labels)) <= X.shape[0], "The number of unique labels should not exceed the number of samples." + +def test_radius_clustering_random_state(): + """ + Test the random state functionality of the RadiusClustering class. + """ + clusterer = RadiusClustering(manner="approx", radius=1.43, random_state=123) + + assert clusterer.random_state == 123, "The random state should be set to 123." + + # Fit the model + clusterer.fit(X) + + # Check that the random state is preserved + assert clusterer.random_state == 123, "The random state should remain 123 after fitting." + + # Check that the results are consistent with the random state + labels1 = clusterer.labels_ + + # Re-initialize and fit again with the same random state + clusterer2 = RadiusClustering(manner="approx", radius=1.43, random_state=123) + clusterer2.fit(X) + + labels2 = clusterer2.labels_ + + assert (labels1 == labels2).all(), "Labels should be consistent across runs with the same random state." + +def test_deterministic_behavior(): + """ + Test the deterministic behavior of the RadiusClustering class with a fixed random state. + """ + clusterer1 = RadiusClustering(manner="approx", radius=1.43, random_state=42) + clusterer2 = RadiusClustering(manner="approx", radius=1.43, random_state=42) + + labels1 = clusterer1.fit_predict(X) + labels2 = clusterer2.fit_predict(X) + + assert (labels1 == labels2).all(), "Labels should be the same for two instances with the same random state." + + clusterer1 = RadiusClustering(manner="exact", radius=1.43) + clusterer2 = RadiusClustering(manner="exact", radius=1.43) + labels1 = clusterer1.fit_predict(X) + labels2 = clusterer2.fit_predict(X) + assert (labels1 == labels2).all(), "Labels should be the same for two exact instances." 
diff --git a/tests/test_rad.py b/tests/test_rad.py deleted file mode 100644 index 80f4834..0000000 --- a/tests/test_rad.py +++ /dev/null @@ -1,43 +0,0 @@ -from logging import getLogger - -logger = getLogger(__name__) -logger.setLevel("INFO") - -def test_imports(): - import radius_clustering as rad - - -def test_from_import(): - from radius_clustering import RadiusClustering - -def test_check_estimator_api_consistency(): - from radius_clustering import RadiusClustering - from sklearn.utils.estimator_checks import check_estimator - - # Check the API consistency of the RadiusClustering estimator - stats = check_estimator(RadiusClustering()) - -def test_radius_clustering_approx(): - from radius_clustering import RadiusClustering - from sklearn import datasets - - # Load the Iris dataset - iris = datasets.fetch_openml(name="iris", version=1, parser="auto") - X = iris["data"] # Use dictionary-style access instead of attribute access - - graph_mds_api_consistent = RadiusClustering(manner="approx", threshold=1.43) - - result_api_consistent = graph_mds_api_consistent.fit_predict(X) - - -def test_radius_clustering_exact(): - from radius_clustering import RadiusClustering - from sklearn import datasets - - # Load the Iris dataset - iris = datasets.fetch_openml(name="iris", version=1, parser="auto") - X = iris["data"] # Use dictionary-style access instead of attribute access - - graph_mds_api_consistent = RadiusClustering(manner="exact", threshold=1.43) - - result_api_consistent = graph_mds_api_consistent.fit_predict(X) diff --git a/tests/test_regression.py b/tests/test_regression.py new file mode 100644 index 0000000..44389b6 --- /dev/null +++ b/tests/test_regression.py @@ -0,0 +1,61 @@ +import pytest +import numpy as np +from radius_clustering import RadiusClustering +from sklearn.datasets import load_iris + +@pytest.fixture +def iris_data(): + """Fixture to load the Iris dataset.""" + data = load_iris() + return data.data + +@pytest.fixture +def approx_results(): + 
"""Fixture to store results for approximate clustering.""" + results = { + 'labels': [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,2,2,2,2,1,2,2,2,2, + 2,2,1,1,2,2,2,2,1,2,1,2,1,2,2,1,1,2,2,2,2,2,1,2,2,2,2,1,2,2,2,1,2,2,2,1,2, + 2,1], + "centers": [0,96,125], + "time" : 0.0280, + "effective_radius": 1.4282856857085722 + } + return results + +@pytest.fixture +def exact_results(): + """Fixture to store results for exact clustering.""" + results = { + 'labels':[ + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,2,2,2,2,1,2,2,2,2, + 2,2,1,1,2,2,2,2,1,2,1,2,1,2,2,1,1,2,2,2,2,2,1,2,2,2,2,1,2,2,2,1,2,2,2,1,2, + 2,1 + ], + "centers": [0, 96, 102], + "time": 0.0004, + "effective_radius": 1.4282856857085722 + } + return results + +def assert_results(results, expected): + """Helper function to assert clustering results.""" + assert len(results.labels_) == len(expected['labels']), "Labels length mismatch" + assert set(results.labels_) == set(expected['labels']), "Labels do not match expected" + assert results.centers_ == expected['centers'], "Centers do not match expected" + assert abs(results.mds_exec_time_ - expected['time']) < 0.1, "Execution time mismatch by more than 0.1 seconds" + assert abs(results.effective_radius_ - expected['effective_radius']) < 0.01, "Effective radius mismatch" + assert np.sum(results.labels_ - expected['labels']) == 0, "Labels do not match expected" + +def test_exact(iris_data, exact_results): + """Test the RadiusClustering with exact""" + clustering = RadiusClustering(radius=1.43, manner='exact').fit(iris_data) + assert_results(clustering, exact_results) + +def test_approx(iris_data, approx_results): + """Test the 
RadiusClustering with approx.""" + clustering = RadiusClustering(radius=1.43, manner='approx').fit(iris_data) + assert_results(clustering, approx_results) diff --git a/tests/test_structural.py b/tests/test_structural.py new file mode 100644 index 0000000..1401eac --- /dev/null +++ b/tests/test_structural.py @@ -0,0 +1,18 @@ +from logging import getLogger + +logger = getLogger(__name__) +logger.setLevel("INFO") + +def test_import(): + import radius_clustering as rad + + +def test_from_import(): + from radius_clustering import RadiusClustering + +def test_check_estimator_api_consistency(): + from radius_clustering import RadiusClustering + from sklearn.utils.estimator_checks import check_estimator + + # Check the API consistency of the RadiusClustering estimator + check_estimator(RadiusClustering()) diff --git a/tests/test_unit.py b/tests/test_unit.py new file mode 100644 index 0000000..52e874f --- /dev/null +++ b/tests/test_unit.py @@ -0,0 +1,93 @@ +from radius_clustering import RadiusClustering +import pytest + +def test_symmetric(): + """ + Test that the RadiusClustering class can handle symmetric distance matrices. + """ + import numpy as np + + # Check 1D array input + + X = np.array([0,1]) + with pytest.raises(ValueError): + RadiusClustering(manner="exact", radius=1.5)._check_symmetric(X) + + # Check a symmetric distance matrix + X = np.array([[0, 1, 2], + [1, 0, 1], + [2, 1, 0]]) + + clustering = RadiusClustering(manner="exact", radius=1.5) + assert clustering._check_symmetric(X), "The matrix should be symmetric." + + # Check a non-symmetric distance matrix + X_assym = np.array([[0, 1, 2], + [1, 0, 1], + [2, 2, 3]]) # This is not symmetric + assert not clustering._check_symmetric(X_assym), "The matrix should not be symmetric." + + # check a non-square matrix + X_non_square = np.array([[0, 1], + [1, 0], + [2, 1]]) # This is not square + + assert not clustering._check_symmetric(X_non_square), "The matrix should not be symmetric." 
+ + +def test_fit(): + """ + Test that the RadiusClustering class can fit to a distance matrix and to a feature matrix. + This test checks both the exact and approximate methods of clustering. + """ + import numpy as np + + # Create a symmetric distance matrix + X = np.array([[0, 1, 2], + [1, 0, 1], + [2, 1, 0]]) + + clustering = RadiusClustering(manner="exact", radius=1.5) + clustering.fit(X) + + # Check that the labels are assigned correctly + assert len(clustering.labels_) == X.shape[0], "Labels length should match number of samples." + assert clustering.nb_edges_ > 0, "There should be edges in the graph." + assert np.array_equal(clustering.X_checked_, clustering.dist_mat_), "X_checked_ should be equal to dist_mat_ because X is a distance matrix." + + # Create a feature matrix + X_features = np.array([[0, 1], + [1, 0], + [2, 1]]) + + clustering = RadiusClustering(manner="approx", radius=1.5) + clustering.fit(X_features) + + # Check that the labels are assigned correctly + assert len(clustering.labels_) == X_features.shape[0], "Labels length should match number of samples." + assert clustering.nb_edges_ > 0, "There should be edges in the graph." + assert clustering._check_symmetric(clustering.dist_mat_), "Distance matrix should be symmetric after computed from features." + +def test_radius_clustering_invalid_manner(): + """ + Test that an error is raised when an invalid manner is provided. + """ + with pytest.raises(ValueError, match="Invalid manner. Choose either 'exact' or 'approx'."): + RadiusClustering(manner="invalid", radius=1.43).fit([[0, 1], [1, 0], [2, 1]]) + + with pytest.raises(ValueError, match="Invalid manner. Choose either 'exact' or 'approx'."): + RadiusClustering(manner="", radius=1.43).fit([[0, 1], [1, 0], [2, 1]]) + + +def test_radius_clustering_invalid_radius(): + """ + Test that an error is raised when an invalid radius is provided. 
+ """ + with pytest.raises(ValueError, match="Radius must be a positive float."): + RadiusClustering(manner="exact", radius=-1.0).fit([[0, 1], [1, 0], [2, 1]]) + + with pytest.raises(ValueError, match="Radius must be a positive float."): + RadiusClustering(manner="approx", radius=0.0).fit([[0, 1], [1, 0], [2, 1]]) + + with pytest.raises(ValueError, match="Radius must be a positive float."): + RadiusClustering(manner="exact", radius="invalid").fit([[0, 1], [1, 0], [2, 1]]) \ No newline at end of file From 37c44609f12711e94f6899d3b8135c43e72389ba Mon Sep 17 00:00:00 2001 From: Quentin Date: Wed, 18 Jun 2025 08:01:15 +0200 Subject: [PATCH 3/6] linting + changelog --- .coverage | Bin 53248 -> 53248 bytes CHANGELOG.md | 19 ++++ pyproject.toml | 6 +- setup.py | 10 +- src/radius_clustering/radius_clustering.py | 105 ++++++++++++++------- 5 files changed, 102 insertions(+), 38 deletions(-) create mode 100644 CHANGELOG.md diff --git a/.coverage b/.coverage index f87d0eb6bd52cf48126498f1ce7bcd7807f26716..f32d8f4b5a185863981f35324f83b3c048d4fa13 100644 GIT binary patch delta 2966 zcmY+_X>3$g6bJD8rqsFb+;{JrwiH@f3IZydvY4_ely=J0PFvefOGRXpol=pd68-Rq zOO=MOO;Nz6)`r9}M#qRTC^kh%NnH|RGzvkD34VYm;s+Hd(&u^JMeL{F|J*y1yw~=B zc6Mg$?96y?P-jLnTug>L=U1E&pNRKGN_2=e(JFR|X3;2ChlY zl6}BZtex#+FR&e~ku7Hl7GrbSR5p&~GtMmd6Mlt@$Fd?+NymA#j@|VEF%E!q^P^LbxgdB&Mnja-6?1h2xfxA+6 zsbWV8qUuH`>4FmF*GVT7D|e9BVS#cx>9C6eP|+xQ5~QcjXKn5le#Bw>c~KC&MQ11HbEGtApV_Q7;@vYYII zY0A6EZkVdvit{OE-fG_;7@wde1^20O3)w{{qb(~`pQe+f67}}PK%r2IwAd3Q8^}>Q zUY^&csdukrO)b4gsxD3MmP*voaq2Wd$4aGZ=oqQ$26~rNd7O@xs_Lerq$;aufmB5$ z9Vr#Bpd+NB<#f1|R~DtiBx7atPN~uu%~uyoX`WO`lnzyj(;-sbB{Wy6xS9@@Dk`Qq zQh7ejmdY=pS!$L~BT|?0sEeA6FDy5Y3OUTlrCcgIhdNS`Y|5nENU|$k#4`PGMy8E# zdQfKaxAYLN8JdWD&AsAYbkD`t$UW=!x*xk~_o#c=-Qzy* z?r=A`E8Rjj?iRTd-BH4KLHr^5#AWfL_*(22-PqA@igwW=s>EhdB36shVzC$^X7XFY z=D+iAv7JBTY5oc8qwlhfy%aNcm* zoR^$UPOY=tDP_Mnvz$pzwo~9_vg_}s}@m9QCX0vo|HSOorp%WxJ> z!&}e}d*L}~f*s_4KgS9pjkags5i~hAcyl0Xp|(3Htnz=$e47c_=37m`I^SXf_W4sf 
z1cknt1z&)_AZ+v}^?`+z-lRj&>6?OpRfgBF)f@Fu(CQmaz*^sMTNqex0v7u^6R_FW znt;{5#suv4)h1xMuevStKjC=lQG*!yKV5G?+fc?n&qg~lDJQgg|} z@{gK??O$R7)_<`H*#8U67aUF%nFCw^^G(1FFwX>B0S}pgJK#YRa0$#+VOEIX8kl2_ z+Q^aFCg38NWdd%3nI_;WNG7|-220=pGsG=0-2_|%(@elUFx7-zq;-mUl$O-}<^Wg0 zeSm+D*1x-8vSy-wQg9vrO@eJOQ5)C61W(_oO^?^Csl8XTE`5(?qV8_Zn#4HGbj?`J z>V`3z}db`IB!WILL;Ee`d> UoJ@a^eMhh@a9texwP$eTA9nlG#Q*>R delta 2678 zcmYk6Sxi({7=X{r0CVR*_nZre3@#|{qT@;fd@md(Qr1zp{rc z&MbC?wX(D92s^~;*fzG7Mc6VnmrZ6Pqs?vElg8+&dJ_GI_S0Vag#Jw9)S_2tD?LY# z&`;@px|LSZvTG^H2f*~#C3*rL&1d`fBWNC<4f_zhUa!Y*dtk5NB#$5V2u|=M!EV7m zPeP#9VUL&CeXz?N+a8Y(cDhX7*#JAl9LPJ^F6M*8h8k1I8#@QK2_7JCV5{N**dq8f z8GvfRugGiIEchjP1)BuFATME~;C}J~Hn?o(kbYP%<~?K()(JKr+aahD@`yZ!wSv3J zBdApKZdl{89m4r)F~3K8V3puby8>1Ujw4qHzKvWixQpC|GR1KyP4t+~FHK^k6H3IQ zlf)n*_$JPa1>YbyA*{FyiUeOLH(-U!b_KZ(g<^h{+=Bwa{rIco3+}LMAy05SF*^de zj^Onjw*n!7M=e)WY{6o2;v#8*MS?Goi?C2|Gr0f@ z1b<1I10Okz?>y(RqQCt(&OAfT!aT7*P0qkv!KcV+m?QWkIRpNCn3Pmq%^Q*aYG z0Xc$?k>fB!aT81z+(?eXG{wgtTi(Lwq%klR7ap(IB%i|+ai#&sljXQUo8+L!TW@I- z1)16emr7&Wc$Xrj+Blc2`aqULUT-;R)yBHR;+QtZrEsy9>AWAGSQ67lyTeF{HcG4` z+DMmTVQqv2%`2x*Ix zUF-{LhFE84+@-Xj##~BG({z_oQZ?$5krKV!kxTuy-hyTGX-owxH2mC0%!awGp@f|$EtN2nrfzM{?JiyZ#uz%SLEMqTwz`9sFyTq!S zSTj4!!t5ZcVU=tNo29?gU+Pcv+xlg_L%*Oe(wp?ddX0Wiuhfh59DNvli@|(G|D^Zz zr?iv)K);_tq{k$K>y=?h*D1l0Rw==hu2u4gbXQ7oV!B2O z!J4jCf;nBK1bezt2?n)7i0LJm)N-}zBJnaM*wj)b7}XLbSk;IU%xduv=?W{suog*i z%DO@d!L$}SVmcR2Fs=n^g>}tW(oQ<^)EytUaupx7Lh_Ik*yU0P7Iv9B)nYAG+-xmT zhcL2>-%F13dd%!1wZhIWRDz*hpdN6}ie9=i(FyGb>HwB@o)S#$TqW4rIYZ?9Y$aIR zSxPXsGnHU(bB0LM3?*3H>8_agfg8!OX=;Vd&2~N`{^K~&ovIdC-6{Vs8YY8t=zVu5 zDKsZaER9W&7%3eu(W)OOAA#}B!m$%tkGD8BRvrl#kC9js%aj-?87(m!86`0m9x1V? ze1ycpc!tD+!l1=T{NWPwb{3^c4&@D#7_?I*W`t5C_63tA1~UwaX+bVAHH}G3 bN!2A{kz4MDb-$euNOFpdNxt^%Zd&qxF}<)* diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..1bba442 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,19 @@ +# Changelog + +## [1.2.3] - 2025-06-18 + +### Added + +- Full test coverage for the entire codebase. 
+- Badge for test coverage in the README. +- Added `radius` parameter to the `RadiusClustering` class, allowing users to specify the radius for clustering. + +### Deprecated + +- Deprecated the `threshold` parameter in the `RadiusClustering` class. Use `radius` instead. + +### Changed + +- Updated all the attributes in the `RadiusClustering` class to fit `scikit-learn` standards and conventions. +- Updated the test cases to reflect the changes in the `RadiusClustering` class. +- Updated README and documentation to reflect the new `radius` parameter and the deprecation of `threshold`. diff --git a/pyproject.toml b/pyproject.toml index 4be7195..0ee2d8d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -120,14 +120,14 @@ exclude = [ # Same as Black. line-length = 88 -indent-width = 4 +target-version = "py310" [tool.ruff.lint] # Enable Pyflakes (`F`) and a subset of the pycodestyle (`E`) codes by default. # Unlike Flake8, Ruff doesn't enable pycodestyle warnings (`W`) or # McCabe complexity (`C901`) by default. -select = ["E", "F"] -ignore = [] +select = ["E", "F", "W", "I"] +ignore = ["E203", "E731", "E741"] # Allow fix for all enabled rules (when `--fix`) is provided.
fixable = ["ALL"] diff --git a/setup.py b/setup.py index bcab66c..909e82a 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,8 @@ import platform -from setuptools import setup, Extension -from Cython.Build import cythonize + import numpy as np +from Cython.Build import cythonize +from setuptools import Extension, setup SYSTEM = platform.system() CPU = platform.processor() @@ -21,7 +22,10 @@ extensions = [ Extension( "radius_clustering.utils._emos", - ["src/radius_clustering/utils/emos.pyx", "src/radius_clustering/utils/main-emos.c"], + [ + "src/radius_clustering/utils/emos.pyx", + "src/radius_clustering/utils/main-emos.c" + ], include_dirs=[np.get_include(), "src/radius_clustering/utils"], extra_compile_args=C_COMPILE_ARGS, ), diff --git a/src/radius_clustering/radius_clustering.py b/src/radius_clustering/radius_clustering.py index 8753e92..3d528d2 100644 --- a/src/radius_clustering/radius_clustering.py +++ b/src/radius_clustering/radius_clustering.py @@ -9,10 +9,12 @@ """ import os +import warnings + import numpy as np -from sklearn.metrics import pairwise_distances from sklearn.base import BaseEstimator, ClusterMixin -from sklearn.utils.validation import check_array, validate_data, check_random_state +from sklearn.metrics import pairwise_distances +from sklearn.utils.validation import check_random_state, validate_data from radius_clustering.utils._emos import py_emos_main from radius_clustering.utils._mds_approx import solve_mds @@ -21,7 +23,7 @@ class RadiusClustering(ClusterMixin, BaseEstimator): - """ + r""" Radius Clustering algorithm. This class implements clustering based on the Minimum Dominating Set (MDS) problem. @@ -46,29 +48,52 @@ class RadiusClustering(ClusterMixin, BaseEstimator): The maximum distance between any point and its assigned cluster center. random_state\_ : int | None The random state used for reproducibility. If None, no random state is set. - + .. note:: The `random_state_` attribute is not used when the `manner` is set to "exact". - + .. 
versionadded:: 1.3.0 - The *random_state* parameter was added to allow reproducibility in the approximate method. + The *random_state* parameter was added to allow reproducibility in + the approximate method. .. versionchanged:: 1.3.0 - All publicly accessible attributes are now suffixed with an underscore (e.g., `centers_`, `labels_`). + All publicly accessible attributes are now suffixed with an underscore + (e.g., `centers_`, `labels_`). This is particularly useful for compatibility with scikit-learn's API. - - .. versionchanged:: 1.3.0 - The `threshold` parameter was renamed to `radius` to better reflect its purpose. + + .. versionadded:: 1.3.0 + The `radius` parameter replaces the `threshold` parameter for setting + the dissimilarity threshold for better clarity and consistency. + + .. deprecated:: 1.3.0 + The `threshold` parameter is deprecated. Use `radius` instead. + Will be removed in a future version. """ _estimator_type = "clusterer" - def __init__(self, manner: str ="approx", radius: float =0.5, random_state: int | None = None) -> None: + def __init__( + self, + manner: str = "approx", + radius: float = 0.5, + threshold=None, + random_state: int | None = None, + ) -> None: + if threshold is not None: + warnings.warn( + "The 'threshold' parameter is deprecated and" + " will be removed in a future version." + "Please use 'radius' instead.", + DeprecationWarning, + stacklevel=2, + ) + radius = threshold + self.threshold = threshold # For backward compatibility self.manner = manner self.radius = radius self.random_state = random_state - def _check_symmetric(self, a: np.ndarray, tol: float =1e-8) -> bool: + def _check_symmetric(self, a: np.ndarray, tol: float = 1e-8) -> bool: if a.ndim != 2: raise ValueError("Input must be a 2D array.") if a.shape[0] != a.shape[1]: @@ -80,21 +105,26 @@ def fit(self, X: np.ndarray, y: None = None) -> "RadiusClustering": Fit the MDS clustering model to the input data. 
This method computes the distance matrix if the input is a feature matrix, - or uses the provided distance matrix directly if the input is already a distance matrix. + or uses the provided distance matrix directly if the input is already + a distance matrix. .. note:: If the input is a distance matrix, it should be symmetric and square. - If the input is a feature matrix, the distance matrix will be computed using Euclidean distance. - + If the input is a feature matrix, the distance matrix + will be computed using Euclidean distance. + .. tip:: - Next version will support providing different metrics or even custom callables to compute the distance matrix. + Next version will support providing different metrics or + even custom callables to compute the distance matrix. Parameters: ----------- X : array-like, shape (n_samples, n_features) - The input data to cluster. X should be a 2D array-like structure. It can either be : + The input data to cluster. X should be a 2D array-like structure. + It can either be : - A distance matrix (symmetric, square) with shape (n_samples, n_samples). - - A feature matrix with shape (n_samples, n_features) where the distance matrix will be computed. + - A feature matrix with shape (n_samples, n_features) + where the distance matrix will be computed. y : Ignored Not used, present here for API consistency by convention. 
@@ -128,7 +158,7 @@ def fit(self, X: np.ndarray, y: None = None) -> "RadiusClustering": dist_mat = pairwise_distances(self.X_checked_, metric="euclidean") else: dist_mat = self.X_checked_ - + if not isinstance(self.radius, (float, int)): raise ValueError("Radius must be a positive float.") if self.radius <= 0: @@ -141,7 +171,9 @@ def fit(self, X: np.ndarray, y: None = None) -> "RadiusClustering": self.effective_radius_ = 0 self.mds_exec_time_ = 0 return self - self.edges_ = np.argwhere(adj_mask).astype(np.uint32) # Edges in the adjacency matrix + self.edges_ = np.argwhere(adj_mask).astype( + np.uint32 + ) # Edges in the adjacency matrix # uint32 is used to use less memory. Max number of features is 2^32-1 self.dist_mat_ = dist_mat @@ -160,9 +192,11 @@ def fit_predict(self, X: np.ndarray, y: None = None) -> np.ndarray: Parameters: ----------- X : array-like, shape (n_samples, n_features) - The input data to cluster. X should be a 2D array-like structure. It can either be : + The input data to cluster. X should be a 2D array-like structure. + It can either be : - A distance matrix (symmetric, square) with shape (n_samples, n_samples). - - A feature matrix with shape (n_samples, n_features) where the distance matrix will be computed. + - A feature matrix with shape (n_samples, n_features) where + the distance matrix will be computed. y : Ignored Not used, present here for API consistency by convention. @@ -181,9 +215,7 @@ def _clustering(self): n = self.X_checked_.shape[0] if self.manner != "exact" and self.manner != "approx": print(f"Invalid manner: {self.manner}. Defaulting to 'approx'.") - raise ValueError( - "Invalid manner. Choose either 'exact' or 'approx'." - ) + raise ValueError("Invalid manner. Choose either 'exact' or 'approx'.") if self.manner == "exact": self._clustering_exact(n) else: @@ -210,20 +242,27 @@ def _clustering_exact(self, n: int) -> None: def _clustering_approx(self, n: int) -> None: """ - Perform approximate MDS clustering. 
This method uses a pretty trick to set the seed for the random state of the C++ code of the MDS solver. + Perform approximate MDS clustering. + This method uses a pretty trick to set the seed for + the random state of the C++ code of the MDS solver. .. tip:: - The random state is used to ensure reproducibility of the results when using the approximate method. + The random state is used to ensure reproducibility of the results + when using the approximate method. If `random_state` is None, a default value of 42 is used. - + .. important:: :collapsible: closed The trick to set the random state is : - 1. Use the `check_random_state` function to get a `RandomState`singleton instance, set up with the provided `random_state`. - 2. Use the `randint` method of the `RandomState` instance to generate a random integer. + 1. Use the `check_random_state` function to get a `RandomState` singleton + instance, set up with the provided `random_state`. + 2. Use the `randint` method of the `RandomState` instance to generate a + random integer. 3. Use this random integer as the seed for the C++ code of the MDS solver. - This ensures that the seed passed to the C++ code is always an integer, which is required by the MDS solver, and allows for reproducibility of the results. + This ensures that the seed passed to the C++ code is always an integer, + which is required by the MDS solver, and allows for + reproducibility of the results.
Parameters: ----------- @@ -239,7 +278,9 @@ def _clustering_approx(self, n: int) -> None: self.random_state = 42 self.random_state_ = check_random_state(self.random_state) seed = self.random_state_.randint(np.iinfo(np.int32).max) - result = solve_mds(n, self.edges_.flatten().astype(np.int32), self.nb_edges_, seed) + result = solve_mds( + n, self.edges_.flatten().astype(np.int32), self.nb_edges_, seed + ) self.centers_ = sorted([x for x in result["solution_set"]]) self.mds_exec_time_ = result["Time"] From 58087d19f4f7b6377511125dae1849e0f6375f0a Mon Sep 17 00:00:00 2001 From: Quentin Date: Wed, 18 Jun 2025 08:06:59 +0200 Subject: [PATCH 4/6] updating docs --- CHANGELOG.md | 2 +- docs/source/conf.py | 2 +- docs/source/usage.rst | 4 +++- examples/plot_iris_example.py | 4 ++-- src/radius_clustering/__init__.py | 2 +- 5 files changed, 8 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1bba442..4a0e668 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,6 @@ # Changelog -## [1.2.3] - 2025-06-18 +## [1.3.0] - 2025-06-18 ### Added diff --git a/docs/source/conf.py b/docs/source/conf.py index dc3bbb2..b23d6b7 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -16,7 +16,7 @@ project = "Radius Clustering" copyright = "2024, Haenn Quentin, Chardin Brice, Baron Mickaël" author = "Haenn Quentin, Chardin Brice, Baron Mickaël" -release = "1.0" +release = "1.3.0" # -- General configuration --------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration diff --git a/docs/source/usage.rst b/docs/source/usage.rst index 6da5cf6..1826840 100644 --- a/docs/source/usage.rst +++ b/docs/source/usage.rst @@ -12,7 +12,9 @@ Here's a basic example of how to use Radius Clustering: X = np.random.rand(100, 2) # Create an instance of MdsClustering - rad = RadiusClustering(manner="approx", threshold=0.5) + rad = RadiusClustering(manner="approx", radius=0.5) + # Attention: the 
'threshold' parameter is deprecated as of version 1.3.0 + # and will be removed in a future version. Use 'radius' instead. # Fit the model to the data rad.fit(X) diff --git a/examples/plot_iris_example.py b/examples/plot_iris_example.py index 7204348..c31d9a5 100644 --- a/examples/plot_iris_example.py +++ b/examples/plot_iris_example.py @@ -82,7 +82,7 @@ # We create an instance of the `RadiusClustering` class and fit it to the Iris dataset. import time -rad = RadiusClustering(manner="exact", threshold=1.43) +rad = RadiusClustering(manner="exact", radius=1.43) t0 = time.time() rad.fit(X) t_rad = time.time() - t0 @@ -242,7 +242,7 @@ def get_order_labels(kmeans, rad, data): # Compute clustering with MDS -rad = RadiusClustering(manner="exact", threshold=232.09) +rad = RadiusClustering(manner="exact", radius=232.09) t0 = time.time() rad.fit(X) t_rad = time.time() - t0 diff --git a/src/radius_clustering/__init__.py b/src/radius_clustering/__init__.py index 6d528e4..9609e48 100644 --- a/src/radius_clustering/__init__.py +++ b/src/radius_clustering/__init__.py @@ -2,4 +2,4 @@ from .radius_clustering import RadiusClustering __all__ = ["RadiusClustering"] -__version__ = "1.2.3" +__version__ = "1.3.0" From 2c86d0d4e1677bd4c60ff98f78ce2df929a9ecaf Mon Sep 17 00:00:00 2001 From: Quentin Date: Wed, 18 Jun 2025 08:44:07 +0200 Subject: [PATCH 5/6] fixing test campaigns --- tests/test_regression.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/tests/test_regression.py b/tests/test_regression.py index 44389b6..4f6c5dc 100644 --- a/tests/test_regression.py +++ b/tests/test_regression.py @@ -41,19 +41,22 @@ def exact_results(): } return results -def assert_results(results, expected): +def assert_results_exact(results, expected): """Helper function to assert clustering results.""" - assert len(results.labels_) == len(expected['labels']), "Labels length mismatch" + assert_results(results, expected) assert set(results.labels_) == set(expected['labels']),
"Labels do not match expected" assert results.centers_ == expected['centers'], "Centers do not match expected" - assert abs(results.mds_exec_time_ - expected['time']) < 0.1, "Execution time mismatch by more than 0.1 seconds" - assert abs(results.effective_radius_ - expected['effective_radius']) < 0.01, "Effective radius mismatch" assert np.sum(results.labels_ - expected['labels']) == 0, "Labels do not match expected" +def assert_results(results, expected): + assert len(results.labels_) == len(expected['labels']), "Labels length mismatch" + assert abs(results.mds_exec_time_ - expected['time']) < 0.1, "Execution time mismatch by more than 10%" + assert abs(results.effective_radius_ - expected['effective_radius'])/results.effective_radius_ < 0.1, "Effective radius mismatch" + def test_exact(iris_data, exact_results): """Test the RadiusClustering with exact""" clustering = RadiusClustering(radius=1.43, manner='exact').fit(iris_data) - assert_results(clustering, exact_results) + assert_results_exact(clustering, exact_results) def test_approx(iris_data, approx_results): """Test the RadiusClustering with approx.""" From 134888cca1c5d98f71d29578227a87c0e12bd1be Mon Sep 17 00:00:00 2001 From: Quentin Date: Wed, 18 Jun 2025 09:19:34 +0200 Subject: [PATCH 6/6] fix type hint --- src/radius_clustering/radius_clustering.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/radius_clustering/radius_clustering.py b/src/radius_clustering/radius_clustering.py index 3d528d2..33d42c1 100644 --- a/src/radius_clustering/radius_clustering.py +++ b/src/radius_clustering/radius_clustering.py @@ -8,6 +8,8 @@ This module serves as the main interface for the Radius clustering library. """ +from __future__ import annotations + import os import warnings