22"""
33Robust Single Linkage: Density based single linkage clustering.
44"""
5- # Author: Leland McInnes <[email protected] > 6- #
7- # License: BSD 3 clause
8-
95import numpy as np
106
117from sklearn .base import BaseEstimator , ClusterMixin
2521
2622from warnings import warn
2723
24+ # Author: Leland McInnes <[email protected] > 25+ #
26+ # License: BSD 3 clause
27+
2828FAST_METRICS = KDTree .valid_metrics + BallTree .valid_metrics
2929
3030
def _rsl_generic(X, k=5, alpha=1.4142135623730951, metric='euclidean',
                 **kwargs):
    """Build a single linkage tree from a full pairwise distance matrix.

    Parameters
    ----------
    X : array-like
        Input handed straight to ``pairwise_distances``.

    k : int, optional (default=5)
        Passed as the second argument to ``mutual_reachability``.

    alpha : float, optional (default=np.sqrt(2))
        Accepted for signature parity with the other ``_rsl_*`` helpers.
        NOTE(review): this function never reads ``alpha`` -- confirm
        whether ``mutual_reachability`` is meant to receive it.

    metric : string, or callable, optional (default='euclidean')
        Forwarded to ``pairwise_distances``.

    **kwargs
        Extra keyword arguments forwarded to ``pairwise_distances``.

    Returns
    -------
    single_linkage_tree : SingleLinkageTree
        ``SingleLinkageTree`` wrapping the output of ``label`` applied to
        the sorted spanning-tree edges.
    """
    pairwise = pairwise_distances(X, metric=metric, **kwargs)
    reachability = mutual_reachability(pairwise, k)

    # Reorder the spanning-tree rows by ascending value of column 2
    # (presumably the edge weight -- verify against mst_linkage_core).
    edges = mst_linkage_core(reachability)
    row_order = np.argsort(edges.T[2])
    edges = edges[row_order, :]

    return SingleLinkageTree(label(edges))
4345
4446
45- def _rsl_prims_kdtree (X , k = 5 , alpha = 1.4142135623730951 , metric = 'euclidean' , ** kwargs ):
47+ def _rsl_prims_kdtree (X , k = 5 , alpha = 1.4142135623730951 , metric = 'euclidean' ,
48+ ** kwargs ):
4649
4750 # The Cython routines used require contiguous arrays
4851 if not X .flags ['C_CONTIGUOUS' ]:
@@ -56,15 +59,17 @@ def _rsl_prims_kdtree(X, k=5, alpha=1.4142135623730951, metric='euclidean', **kw
5659 dist_metric = DistanceMetric .get_metric (metric , ** kwargs )
5760
5861 core_distances = tree .query (X , k = k )[0 ][:, - 1 ].copy (order = 'C' )
59- min_spanning_tree = mst_linkage_core_vector (X , core_distances , dist_metric , alpha )
62+ min_spanning_tree = mst_linkage_core_vector (X , core_distances , dist_metric ,
63+ alpha )
6064
6165 single_linkage_tree = label (min_spanning_tree )
6266 single_linkage_tree = SingleLinkageTree (single_linkage_tree )
6367
6468 return single_linkage_tree
6569
6670
67- def _rsl_prims_balltree (X , k = 5 , alpha = 1.4142135623730951 , metric = 'euclidean' , ** kwargs ):
71+ def _rsl_prims_balltree (X , k = 5 , alpha = 1.4142135623730951 , metric = 'euclidean' ,
72+ ** kwargs ):
6873
6974 # The Cython routines used require contiguous arrays
7075 if not X .flags ['C_CONTIGUOUS' ]:
@@ -78,7 +83,8 @@ def _rsl_prims_balltree(X, k=5, alpha=1.4142135623730951, metric='euclidean', **
7883 dist_metric = DistanceMetric .get_metric (metric , ** kwargs )
7984
8085 core_distances = tree .query (X , k = k )[0 ][:, - 1 ].copy (order = 'C' )
81- min_spanning_tree = mst_linkage_core_vector (X , core_distances , dist_metric , alpha )
86+ min_spanning_tree = mst_linkage_core_vector (X , core_distances , dist_metric ,
87+ alpha )
8288
8389 single_linkage_tree = label (min_spanning_tree )
8490 single_linkage_tree = SingleLinkageTree (single_linkage_tree )
@@ -122,8 +128,9 @@ def _rsl_boruvka_balltree(X, k=5, alpha=1.0,
122128
123129def robust_single_linkage (X , cut , k = 5 , alpha = 1.4142135623730951 ,
124130 gamma = 5 , metric = 'euclidean' , algorithm = 'best' ,
125- memory = Memory (cachedir = None , verbose = 0 ), leaf_size = 40 , ** kwargs ):
126- """Perform robust single linkage clustering from a vector array
131+ memory = Memory (cachedir = None , verbose = 0 ),
132+ leaf_size = 40 , ** kwargs ):
133+ r"""Perform robust single linkage clustering from a vector array
127134 or distance matrix.
128135
129136 Parameters
@@ -137,28 +144,28 @@ def robust_single_linkage(X, cut, k=5, alpha=1.4142135623730951,
137144 The reachability distance value to cut the cluster hierarchy at
138145 to derive a flat cluster labelling.
139146
140- k : int, optional
147+ k : int, optional (default=5)
141148 Reachability distances will be computed with regard to the `k`
142- nearest neighbors. (default 5)
149+ nearest neighbors.
143150
144- alpha : float, optional
151+ alpha : float, optional (default=np.sqrt(2))
145152 Distance scaling for reachability distance computation. Reachability
146- distance is computed as $max \{ core_k(a), core_k(b), 1/ \a lpha d(a,b) \}$.
147- (default sqrt(2))
153+ distance is computed as
154+ $max \{ core_k(a), core_k(b), 1/\alpha d(a,b) \}$.
148155
149- gamma : int, optional
156+ gamma : int, optional (default=5)
150157 Ignore any clusters in the flat clustering with size less than gamma,
151- and declare points in such clusters as noise points. (default 5)
158+ and declare points in such clusters as noise points.
152159
153- metric : string, or callable, optional
160+ metric : string, or callable, optional (default='euclidean')
154161 The metric to use when calculating distance between instances in a
155162 feature array. If metric is a string or callable, it must be one of
156163 the options allowed by metrics.pairwise.pairwise_distances for its
157164 metric parameter.
158165 If metric is "precomputed", X is assumed to be a distance matrix and
159166 must be square.
160167
161- algorithm : string, optional
168+ algorithm : string, optional (default='best')
162169 Exactly which algorithm to use; hdbscan has variants specialised
163170 for different characteristics of the data. By default this is set
164171 to ``best`` which chooses the "best" algorithm given the nature of
@@ -176,96 +183,100 @@ def robust_single_linkage(X, cut, k=5, alpha=1.4142135623730951,
176183 By default, no caching is done. If a string is given, it is the
177184 path to the caching directory.
178185
179- leaf_size : int, optional
186+ leaf_size : int, optional (default=40)
180187 Leaf size for trees responsible for fast nearest
181- neighbour queries. (default 40)
188+ neighbour queries.
182189
183190 Returns
184191 -------
185- labels : array [ n_samples]
192+ labels : ndarray, shape ( n_samples, )
186193 Cluster labels for each point. Noisy samples are given the label -1.
187194
188- single_linkage_tree : array [ n_samples - 1, 4]
195+ single_linkage_tree : ndarray, shape ( n_samples - 1, 4)
189196 The single linkage tree produced during clustering in scipy
190197 hierarchical clustering format
191198 (see http://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html).
192199
193200 References
194201 ----------
195- K. Chaudhuri and S. Dasgupta.
196- "Rates of convergence for the cluster tree."
197- In Advances in Neural Information Processing Systems, 2010 .
202+ .. [1] Chaudhuri, K., & Dasgupta, S. (2010). Rates of convergence for the
203+ cluster tree. In Advances in Neural Information Processing Systems
204+ (pp. 343-351) .
198205
199206 """
200207
201- if type ( k ) is not int or k < 1 :
208+ if not isinstance ( k , int ) or k < 1 :
202209 raise ValueError ('k must be an integer greater than zero!' )
203210
204- if type (alpha ) is not float or alpha < 1.0 :
211+ if not isinstance (alpha , float ) or alpha < 1.0 :
205212 raise ValueError ('alpha must be a float greater than or equal to 1.0!' )
206213
207- if type (gamma ) is not int or gamma < 1 :
214+ if not isinstance (gamma , int ) or gamma < 1 :
208215 raise ValueError ('gamma must be an integer greater than zero!' )
209216
210- if type (leaf_size ) is not int or leaf_size < 1 :
217+ if not isinstance (leaf_size , int ) or leaf_size < 1 :
211218 raise ValueError ('Leaf size must be at least one!' )
212219
213220 if metric == 'minkowski' :
214221 if 'p' not in kwargs or kwargs ['p' ] is None :
215222 raise TypeError ('Minkowski metric given but no p value supplied!' )
216223 if kwargs ['p' ] < 0 :
217- raise ValueError ('Minkowski metric with negative p value is not defined!' )
224+ raise ValueError ('Minkowski metric with negative p value is not'
225+ ' defined!' )
218226
219227 X = check_array (X , accept_sparse = 'csr' )
220228 if isinstance (memory , six .string_types ):
221229 memory = Memory (cachedir = memory , verbose = 0 )
222230
223231 if algorithm != 'best' :
224232 if algorithm == 'generic' :
225- single_linkage_tree = \
226- memory . cache ( _rsl_generic )( X , k , alpha , metric , ** kwargs )
233+ single_linkage_tree = memory . cache ( _rsl_generic )(
234+ X , k , alpha , metric , ** kwargs )
227235 elif algorithm == 'prims_kdtree' :
228- single_linkage_tree = \
229- memory . cache ( _rsl_prims_kdtree )( X , k , alpha , metric , ** kwargs )
236+ single_linkage_tree = memory . cache ( _rsl_prims_kdtree )(
237+ X , k , alpha , metric , ** kwargs )
230238 elif algorithm == 'prims_balltree' :
231- single_linkage_tree = \
232- memory . cache ( _rsl_prims_balltree )( X , k , alpha , metric , ** kwargs )
239+ single_linkage_tree = memory . cache ( _rsl_prims_balltree )(
240+ X , k , alpha , metric , ** kwargs )
233241 elif algorithm == 'boruvka_kdtree' :
234- single_linkage_tree = \
235- memory . cache ( _rsl_boruvka_kdtree )( X , k , alpha , metric , leaf_size , ** kwargs )
242+ single_linkage_tree = memory . cache ( _rsl_boruvka_kdtree )(
243+ X , k , alpha , metric , leaf_size , ** kwargs )
236244 elif algorithm == 'boruvka_balltree' :
237- single_linkage_tree = \
238- memory . cache ( _rsl_boruvka_balltree )( X , k , alpha , metric , leaf_size , ** kwargs )
245+ single_linkage_tree = memory . cache ( _rsl_boruvka_balltree )(
246+ X , k , alpha , metric , leaf_size , ** kwargs )
239247 else :
240248 raise TypeError ('Unknown algorithm type %s specified' % algorithm )
241249 else :
242- if issparse (X ) or metric not in FAST_METRICS : # We can't do much with sparse matrices ...
243- single_linkage_tree = \
244- memory .cache (_rsl_generic )(X , k , alpha , metric , ** kwargs )
250+ if issparse (X ) or metric not in FAST_METRICS :
251+ # We can't do much with sparse matrices ...
252+ single_linkage_tree = memory .cache (_rsl_generic )(
253+ X , k , alpha , metric , ** kwargs )
245254 elif metric in KDTree .valid_metrics :
246- # Need heuristic to decide when to go to boruvka; still debugging for now
255+ # Need heuristic to decide when to go to boruvka;
256+ # still debugging for now
247257 if X .shape [1 ] > 128 :
248- single_linkage_tree = \
249- memory . cache ( _rsl_prims_kdtree )( X , k , alpha , metric , ** kwargs )
258+ single_linkage_tree = memory . cache ( _rsl_prims_kdtree )(
259+ X , k , alpha , metric , ** kwargs )
250260 else :
251- single_linkage_tree = \
252- memory . cache ( _rsl_boruvka_kdtree )( X , k , alpha , metric , ** kwargs )
261+ single_linkage_tree = memory . cache ( _rsl_boruvka_kdtree )(
262+ X , k , alpha , metric , ** kwargs )
253263 else : # Metric is a valid BallTree metric
254- # Need heuristic to decide when to go to boruvka; still debugging for now
264+ # Need heuristic to decide when to go to boruvka;
265+ # still debugging for now
255266 if X .shape [1 ] > 128 :
256- single_linkage_tree = \
257- memory . cache ( _rsl_prims_kdtree )( X , k , alpha , metric , ** kwargs )
267+ single_linkage_tree = memory . cache ( _rsl_prims_kdtree )(
268+ X , k , alpha , metric , ** kwargs )
258269 else :
259- single_linkage_tree = \
260- memory . cache ( _rsl_boruvka_balltree )( X , k , alpha , metric , ** kwargs )
270+ single_linkage_tree = memory . cache ( _rsl_boruvka_balltree )(
271+ X , k , alpha , metric , ** kwargs )
261272
262273 labels = single_linkage_tree .get_clusters (cut , gamma )
263274
264275 return labels , single_linkage_tree
265276
266277
267278class RobustSingleLinkage (BaseEstimator , ClusterMixin ):
268- """Perform robust single linkage clustering from a vector array
279+ r """Perform robust single linkage clustering from a vector array
269280 or distance matrix.
270281
271282 Robust single linkage is a modified version of single linkage that
@@ -285,28 +296,28 @@ class RobustSingleLinkage(BaseEstimator, ClusterMixin):
285296 The reachability distance value to cut the cluster hierarchy at
286297 to derive a flat cluster labelling.
287298
288- k : int, optional
299+ k : int, optional (default=5)
289300 Reachability distances will be computed with regard to the `k`
290- nearest neighbors. (default 5)
301+ nearest neighbors.
291302
292- alpha : float, optional
303+ alpha : float, optional (default=np.sqrt(2))
293304 Distance scaling for reachability distance computation. Reachability
294- distance is computed as $max \{ core_k(a), core_k(b), 1/ \a lpha d(a,b) \}$.
295- (default sqrt(2))
305+ distance is computed as
306+ $max \{ core_k(a), core_k(b), 1/\alpha d(a,b) \}$.
296307
297- gamma : int, optional
308+ gamma : int, optional (default=5)
298309 Ignore any clusters in the flat clustering with size less than gamma,
299- and declare points in such clusters as noise points. (default 5)
310+ and declare points in such clusters as noise points.
300311
301- metric : string, or callable, optional
312+ metric : string, or callable, optional (default='euclidean')
302313 The metric to use when calculating distance between instances in a
303314 feature array. If metric is a string or callable, it must be one of
304315 the options allowed by metrics.pairwise.pairwise_distances for its
305316 metric parameter.
306317 If metric is "precomputed", X is assumed to be a distance matrix and
307318 must be square.
308319
309- algorithm : string, optional
320+ algorithm : string, optional (default='best')
310321 Exactly which algorithm to use; hdbscan has variants specialised
311322 for different characteristics of the data. By default this is set
312323 to ``best`` which chooses the "best" algorithm given the nature of
@@ -319,27 +330,27 @@ class RobustSingleLinkage(BaseEstimator, ClusterMixin):
319330
320331 Attributes
321332 ----------
322- labels_ : array [ n_samples]
333+ labels_ : ndarray, shape ( n_samples, )
323334 Cluster labels for each point. Noisy samples are given the label -1.
324335
325336 cluster_hierarchy_ : SingleLinkageTree object
326- The single linkage tree produced during clustering. This object provides
327- several methods for:
337+ The single linkage tree produced during clustering.
338+ This object provides several methods for:
328339 * Plotting
329340 * Generating a flat clustering
330341 * Exporting to NetworkX
331342 * Exporting to Pandas
332343
333344 References
334345 ----------
335- K. Chaudhuri and S. Dasgupta.
336- "Rates of convergence for the cluster tree."
337- In Advances in Neural Information Processing Systems, 2010 .
346+ .. [1] Chaudhuri, K., & Dasgupta, S. (2010). Rates of convergence for the
347+ cluster tree. In Advances in Neural Information Processing Systems
348+ (pp. 343-351) .
338349
339350 """
340351
341- def __init__ (self , cut = 0.4 , k = 5 , alpha = 1.4142135623730951 , gamma = 5 , metric = 'euclidean' ,
342- algorithm = 'best' , ** kwargs ):
352+ def __init__ (self , cut = 0.4 , k = 5 , alpha = 1.4142135623730951 , gamma = 5 ,
353+ metric = 'euclidean' , algorithm = 'best' , ** kwargs ):
343354
344355 self .cut = cut
345356 self .k = k
@@ -353,21 +364,29 @@ def __init__(self, cut=0.4, k=5, alpha=1.4142135623730951, gamma=5, metric='eucl
353364 self ._cluster_hierarchy_ = None
354365
def fit(self, X, y=None):
    """Cluster ``X`` with robust single linkage and store the results.

    Parameters
    ----------
    X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \
            array of shape (n_samples, n_samples)
        Feature array, or a matrix of distances between samples when
        ``metric='precomputed'``.

    y : ignored
        Not used by this method.

    Returns
    -------
    self : object
        This estimator, with ``labels_`` and ``_cluster_hierarchy_`` set
        from the values returned by ``robust_single_linkage``.
    """
    X = check_array(X, accept_sparse='csr')

    # Start from the estimator's own parameters, then overlay the extra
    # metric keyword arguments (they win on a name clash).
    call_args = self.get_params()
    call_args.update(self._metric_kwargs)

    labels, hierarchy = robust_single_linkage(X, **call_args)
    self.labels_ = labels
    self._cluster_hierarchy_ = hierarchy

    return self
372391
373392 def fit_predict (self , X , y = None ):
@@ -382,7 +401,7 @@ def fit_predict(self, X, y=None):
382401
383402 Returns
384403 -------
385- y : ndarray, shape (n_samples,)
404+ y : ndarray, shape (n_samples, )
386405 cluster labels
387406 """
388407
@@ -394,5 +413,6 @@ def cluster_hierarchy_(self):
394413 if self ._cluster_hierarchy_ is not None :
395414 return SingleLinkageTree (self ._cluster_hierarchy_ )
396415 else :
397- warn ('No single linkage tree was generated; try running fit first.' )
416+ warn ('No single linkage tree was generated; try running fit'
417+ ' first.' )
398418 return None
0 commit comments