Skip to content

Commit b238f0f

Browse files
author
Guillaume Lemaitre
committed
PEP8 robust single linkage
1 parent a1d0d0f commit b238f0f

File tree

1 file changed

+98
-78
lines changed

1 file changed

+98
-78
lines changed

hdbscan/robust_single_linkage_.py

Lines changed: 98 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,6 @@
22
"""
33
Robust Single Linkage: Density based single linkage clustering.
44
"""
5-
# Author: Leland McInnes <[email protected]>
6-
#
7-
# License: BSD 3 clause
8-
95
import numpy as np
106

117
from sklearn.base import BaseEstimator, ClusterMixin
@@ -25,24 +21,31 @@
2521

2622
from warnings import warn
2723

24+
# Author: Leland McInnes <[email protected]>
25+
#
26+
# License: BSD 3 clause
27+
2828
FAST_METRICS = KDTree.valid_metrics + BallTree.valid_metrics
2929

3030

31-
def _rsl_generic(X, k=5, alpha=1.4142135623730951, metric='euclidean', **kwargs):
31+
def _rsl_generic(X, k=5, alpha=1.4142135623730951, metric='euclidean',
32+
**kwargs):
3233
distance_matrix = pairwise_distances(X, metric=metric, **kwargs)
3334

3435
mutual_reachability_ = mutual_reachability(distance_matrix, k)
3536

3637
min_spanning_tree = mst_linkage_core(mutual_reachability_)
37-
min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :]
38+
min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]),
39+
:]
3840

3941
single_linkage_tree = label(min_spanning_tree)
4042
single_linkage_tree = SingleLinkageTree(single_linkage_tree)
4143

4244
return single_linkage_tree
4345

4446

45-
def _rsl_prims_kdtree(X, k=5, alpha=1.4142135623730951, metric='euclidean', **kwargs):
47+
def _rsl_prims_kdtree(X, k=5, alpha=1.4142135623730951, metric='euclidean',
48+
**kwargs):
4649

4750
# The Cython routines used require contiguous arrays
4851
if not X.flags['C_CONTIGUOUS']:
@@ -56,15 +59,17 @@ def _rsl_prims_kdtree(X, k=5, alpha=1.4142135623730951, metric='euclidean', **kw
5659
dist_metric = DistanceMetric.get_metric(metric, **kwargs)
5760

5861
core_distances = tree.query(X, k=k)[0][:, -1].copy(order='C')
59-
min_spanning_tree = mst_linkage_core_vector(X, core_distances, dist_metric, alpha)
62+
min_spanning_tree = mst_linkage_core_vector(X, core_distances, dist_metric,
63+
alpha)
6064

6165
single_linkage_tree = label(min_spanning_tree)
6266
single_linkage_tree = SingleLinkageTree(single_linkage_tree)
6367

6468
return single_linkage_tree
6569

6670

67-
def _rsl_prims_balltree(X, k=5, alpha=1.4142135623730951, metric='euclidean', **kwargs):
71+
def _rsl_prims_balltree(X, k=5, alpha=1.4142135623730951, metric='euclidean',
72+
**kwargs):
6873

6974
# The Cython routines used require contiguous arrays
7075
if not X.flags['C_CONTIGUOUS']:
@@ -78,7 +83,8 @@ def _rsl_prims_balltree(X, k=5, alpha=1.4142135623730951, metric='euclidean', **
7883
dist_metric = DistanceMetric.get_metric(metric, **kwargs)
7984

8085
core_distances = tree.query(X, k=k)[0][:, -1].copy(order='C')
81-
min_spanning_tree = mst_linkage_core_vector(X, core_distances, dist_metric, alpha)
86+
min_spanning_tree = mst_linkage_core_vector(X, core_distances, dist_metric,
87+
alpha)
8288

8389
single_linkage_tree = label(min_spanning_tree)
8490
single_linkage_tree = SingleLinkageTree(single_linkage_tree)
@@ -122,8 +128,9 @@ def _rsl_boruvka_balltree(X, k=5, alpha=1.0,
122128

123129
def robust_single_linkage(X, cut, k=5, alpha=1.4142135623730951,
124130
gamma=5, metric='euclidean', algorithm='best',
125-
memory=Memory(cachedir=None, verbose=0), leaf_size=40, **kwargs):
126-
"""Perform robust single linkage clustering from a vector array
131+
memory=Memory(cachedir=None, verbose=0),
132+
leaf_size=40, **kwargs):
133+
r"""Perform robust single linkage clustering from a vector array
127134
or distance matrix.
128135
129136
Parameters
@@ -137,28 +144,28 @@ def robust_single_linkage(X, cut, k=5, alpha=1.4142135623730951,
137144
The reachability distance value to cut the cluster hierarchy at
138145
to derive a flat cluster labelling.
139146
140-
k : int, optional
147+
k : int, optional (default=5)
141148
Reachability distances will be computed with regard to the `k`
142-
nearest neighbors. (default 5)
149+
nearest neighbors.
143150
144-
alpha : float, optional
151+
alpha : float, optional (default=np.sqrt(2))
145152
Distance scaling for reachability distance computation. Reachability
146-
distance is computed as $max \{ core_k(a), core_k(b), 1/\alpha d(a,b) \}$.
147-
(default sqrt(2))
153+
distance is computed as
154+
$max \{ core_k(a), core_k(b), 1/\alpha d(a,b) \}$.
148155
149-
gamma : int, optional
156+
gamma : int, optional (default=5)
150157
Ignore any clusters in the flat clustering with size less than gamma,
151-
and declare points in such clusters as noise points. (default 5)
158+
and declare points in such clusters as noise points.
152159
153-
metric : string, or callable, optional
160+
metric : string, or callable, optional (default='euclidean')
154161
The metric to use when calculating distance between instances in a
155162
feature array. If metric is a string or callable, it must be one of
156163
the options allowed by metrics.pairwise.pairwise_distances for its
157164
metric parameter.
158165
If metric is "precomputed", X is assumed to be a distance matrix and
159166
must be square.
160167
161-
algorithm : string, optional
168+
algorithm : string, optional (default='best')
162169
Exactly which algorithm to use; hdbscan has variants specialised
163170
for different characteristics of the data. By default this is set
164171
to ``best`` which chooses the "best" algorithm given the nature of
@@ -176,96 +183,100 @@ def robust_single_linkage(X, cut, k=5, alpha=1.4142135623730951,
176183
By default, no caching is done. If a string is given, it is the
177184
path to the caching directory.
178185
179-
leaf_size : int, optional
186+
leaf_size : int, optional (default=40)
180187
Leaf size for trees responsible for fast nearest
181-
neighbour queries. (default 40)
188+
neighbour queries.
182189
183190
Returns
184191
-------
185-
labels : array [n_samples]
192+
labels : ndarray, shape (n_samples, )
186193
Cluster labels for each point. Noisy samples are given the label -1.
187194
188-
single_linkage_tree : array [n_samples - 1, 4]
195+
single_linkage_tree : ndarray, shape (n_samples - 1, 4)
189196
The single linkage tree produced during clustering in scipy
190197
hierarchical clustering format
191198
(see http://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html).
192199
193200
References
194201
----------
195-
K. Chaudhuri and S. Dasgupta.
196-
"Rates of convergence for the cluster tree."
197-
In Advances in Neural Information Processing Systems, 2010.
202+
.. [1] Chaudhuri, K., & Dasgupta, S. (2010). Rates of convergence for the
203+
cluster tree. In Advances in Neural Information Processing Systems
204+
(pp. 343-351).
198205
199206
"""
200207

201-
if type(k) is not int or k < 1:
208+
if not isinstance(k, int) or k < 1:
202209
raise ValueError('k must be an integer greater than zero!')
203210

204-
if type(alpha) is not float or alpha < 1.0:
211+
if not isinstance(alpha, float) or alpha < 1.0:
205212
raise ValueError('alpha must be a float greater than or equal to 1.0!')
206213

207-
if type(gamma) is not int or gamma < 1:
214+
if not isinstance(gamma, int) or gamma < 1:
208215
raise ValueError('gamma must be an integer greater than zero!')
209216

210-
if type(leaf_size) is not int or leaf_size < 1:
217+
if not isinstance(leaf_size, int) or leaf_size < 1:
211218
raise ValueError('Leaf size must be at least one!')
212219

213220
if metric == 'minkowski':
214221
if 'p' not in kwargs or kwargs['p'] is None:
215222
raise TypeError('Minkowski metric given but no p value supplied!')
216223
if kwargs['p'] < 0:
217-
raise ValueError('Minkowski metric with negative p value is not defined!')
224+
raise ValueError('Minkowski metric with negative p value is not'
225+
' defined!')
218226

219227
X = check_array(X, accept_sparse='csr')
220228
if isinstance(memory, six.string_types):
221229
memory = Memory(cachedir=memory, verbose=0)
222230

223231
if algorithm != 'best':
224232
if algorithm == 'generic':
225-
single_linkage_tree = \
226-
memory.cache(_rsl_generic)(X, k, alpha, metric, **kwargs)
233+
single_linkage_tree = memory.cache(_rsl_generic)(
234+
X, k, alpha, metric, **kwargs)
227235
elif algorithm == 'prims_kdtree':
228-
single_linkage_tree = \
229-
memory.cache(_rsl_prims_kdtree)(X, k, alpha, metric, **kwargs)
236+
single_linkage_tree = memory.cache(_rsl_prims_kdtree)(
237+
X, k, alpha, metric, **kwargs)
230238
elif algorithm == 'prims_balltree':
231-
single_linkage_tree = \
232-
memory.cache(_rsl_prims_balltree)(X, k, alpha, metric, **kwargs)
239+
single_linkage_tree = memory.cache(_rsl_prims_balltree)(
240+
X, k, alpha, metric, **kwargs)
233241
elif algorithm == 'boruvka_kdtree':
234-
single_linkage_tree = \
235-
memory.cache(_rsl_boruvka_kdtree)(X, k, alpha, metric, leaf_size, **kwargs)
242+
single_linkage_tree = memory.cache(_rsl_boruvka_kdtree)(
243+
X, k, alpha, metric, leaf_size, **kwargs)
236244
elif algorithm == 'boruvka_balltree':
237-
single_linkage_tree = \
238-
memory.cache(_rsl_boruvka_balltree)(X, k, alpha, metric, leaf_size, **kwargs)
245+
single_linkage_tree = memory.cache(_rsl_boruvka_balltree)(
246+
X, k, alpha, metric, leaf_size, **kwargs)
239247
else:
240248
raise TypeError('Unknown algorithm type %s specified' % algorithm)
241249
else:
242-
if issparse(X) or metric not in FAST_METRICS: # We can't do much with sparse matrices ...
243-
single_linkage_tree = \
244-
memory.cache(_rsl_generic)(X, k, alpha, metric, **kwargs)
250+
if issparse(X) or metric not in FAST_METRICS:
251+
# We can't do much with sparse matrices ...
252+
single_linkage_tree = memory.cache(_rsl_generic)(
253+
X, k, alpha, metric, **kwargs)
245254
elif metric in KDTree.valid_metrics:
246-
# Need heuristic to decide when to go to boruvka; still debugging for now
255+
# Need heuristic to decide when to go to boruvka;
256+
# still debugging for now
247257
if X.shape[1] > 128:
248-
single_linkage_tree = \
249-
memory.cache(_rsl_prims_kdtree)(X, k, alpha, metric, **kwargs)
258+
single_linkage_tree = memory.cache(_rsl_prims_kdtree)(
259+
X, k, alpha, metric, **kwargs)
250260
else:
251-
single_linkage_tree = \
252-
memory.cache(_rsl_boruvka_kdtree)(X, k, alpha, metric, **kwargs)
261+
single_linkage_tree = memory.cache(_rsl_boruvka_kdtree)(
262+
X, k, alpha, metric, **kwargs)
253263
else: # Metric is a valid BallTree metric
254-
# Need heuristic to decide when to go to boruvka; still debugging for now
264+
# Need heuristic to decide when to go to boruvka;
265+
# still debugging for now
255266
if X.shape[1] > 128:
256-
single_linkage_tree = \
257-
memory.cache(_rsl_prims_kdtree)(X, k, alpha, metric, **kwargs)
267+
single_linkage_tree = memory.cache(_rsl_prims_kdtree)(
268+
X, k, alpha, metric, **kwargs)
258269
else:
259-
single_linkage_tree = \
260-
memory.cache(_rsl_boruvka_balltree)(X, k, alpha, metric, **kwargs)
270+
single_linkage_tree = memory.cache(_rsl_boruvka_balltree)(
271+
X, k, alpha, metric, **kwargs)
261272

262273
labels = single_linkage_tree.get_clusters(cut, gamma)
263274

264275
return labels, single_linkage_tree
265276

266277

267278
class RobustSingleLinkage(BaseEstimator, ClusterMixin):
268-
"""Perform robust single linkage clustering from a vector array
279+
r"""Perform robust single linkage clustering from a vector array
269280
or distance matrix.
270281
271282
Robust single linkage is a modified version of single linkage that
@@ -285,28 +296,28 @@ class RobustSingleLinkage(BaseEstimator, ClusterMixin):
285296
The reachability distance value to cut the cluster hierarchy at
286297
to derive a flat cluster labelling.
287298
288-
k : int, optional
299+
k : int, optional (default=5)
289300
Reachability distances will be computed with regard to the `k`
290-
nearest neighbors. (default 5)
301+
nearest neighbors.
291302
292-
alpha : float, optional
303+
alpha : float, optional (default=np.sqrt(2))
293304
Distance scaling for reachability distance computation. Reachability
294-
distance is computed as $max \{ core_k(a), core_k(b), 1/\alpha d(a,b) \}$.
295-
(default sqrt(2))
305+
distance is computed as
306+
$max \{ core_k(a), core_k(b), 1/\alpha d(a,b) \}$.
296307
297-
gamma : int, optional
308+
gamma : int, optional (default=5)
298309
Ignore any clusters in the flat clustering with size less than gamma,
299-
and declare points in such clusters as noise points. (default 5)
310+
and declare points in such clusters as noise points.
300311
301-
metric : string, or callable, optional
312+
metric : string, or callable, optional (default='euclidean')
302313
The metric to use when calculating distance between instances in a
303314
feature array. If metric is a string or callable, it must be one of
304315
the options allowed by metrics.pairwise.pairwise_distances for its
305316
metric parameter.
306317
If metric is "precomputed", X is assumed to be a distance matrix and
307318
must be square.
308319
309-
algorithm : string, optional
320+
algorithm : string, optional (default='best')
310321
Exactly which algorithm to use; hdbscan has variants specialised
311322
for different characteristics of the data. By default this is set
312323
to ``best`` which chooses the "best" algorithm given the nature of
@@ -319,27 +330,27 @@ class RobustSingleLinkage(BaseEstimator, ClusterMixin):
319330
320331
Attributes
321332
----------
322-
labels_ : array [n_samples]
333+
labels_ : ndarray, shape (n_samples, )
323334
Cluster labels for each point. Noisy samples are given the label -1.
324335
325336
cluster_hierarchy_ : SingleLinkageTree object
326-
The single linkage tree produced during clustering. This object provides
327-
several methods for:
337+
The single linkage tree produced during clustering.
338+
This object provides several methods for:
328339
* Plotting
329340
* Generating a flat clustering
330341
* Exporting to NetworkX
331342
* Exporting to Pandas
332343
333344
References
334345
----------
335-
K. Chaudhuri and S. Dasgupta.
336-
"Rates of convergence for the cluster tree."
337-
In Advances in Neural Information Processing Systems, 2010.
346+
.. [1] Chaudhuri, K., & Dasgupta, S. (2010). Rates of convergence for the
347+
cluster tree. In Advances in Neural Information Processing Systems
348+
(pp. 343-351).
338349
339350
"""
340351

341-
def __init__(self, cut=0.4, k=5, alpha=1.4142135623730951, gamma=5, metric='euclidean',
342-
algorithm='best', **kwargs):
352+
def __init__(self, cut=0.4, k=5, alpha=1.4142135623730951, gamma=5,
353+
metric='euclidean', algorithm='best', **kwargs):
343354

344355
self.cut = cut
345356
self.k = k
@@ -353,21 +364,29 @@ def __init__(self, cut=0.4, k=5, alpha=1.4142135623730951, gamma=5, metric='eucl
353364
self._cluster_hierarchy_ = None
354365

355366
def fit(self, X, y=None):
356-
"""Perform robust single linkage clustering from features or distance matrix.
367+
"""Perform robust single linkage clustering from features or
368+
distance matrix.
357369
358370
Parameters
359371
----------
360372
X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \
361373
array of shape (n_samples, n_samples)
362374
A feature array, or array of distances between samples if
363375
``metric='precomputed'``.
376+
377+
Returns
378+
-------
379+
self : object
380+
Returns self
364381
"""
365382
X = check_array(X, accept_sparse='csr')
366383

367384
kwargs = self.get_params()
368385
kwargs.update(self._metric_kwargs)
369386

370-
self.labels_, self._cluster_hierarchy_ = robust_single_linkage(X, **kwargs)
387+
self.labels_, self._cluster_hierarchy_ = robust_single_linkage(
388+
X, **kwargs)
389+
371390
return self
372391

373392
def fit_predict(self, X, y=None):
@@ -382,7 +401,7 @@ def fit_predict(self, X, y=None):
382401
383402
Returns
384403
-------
385-
y : ndarray, shape (n_samples,)
404+
y : ndarray, shape (n_samples, )
386405
cluster labels
387406
"""
388407

@@ -394,5 +413,6 @@ def cluster_hierarchy_(self):
394413
if self._cluster_hierarchy_ is not None:
395414
return SingleLinkageTree(self._cluster_hierarchy_)
396415
else:
397-
warn('No single linkage tree was generated; try running fit first.')
416+
warn('No single linkage tree was generated; try running fit'
417+
' first.')
398418
return None

0 commit comments

Comments
 (0)