Skip to content

Commit a03ce02

Browse files
committed
Further coverage improvements
1 parent 5528e10 commit a03ce02

File tree

5 files changed

+231
-26
lines changed

5 files changed

+231
-26
lines changed

hdbscan/hdbscan_.py

Lines changed: 11 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -18,14 +18,7 @@
1818
from sklearn.externals.joblib import Memory
1919
from sklearn.externals import six
2020
from warnings import warn
21-
22-
#Try and work around older sklearn api
23-
try:
24-
from sklearn.utils import check_array
25-
except ImportError:
26-
from sklearn.utils import check_arrays
27-
28-
check_array = check_arrays
21+
from sklearn.utils import check_array
2922

3023
from scipy.sparse import csgraph
3124

@@ -527,7 +520,7 @@ def hdbscan(X, min_cluster_size=5, min_samples=None, alpha=1.0,
527520
if X.shape[1] > 60:
528521
(single_linkage_tree,
529522
result_min_span_tree) = \
530-
memory.cache(_hdbscan_prims_kdtree)(X, min_samples, alpha,
523+
memory.cache(_hdbscan_prims_balltree)(X, min_samples, alpha,
531524
metric, p, leaf_size,
532525
gen_min_span_tree, **kwargs)
533526
else:
@@ -764,8 +757,12 @@ def outlier_scores_(self):
764757
if self._outlier_scores is not None:
765758
return self._outlier_scores
766759
else:
767-
self._outlier_scores = outlier_scores(self._condensed_tree)
768-
return self._outlier_scores
760+
if self._condensed_tree is not None:
761+
self._outlier_scores = outlier_scores(self._condensed_tree)
762+
return self._outlier_scores
763+
else:
764+
warn('No condensed tree was generated; try running fit first.')
765+
return None
769766

770767
@property
771768
def condensed_tree_(self):
@@ -794,7 +791,7 @@ def minimum_spanning_tree_(self):
794791
'No minimum spanning tree will be provided without raw data.')
795792
return None
796793
else:
797-
warn('No minimum spanning tree was generated. \n'
798-
'This may be due to optimized algorithm variations that skip\n'
799-
'explicit generation of the spanning tree.')
794+
warn('No minimum spanning tree was generated.'
795+
'This may be due to optimized algorithm variations that skip'
796+
'explicit generation of the spanning tree.')
800797
return None

hdbscan/robust_single_linkage_.py

Lines changed: 23 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,7 @@
1414

1515
from sklearn.externals.joblib import Memory
1616
from sklearn.externals import six
17-
18-
try:
19-
from sklearn.utils import check_array
20-
except ImportError:
21-
from sklearn.utils import check_arrays
22-
23-
check_array = check_arrays
17+
from sklearn.utils import check_array
2418

2519
from ._hdbscan_linkage import mst_linkage_core, mst_linkage_core_vector, label
2620
from ._hdbscan_boruvka import KDTreeBoruvkaAlgorithm, BallTreeBoruvkaAlgorithm
@@ -116,6 +110,12 @@ def _rsl_prims_balltree(X, k=5, alpha=1.4142135623730951, metric='minkowski', p=
116110

117111
def _rsl_boruvka_kdtree(X, k=5, alpha=1.0,
118112
metric='minkowski', p=2, leaf_size=40):
113+
if metric == 'minkowski':
114+
if p is None:
115+
raise TypeError('Minkowski metric given but no p value supplied!')
116+
if p < 0:
117+
raise ValueError('Minkowski metric with negative p value is not defined!')
118+
119119
dim = X.shape[0]
120120
min_samples = min(dim - 1, k)
121121

@@ -132,6 +132,12 @@ def _rsl_boruvka_kdtree(X, k=5, alpha=1.0,
132132

133133
def _rsl_boruvka_balltree(X, k=5, alpha=1.0,
134134
metric='minkowski', p=2, leaf_size=40):
135+
if metric == 'minkowski':
136+
if p is None:
137+
raise TypeError('Minkowski metric given but no p value supplied!')
138+
if p < 0:
139+
raise ValueError('Minkowski metric with negative p value is not defined!')
140+
135141
dim = X.shape[0]
136142
min_samples = min(dim - 1, k)
137143

@@ -148,7 +154,7 @@ def _rsl_boruvka_balltree(X, k=5, alpha=1.0,
148154

149155
def robust_single_linkage(X, cut, k=5, alpha=1.4142135623730951,
150156
gamma=5, metric='minkowski', p=2, algorithm='best',
151-
memory=Memory(cachedir=None, verbose=0)):
157+
memory=Memory(cachedir=None, verbose=0), leaf_size=40):
152158
"""Perform robust single linkage clustering from a vector array
153159
or distance matrix.
154160
@@ -202,6 +208,10 @@ def robust_single_linkage(X, cut, k=5, alpha=1.4142135623730951,
202208
By default, no caching is done. If a string is given, it is the
203209
path to the caching directory.
204210
211+
leaf_size : int, optional
212+
Leaf size for trees responsible for fast nearest
213+
neighbour queries. (default 40)
214+
205215
Returns
206216
-------
207217
labels : array [n_samples]
@@ -229,6 +239,9 @@ def robust_single_linkage(X, cut, k=5, alpha=1.4142135623730951,
229239
if type(gamma) is not int or gamma < 1:
230240
raise ValueError('gamma must be an integer greater than zero!')
231241

242+
if type(leaf_size) is not int or leaf_size < 1:
243+
raise ValueError('Leaf size must be at least one!')
244+
232245
X = check_array(X, accept_sparse='csr')
233246
if isinstance(memory, six.string_types):
234247
memory = Memory(cachedir=memory, verbose=0)
@@ -245,10 +258,10 @@ def robust_single_linkage(X, cut, k=5, alpha=1.4142135623730951,
245258
memory.cache(_rsl_prims_balltree)(X, k, alpha, metric, p)
246259
elif algorithm == 'boruvka_kdtree':
247260
single_linkage_tree = \
248-
memory.cache(_rsl_boruvka_kdtree)(X, k, alpha, metric, p)
261+
memory.cache(_rsl_boruvka_kdtree)(X, k, alpha, metric, p, leaf_size)
249262
elif algorithm == 'boruvka_balltree':
250263
single_linkage_tree = \
251-
memory.cache(_rsl_boruvka_balltree)(X, k, alpha, metric, p)
264+
memory.cache(_rsl_boruvka_balltree)(X, k, alpha, metric, p, leaf_size)
252265
else:
253266
raise TypeError('Unknown algorithm type %s specified' % algorithm)
254267
else:

hdbscan/tests/test_hdbscan.py

Lines changed: 137 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@
3030

3131
from sklearn import datasets
3232

33+
import warnings
34+
3335
n_clusters = 3
3436
# X = generate_clustered_data(n_clusters=n_clusters, n_samples_per_cluster=50)
3537
X, y = make_blobs(n_samples=50, random_state=0)
@@ -110,7 +112,7 @@ def test_hdbscan_sparse_distance_matrix():
110112
n_clusters_1 = len(set(labels)) - int(-1 in labels) # ignore noise
111113
assert_equal(n_clusters_1, n_clusters)
112114

113-
labels = HDBSCAN(metric="precomputed").fit(D).labels_
115+
labels = HDBSCAN(metric="precomputed", gen_min_span_tree=True).fit(D).labels_
114116
n_clusters_2 = len(set(labels)) - int(-1 in labels)
115117
assert_equal(n_clusters_2, n_clusters)
116118

@@ -124,6 +126,89 @@ def test_hdbscan_feature_vector():
124126
n_clusters_2 = len(set(labels)) - int(-1 in labels)
125127
assert_equal(n_clusters_2, n_clusters)
126128

129+
def test_hdbscan_prims_kdtree():
    """Exercise the 'prims_kdtree' algorithm option of hdbscan / HDBSCAN.

    Both the functional and the estimator API must recover ``n_clusters``
    clusters on the module-level data ``X``, and the functional API must
    raise ``ValueError`` for the metric name passed in the final check.
    """
    # Functional API: six values are returned, only the labels are inspected.
    (func_labels, _probabilities, _persistence,
     _condensed, _linkage, _spanning) = hdbscan(X, algorithm='prims_kdtree')
    # The label -1 marks noise and is not counted as a cluster.
    found_by_function = len(set(func_labels) - {-1})
    assert_equal(found_by_function, n_clusters)

    # Estimator API, additionally requesting the minimum spanning tree.
    estimator = HDBSCAN(algorithm='prims_kdtree', gen_min_span_tree=True)
    est_labels = estimator.fit(X).labels_
    found_by_estimator = len(set(est_labels) - {-1})
    assert_equal(found_by_estimator, n_clusters)

    # NOTE(review): 'russelrao' may be a misspelling of 'russellrao' --
    # confirm which rejection path this is meant to cover.
    rejected_options = dict(algorithm='prims_kdtree', metric='russelrao')
    assert_raises(ValueError, hdbscan, X, **rejected_options)
143+
144+
def test_hdbscan_prims_balltree():
    """Exercise the 'prims_balltree' algorithm option of hdbscan / HDBSCAN.

    Both the functional and the estimator API must recover ``n_clusters``
    clusters on the module-level data ``X``, and the functional API must
    raise ``ValueError`` when combined with the 'cosine' metric.
    """
    # Functional API: six values are returned, only the labels are inspected.
    (func_labels, _probabilities, _persistence,
     _condensed, _linkage, _spanning) = hdbscan(X, algorithm='prims_balltree')
    # The label -1 marks noise and is not counted as a cluster.
    found_by_function = len(set(func_labels) - {-1})
    assert_equal(found_by_function, n_clusters)

    # Estimator API, additionally requesting the minimum spanning tree.
    estimator = HDBSCAN(algorithm='prims_balltree', gen_min_span_tree=True)
    est_labels = estimator.fit(X).labels_
    found_by_estimator = len(set(est_labels) - {-1})
    assert_equal(found_by_estimator, n_clusters)

    rejected_options = dict(algorithm='prims_balltree', metric='cosine')
    assert_raises(ValueError, hdbscan, X, **rejected_options)
158+
159+
def test_hdbscan_boruvka_kdtree():
    """Exercise the 'boruvka_kdtree' algorithm option of hdbscan / HDBSCAN.

    Both the functional and the estimator API must recover ``n_clusters``
    clusters on the module-level data ``X``, and the functional API must
    raise ``ValueError`` for the metric name passed in the final check.
    """
    # Functional API: six values are returned, only the labels are inspected.
    (func_labels, _probabilities, _persistence,
     _condensed, _linkage, _spanning) = hdbscan(X, algorithm='boruvka_kdtree')
    # The label -1 marks noise and is not counted as a cluster.
    found_by_function = len(set(func_labels) - {-1})
    assert_equal(found_by_function, n_clusters)

    # Estimator API, additionally requesting the minimum spanning tree.
    estimator = HDBSCAN(algorithm='boruvka_kdtree', gen_min_span_tree=True)
    est_labels = estimator.fit(X).labels_
    found_by_estimator = len(set(est_labels) - {-1})
    assert_equal(found_by_estimator, n_clusters)

    # NOTE(review): 'russelrao' may be a misspelling of 'russellrao' --
    # confirm which rejection path this is meant to cover.
    rejected_options = dict(algorithm='boruvka_kdtree', metric='russelrao')
    assert_raises(ValueError, hdbscan, X, **rejected_options)
173+
174+
def test_hdbscan_boruvka_balltree():
    """Exercise the 'boruvka_balltree' algorithm option of hdbscan / HDBSCAN.

    Both the functional and the estimator API must recover ``n_clusters``
    clusters on the module-level data ``X``, and the functional API must
    raise ``ValueError`` when combined with the 'cosine' metric.
    """
    # Functional API: six values are returned, only the labels are inspected.
    (func_labels, _probabilities, _persistence,
     _condensed, _linkage, _spanning) = hdbscan(X, algorithm='boruvka_balltree')
    # The label -1 marks noise and is not counted as a cluster.
    found_by_function = len(set(func_labels) - {-1})
    assert_equal(found_by_function, n_clusters)

    # Estimator API, additionally requesting the minimum spanning tree.
    estimator = HDBSCAN(algorithm='boruvka_balltree', gen_min_span_tree=True)
    est_labels = estimator.fit(X).labels_
    found_by_estimator = len(set(est_labels) - {-1})
    assert_equal(found_by_estimator, n_clusters)

    rejected_options = dict(algorithm='boruvka_balltree', metric='cosine')
    assert_raises(ValueError, hdbscan, X, **rejected_options)
188+
189+
190+
def test_hdbscan_high_dimensional():
    """Cluster standardized 64-feature blobs with default and 'best' settings.

    Fifty samples with 64 features are generated, standardized, and clustered
    through the functional API with defaults and through the estimator with
    ``algorithm='best'`` and the 'seuclidean' metric (unit variances ``V``).
    Both runs must recover ``n_clusters`` clusters.
    """
    # NOTE(review): presumably chosen to hit the wide-data (> 60 columns)
    # branch of the default algorithm selection -- confirm against hdbscan().
    blobs, _targets = make_blobs(n_samples=50, random_state=0, n_features=64)
    scaled = StandardScaler().fit_transform(blobs)

    (func_labels, _probabilities, _persistence,
     _condensed, _linkage, _spanning) = hdbscan(scaled)
    # The label -1 marks noise and is not counted as a cluster.
    found_by_function = len(set(func_labels) - {-1})
    assert_equal(found_by_function, n_clusters)

    unit_variances = np.ones(scaled.shape[1])
    estimator = HDBSCAN(algorithm='best', metric='seuclidean', V=unit_variances)
    est_labels = estimator.fit(scaled).labels_
    found_by_estimator = len(set(est_labels) - {-1})
    assert_equal(found_by_estimator, n_clusters)
201+
202+
def test_hdbscan_best_balltree_metric():
    """Cluster ``X`` with the 'seuclidean' metric under default algorithm choice.

    The metric is given unit variances ``V`` (one per column of ``X``); both
    the functional and the estimator API must recover ``n_clusters`` clusters.
    """
    (func_labels, _probabilities, _persistence,
     _condensed, _linkage, _spanning) = hdbscan(
        X, metric='seuclidean', V=np.ones(X.shape[1]))
    # The label -1 marks noise and is not counted as a cluster.
    found_by_function = len(set(func_labels) - {-1})
    assert_equal(found_by_function, n_clusters)

    estimator = HDBSCAN(metric='seuclidean', V=np.ones(X.shape[1]))
    est_labels = estimator.fit(X).labels_
    found_by_estimator = len(set(est_labels) - {-1})
    assert_equal(found_by_estimator, n_clusters)
210+
211+
127212
def test_hdbscan_no_clusters():
128213
labels, p, persist, ctree, ltree, mtree = hdbscan(X, min_cluster_size=len(X)+1)
129214
n_clusters_1 = len(set(labels)) - int(-1 in labels)
@@ -212,6 +297,42 @@ def test_tree_networkx_output_formats():
212297
if_networkx(clusterer.single_linkage_tree_.to_networkx)()
213298
if_networkx(clusterer.minimum_spanning_tree_.to_networkx)()
214299

300+
def test_hdbscan_outliers():
    """A fitted HDBSCAN estimator must expose non-None outlier scores."""
    fitted = HDBSCAN(gen_min_span_tree=True).fit(X)
    outlier_values = fitted.outlier_scores_
    assert outlier_values is not None
304+
305+
def test_hdbscan_unavailable_attributes():
    """Derived attributes of an unfitted estimator must warn and be None.

    Each of the checked properties is read on an HDBSCAN instance that has
    not been fitted; every access must emit at least one warning and
    evaluate to ``None``.
    """
    clusterer = HDBSCAN(gen_min_span_tree=False)
    derived_attributes = (
        'condensed_tree_',
        'single_linkage_tree_',
        'outlier_scores_',
        'minimum_spanning_tree_',
    )
    for attribute in derived_attributes:
        with warnings.catch_warnings(record=True) as caught:
            # Without this filter a warning that was already raised from the
            # same source location (e.g. by another test) may be suppressed
            # and never recorded, making the check below fail spuriously.
            warnings.simplefilter('always')
            value = getattr(clusterer, attribute)
        assert len(caught) > 0, '%s did not emit a warning' % attribute
        assert value is None, '%s should be None before fit' % attribute
323+
324+
def test_hdbscan_min_span_tree_availability():
    """The minimum spanning tree must be None when it was not generated.

    Checked for an estimator fitted on raw data with default settings, and
    for an estimator fitted on a normalized precomputed distance matrix.
    """
    clusterer = HDBSCAN().fit(X)
    tree = clusterer.minimum_spanning_tree_
    assert tree is None

    # Pairwise distances, scaled so the largest distance is 1.
    D = distance.squareform(distance.pdist(X))
    D /= np.max(D)
    # Rebind the estimator: previously the precomputed fit was discarded and
    # the assertion below re-checked the first clusterer a second time.
    clusterer = HDBSCAN(metric='precomputed').fit(D)
    tree = clusterer.minimum_spanning_tree_
    assert tree is None
333+
334+
335+
215336
def test_hdbscan_badargs():
216337
assert_raises(ValueError,
217338
hdbscan,
@@ -246,6 +367,18 @@ def test_hdbscan_badargs():
246367
assert_raises(ValueError,
247368
hdbscan,
248369
X, metric='minkowski', p=-1, algorithm='boruvka_balltree')
370+
assert_raises(ValueError,
371+
hdbscan,
372+
X, metric='precomputed', algorithm='boruvka_kdtree')
373+
assert_raises(ValueError,
374+
hdbscan,
375+
X, metric='precomputed', algorithm='prims_kdtree')
376+
assert_raises(ValueError,
377+
hdbscan,
378+
X, metric='precomputed', algorithm='prims_balltree')
379+
assert_raises(ValueError,
380+
hdbscan,
381+
X, metric='precomputed',algorithm='boruvka_balltree')
249382
assert_raises(ValueError,
250383
hdbscan,
251384
X, alpha=-1)
@@ -258,6 +391,9 @@ def test_hdbscan_badargs():
258391
assert_raises(TypeError,
259392
hdbscan,
260393
X, metric='minkowski', p=None)
394+
assert_raises(ValueError,
395+
hdbscan,
396+
X, leaf_size=0)
261397

262398
def test_hdbscan_sparse():
263399

hdbscan/tests/test_rsl.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,65 @@ def test_rsl_boruvka_balltree():
9696
# n_clusters_2 = len(set(labels)) - int(-1 in labels)
9797
# assert_equal(n_clusters_2, n_clusters)
9898

99+
def test_rsl_badargs():
    """robust_single_linkage must reject invalid arguments.

    Every entry below is ``(expected exception, positional args, keyword
    args)``; the entries are checked in order and each call must raise the
    listed exception type.
    """
    invalid_calls = [
        # Unusable data argument.
        (ValueError, ('fail', 0.4), {}),
        (ValueError, (None, 0.4), {}),
        # Bad k.
        (ValueError, (X, 0.4), dict(k='fail')),
        (ValueError, (X, 0.4), dict(k=-1)),
        # Bad metric.
        (ValueError, (X, 0.4), dict(metric='imperial')),
        (ValueError, (X, 0.4), dict(metric=None)),
        # Negative Minkowski p, with default and explicit algorithms.
        (ValueError, (X, 0.4), dict(metric='minkowski', p=-1)),
        (ValueError, (X, 0.4),
         dict(metric='minkowski', p=-1, algorithm='prims_kdtree')),
        (ValueError, (X, 0.4),
         dict(metric='minkowski', p=-1, algorithm='prims_balltree')),
        (ValueError, (X, 0.4),
         dict(metric='minkowski', p=-1, algorithm='boruvka_balltree')),
        # Precomputed metric combined with the tree-based algorithms.
        (ValueError, (X, 0.4),
         dict(metric='precomputed', algorithm='boruvka_kdtree')),
        (ValueError, (X, 0.4),
         dict(metric='precomputed', algorithm='prims_kdtree')),
        (ValueError, (X, 0.4),
         dict(metric='precomputed', algorithm='prims_balltree')),
        (ValueError, (X, 0.4),
         dict(metric='precomputed', algorithm='boruvka_balltree')),
        # Bad alpha.
        (ValueError, (X, 0.4), dict(alpha=-1)),
        (ValueError, (X, 0.4), dict(alpha='fail')),
        # Unknown algorithm name; any exception type is accepted here.
        (Exception, (X, 0.4), dict(algorithm='something_else')),
        # Minkowski metric without a p value.
        (TypeError, (X, 0.4), dict(metric='minkowski', p=None)),
        # Leaf size below one.
        (ValueError, (X, 0.4), dict(leaf_size=0)),
    ]
    for expected_error, call_args, call_kwargs in invalid_calls:
        assert_raises(expected_error, robust_single_linkage,
                      *call_args, **call_kwargs)
157+
99158
def test_rsl_is_sklearn_estimator():
100159

101160
check_estimator(RobustSingleLinkage)

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
cython>=0.22
22
numpy>=1.9
33
scipy >= 0.9
4-
scikit-learn>=0.16
4+
scikit-learn>=0.17
55

0 commit comments

Comments
 (0)