Skip to content

Commit e46f033

Browse files
committed
minor cleanup and documentation
1 parent 178802b commit e46f033

File tree

1 file changed

+46
-21
lines changed

1 file changed

+46
-21
lines changed

hdbscan/hdbscan_.py

Lines changed: 46 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -89,8 +89,10 @@ def _hdbscan_generic(X, min_samples=5, alpha=1.0,
8989
else:
9090
result_min_span_tree = None
9191

92+
#Sort edges of the min_spanning_tree by weight
9293
min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :]
9394

95+
#Convert edge list into standard hierarchical clustering format
9496
single_linkage_tree = label(min_spanning_tree)
9597

9698
return single_linkage_tree, result_min_span_tree
@@ -106,20 +108,25 @@ def _hdbscan_prims_kdtree(X, min_samples=5, alpha=1.0,
106108
elif p is None:
107109
p = 2 # Unused, but needs to be integer; assume euclidean
108110

109-
dim = X.shape[0]
110-
min_samples = min(dim - 1, min_samples)
111+
size = X.shape[0]
112+
min_samples = min(size - 1, min_samples)
111113

112114
tree = KDTree(X, metric=metric, leaf_size=leaf_size)
113115

116+
#TODO: Deal with p for minkowski appropriately
114117
dist_metric = DistanceMetric.get_metric(metric)
115118

119+
#Get distance to kth nearest neighbour
116120
core_distances = tree.query(X, k=min_samples,
117121
dualtree=True,
118122
breadth_first=True)[0][:, -1]
123+
#Mutual reachability distance is implicit in mst_linkage_core_cdist
119124
min_spanning_tree = mst_linkage_core_cdist(X, core_distances, dist_metric, alpha)
120125

126+
#Sort edges of the min_spanning_tree by weight
121127
min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :]
122128

129+
#Convert edge list into standard hierarchical clustering format
123130
single_linkage_tree = label(min_spanning_tree)
124131

125132
return single_linkage_tree, None
@@ -135,19 +142,23 @@ def _hdbscan_prims_balltree(X, min_samples=5, alpha=1.0,
135142
elif p is None:
136143
p = 2 # Unused, but needs to be integer; assume euclidean
137144

138-
dim = X.shape[0]
139-
min_samples = min(dim - 1, min_samples)
145+
size = X.shape[0]
146+
min_samples = min(size - 1, min_samples)
140147

141148
tree = BallTree(X, metric=metric, leaf_size=leaf_size)
142149

143150
dist_metric = DistanceMetric.get_metric(metric)
144151

152+
#Get distance to kth nearest neighbour
145153
core_distances = tree.query(X, k=min_samples,
146154
dualtree=True,
147155
breadth_first=True)[0][:, -1]
156+
157+
#Mutual reachability distance is implicit in mst_linkage_core_cdist
148158
min_spanning_tree = mst_linkage_core_cdist(X, core_distances, dist_metric, alpha)
159+
#Sort edges of the min_spanning_tree by weight
149160
min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :]
150-
161+
#Convert edge list into standard hierarchical clustering format
151162
single_linkage_tree = label(min_spanning_tree)
152163

153164
return single_linkage_tree, None
@@ -163,15 +174,19 @@ def _hdbscan_boruvka_kdtree(X, min_samples=5, alpha=1.0,
163174
if p < 0:
164175
raise ValueError('Minkowski metric with negative p value is not defined!')
165176

166-
dim = X.shape[0]
167-
min_samples = min(dim - 1, min_samples)
177+
if leaf_size < 3:
178+
leaf_size = 3
179+
180+
size = X.shape[0]
181+
min_samples = min(size - 1, min_samples)
168182

169183
tree = KDTree(X, metric=metric, leaf_size=leaf_size)
170184
alg = KDTreeBoruvkaAlgorithm(tree, min_samples, metric=metric, leaf_size=leaf_size // 3,
171185
approx_min_span_tree=approx_min_span_tree)
172186
min_spanning_tree = alg.spanning_tree()
187+
#Sort edges of the min_spanning_tree by weight
173188
min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :]
174-
189+
#Convert edge list into standard hierarchical clustering format
175190
single_linkage_tree = label(min_spanning_tree)
176191

177192
if gen_min_span_tree:
@@ -190,15 +205,19 @@ def _hdbscan_boruvka_balltree(X, min_samples=5, alpha=1.0,
190205
if p < 0:
191206
raise ValueError('Minkowski metric with negative p value is not defined!')
192207

193-
dim = X.shape[0]
194-
min_samples = min(dim - 1, min_samples)
208+
if leaf_size < 3:
209+
leaf_size = 3
210+
211+
size = X.shape[0]
212+
min_samples = min(size - 1, min_samples)
195213

196214
tree = BallTree(X, metric=metric, leaf_size=leaf_size)
197215
alg = BallTreeBoruvkaAlgorithm(tree, min_samples, metric=metric, leaf_size=leaf_size // 3,
198216
approx_min_span_tree=approx_min_span_tree)
199217
min_spanning_tree = alg.spanning_tree()
218+
#Sort edges of the min_spanning_tree by weight
200219
min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :]
201-
220+
#Convert edge list into standard hierarchical clustering format
202221
single_linkage_tree = label(min_spanning_tree)
203222

204223
if gen_min_span_tree:
@@ -230,6 +249,11 @@ def hdbscan(X, min_cluster_size=5, min_samples=None, alpha=1.0,
230249
to be considered as a core point. This includes the point itself.
231250
defaults to the min_cluster_size.
232251
252+
alpha : float, optional
253+
A distance scaling parameter as used in robust single linkage.
254+
See (K. Chaudhuri and S. Dasgupta "Rates of convergence
255+
for the cluster tree."). (default 1.0)
256+
233257
metric : string, or callable, optional
234258
The metric to use when calculating distance between instances in a
235259
feature array. If metric is a string or callable, it must be one of
@@ -242,10 +266,9 @@ def hdbscan(X, min_cluster_size=5, min_samples=None, alpha=1.0,
242266
p : int, optional
243267
p value to use if using the minkowski metric. (default 2)
244268
245-
alpha : float, optional
246-
A distance scaling parameter as used in robust single linkage.
247-
See (K. Chaudhuri and S. Dasgupta "Rates of convergence
248-
for the cluster tree."). (default 1.0)
269+
leaf_size : int, optional
270+
Leaf size for trees responsible for fast nearest
271+
neighbour queries. (default 40)
249272
250273
algorithm : string, optional
251274
Exactly which algorithm to use; hdbscan has variants specialised
@@ -314,28 +337,30 @@ def hdbscan(X, min_cluster_size=5, min_samples=None, alpha=1.0,
314337
if min_samples <= 0 or min_cluster_size <= 0:
315338
raise ValueError('Min samples and Min cluster size must be positive integers')
316339

340+
#Checks input and converts to an nd-array where possible
317341
X = check_array(X, accept_sparse='csr')
342+
#Python 2 and 3 compliant string_type checking
318343
if isinstance(memory, six.string_types):
319344
memory = Memory(cachedir=memory, verbose=0)
320345

321346
if algorithm != 'best':
322347
if algorithm == 'generic':
323-
(single_linkage_tree,
348+
(single_linkage_tree,
324349
result_min_span_tree) = \
325350
memory.cache(_hdbscan_generic)(X, min_samples, alpha, metric,
326351
p, leaf_size, gen_min_span_tree)
327352
elif algorithm == 'prims_kdtree':
328353
if metric not in KDTree.valid_metrics:
329354
raise ValueError("Cannot use Prim's with KDTree for this metric!")
330-
(single_linkage_tree,
355+
(single_linkage_tree,
331356
result_min_span_tree) = \
332357
memory.cache(_hdbscan_prims_kdtree)(X, min_samples, alpha,
333358
metric, p, leaf_size,
334359
gen_min_span_tree)
335360
elif algorithm == 'prims_balltree':
336361
if metric not in BallTree.valid_metrics:
337362
raise ValueError("Cannot use Prim's with BallTree for this metric!")
338-
(single_linkage_tree,
363+
(single_linkage_tree,
339364
result_min_span_tree) = \
340365
memory.cache(_hdbscan_prims_balltree)(X, min_samples, alpha,
341366
metric, p, leaf_size,
@@ -369,7 +394,7 @@ def hdbscan(X, min_cluster_size=5, min_samples=None, alpha=1.0,
369394
alpha, metric, p, leaf_size,
370395
gen_min_span_tree)
371396
elif metric in KDTree.valid_metrics:
372-
# Need heuristic to decide when to go to boruvka; still debugging for now
397+
#TODO: Need a heuristic to decide when to go to boruvka; still debugging for now
373398
if X.shape[1] > 60:
374399
(single_linkage_tree,
375400
result_min_span_tree) = \
@@ -384,7 +409,7 @@ def hdbscan(X, min_cluster_size=5, min_samples=None, alpha=1.0,
384409
approx_min_span_tree,
385410
gen_min_span_tree)
386411
else: # Metric is a valid BallTree metric
387-
# Need heuristic to decide when to go to boruvka; still debugging for now
412+
#TODO: Need a heuristic to decide when to go to boruvka; still debugging for now
388413
if X.shape[1] > 60:
389414
(single_linkage_tree,
390415
result_min_span_tree) = \
@@ -401,7 +426,7 @@ def hdbscan(X, min_cluster_size=5, min_samples=None, alpha=1.0,
401426

402427
return _tree_to_labels(X, single_linkage_tree, min_cluster_size) + (result_min_span_tree,)
403428

404-
429+
#Inherits from sklearn
405430
class HDBSCAN(BaseEstimator, ClusterMixin):
406431
"""Perform HDBSCAN clustering from vector array or distance matrix.
407432

0 commit comments

Comments
 (0)