@@ -89,8 +89,10 @@ def _hdbscan_generic(X, min_samples=5, alpha=1.0,
8989 else :
9090 result_min_span_tree = None
9191
92+ #Sort edges of the min_spanning_tree by weight
9293 min_spanning_tree = min_spanning_tree [np .argsort (min_spanning_tree .T [2 ]), :]
9394
95+ #Convert edge list into standard hierarchical clustering format
9496 single_linkage_tree = label (min_spanning_tree )
9597
9698 return single_linkage_tree , result_min_span_tree
@@ -106,20 +108,25 @@ def _hdbscan_prims_kdtree(X, min_samples=5, alpha=1.0,
106108 elif p is None :
107109 p = 2 # Unused, but needs to be integer; assume euclidean
108110
109- dim = X .shape [0 ]
110- min_samples = min (dim - 1 , min_samples )
111+ size = X .shape [0 ]
112+ min_samples = min (size - 1 , min_samples )
111113
112114 tree = KDTree (X , metric = metric , leaf_size = leaf_size )
113115
116+ #TODO: Deal with p for minkowski appropriately
114117 dist_metric = DistanceMetric .get_metric (metric )
115118
119+ #Get distance to kth nearest neighbour
116120 core_distances = tree .query (X , k = min_samples ,
117121 dualtree = True ,
118122 breadth_first = True )[0 ][:, - 1 ]
123+ #Mutual reachability distance is implicit in mst_linkage_core_cdist
119124 min_spanning_tree = mst_linkage_core_cdist (X , core_distances , dist_metric , alpha )
120125
126+ #Sort edges of the min_spanning_tree by weight
121127 min_spanning_tree = min_spanning_tree [np .argsort (min_spanning_tree .T [2 ]), :]
122128
129+ #Convert edge list into standard hierarchical clustering format
123130 single_linkage_tree = label (min_spanning_tree )
124131
125132 return single_linkage_tree , None
@@ -135,19 +142,23 @@ def _hdbscan_prims_balltree(X, min_samples=5, alpha=1.0,
135142 elif p is None :
136143 p = 2 # Unused, but needs to be integer; assume euclidean
137144
138- dim = X .shape [0 ]
139- min_samples = min (dim - 1 , min_samples )
145+ size = X .shape [0 ]
146+ min_samples = min (size - 1 , min_samples )
140147
141148 tree = BallTree (X , metric = metric , leaf_size = leaf_size )
142149
143150 dist_metric = DistanceMetric .get_metric (metric )
144151
152+ #Get distance to kth nearest neighbour
145153 core_distances = tree .query (X , k = min_samples ,
146154 dualtree = True ,
147155 breadth_first = True )[0 ][:, - 1 ]
156+
157+ #Mutual reachability distance is implicit in mst_linkage_core_cdist
148158 min_spanning_tree = mst_linkage_core_cdist (X , core_distances , dist_metric , alpha )
159+ #Sort edges of the min_spanning_tree by weight
149160 min_spanning_tree = min_spanning_tree [np .argsort (min_spanning_tree .T [2 ]), :]
150-
161+ #Convert edge list into standard hierarchical clustering format
151162 single_linkage_tree = label (min_spanning_tree )
152163
153164 return single_linkage_tree , None
@@ -163,15 +174,19 @@ def _hdbscan_boruvka_kdtree(X, min_samples=5, alpha=1.0,
163174 if p < 0 :
164175 raise ValueError ('Minkowski metric with negative p value is not defined!' )
165176
166- dim = X .shape [0 ]
167- min_samples = min (dim - 1 , min_samples )
177+ if leaf_size < 3 :
178+ leaf_size = 3
179+
180+ size = X .shape [0 ]
181+ min_samples = min (size - 1 , min_samples )
168182
169183 tree = KDTree (X , metric = metric , leaf_size = leaf_size )
170184 alg = KDTreeBoruvkaAlgorithm (tree , min_samples , metric = metric , leaf_size = leaf_size // 3 ,
171185 approx_min_span_tree = approx_min_span_tree )
172186 min_spanning_tree = alg .spanning_tree ()
187+ #Sort edges of the min_spanning_tree by weight
173188 min_spanning_tree = min_spanning_tree [np .argsort (min_spanning_tree .T [2 ]), :]
174-
189+ #Convert edge list into standard hierarchical clustering format
175190 single_linkage_tree = label (min_spanning_tree )
176191
177192 if gen_min_span_tree :
@@ -190,15 +205,19 @@ def _hdbscan_boruvka_balltree(X, min_samples=5, alpha=1.0,
190205 if p < 0 :
191206 raise ValueError ('Minkowski metric with negative p value is not defined!' )
192207
193- dim = X .shape [0 ]
194- min_samples = min (dim - 1 , min_samples )
208+ if leaf_size < 3 :
209+ leaf_size = 3
210+
211+ size = X .shape [0 ]
212+ min_samples = min (size - 1 , min_samples )
195213
196214 tree = BallTree (X , metric = metric , leaf_size = leaf_size )
197215 alg = BallTreeBoruvkaAlgorithm (tree , min_samples , metric = metric , leaf_size = leaf_size // 3 ,
198216 approx_min_span_tree = approx_min_span_tree )
199217 min_spanning_tree = alg .spanning_tree ()
218+ #Sort edges of the min_spanning_tree by weight
200219 min_spanning_tree = min_spanning_tree [np .argsort (min_spanning_tree .T [2 ]), :]
201-
220+ #Convert edge list into standard hierarchical clustering format
202221 single_linkage_tree = label (min_spanning_tree )
203222
204223 if gen_min_span_tree :
@@ -230,6 +249,11 @@ def hdbscan(X, min_cluster_size=5, min_samples=None, alpha=1.0,
230249 to be considered as a core point. This includes the point itself.
231250 defaults to the min_cluster_size.
232251
252+ alpha : float, optional
253+ A distance scaling parameter as used in robust single linkage.
254+ See (K. Chaudhuri and S. Dasgupta "Rates of convergence
255+ for the cluster tree."). (default 1.0)
256+
233257 metric : string, or callable, optional
234258 The metric to use when calculating distance between instances in a
235259 feature array. If metric is a string or callable, it must be one of
@@ -242,10 +266,9 @@ def hdbscan(X, min_cluster_size=5, min_samples=None, alpha=1.0,
242266 p : int, optional
243267 p value to use if using the minkowski metric. (default 2)
244268
245- alpha : float, optional
246- A distance scaling parameter as used in robust single linkage.
247- See (K. Chaudhuri and S. Dasgupta "Rates of convergence
248- for the cluster tree."). (default 1.0)
269+ leaf_size : int, optional
270+ Leaf size for trees responsible for fast nearest
271+ neighbour queries. (default 40)
249272
250273 algorithm : string, optional
251274 Exactly which algorithm to use; hdbscan has variants specialised
@@ -314,28 +337,30 @@ def hdbscan(X, min_cluster_size=5, min_samples=None, alpha=1.0,
314337 if min_samples <= 0 or min_cluster_size <= 0 :
315338 raise ValueError ('Min samples and Min cluster size must be positive integers' )
316339
340+ #Checks input and converts to an nd-array where possible
317341 X = check_array (X , accept_sparse = 'csr' )
342+ #Python 2 and 3 compliant string_type checking
318343 if isinstance (memory , six .string_types ):
319344 memory = Memory (cachedir = memory , verbose = 0 )
320345
321346 if algorithm != 'best' :
322347 if algorithm == 'generic' :
323- (single_linkage_tree ,
348+ (single_linkage_tree ,
324349 result_min_span_tree ) = \
325350 memory .cache (_hdbscan_generic )(X , min_samples , alpha , metric ,
326351 p , leaf_size , gen_min_span_tree )
327352 elif algorithm == 'prims_kdtree' :
328353 if metric not in KDTree .valid_metrics :
329354 raise ValueError ("Cannot use Prim's with KDTree for this metric!" )
330- (single_linkage_tree ,
355+ (single_linkage_tree ,
331356 result_min_span_tree ) = \
332357 memory .cache (_hdbscan_prims_kdtree )(X , min_samples , alpha ,
333358 metric , p , leaf_size ,
334359 gen_min_span_tree )
335360 elif algorithm == 'prims_balltree' :
336361 if metric not in BallTree .valid_metrics :
337362 raise ValueError ("Cannot use Prim's with BallTree for this metric!" )
338- (single_linkage_tree ,
363+ (single_linkage_tree ,
339364 result_min_span_tree ) = \
340365 memory .cache (_hdbscan_prims_balltree )(X , min_samples , alpha ,
341366 metric , p , leaf_size ,
@@ -369,7 +394,7 @@ def hdbscan(X, min_cluster_size=5, min_samples=None, alpha=1.0,
369394 alpha , metric , p , leaf_size ,
370395 gen_min_span_tree )
371396 elif metric in KDTree .valid_metrics :
372- # Need heuristic to decide when to go to boruvka; still debugging for now
397+ #TODO: Need heuristic to decide when to go to boruvka; still debugging for now
373398 if X .shape [1 ] > 60 :
374399 (single_linkage_tree ,
375400 result_min_span_tree ) = \
@@ -384,7 +409,7 @@ def hdbscan(X, min_cluster_size=5, min_samples=None, alpha=1.0,
384409 approx_min_span_tree ,
385410 gen_min_span_tree )
386411 else : # Metric is a valid BallTree metric
387- # Need heuristic to decide when to go to boruvka; still debugging for now
412+ #TODO: Need heuristic to decide when to go to boruvka; still debugging for now
388413 if X .shape [1 ] > 60 :
389414 (single_linkage_tree ,
390415 result_min_span_tree ) = \
@@ -401,7 +426,7 @@ def hdbscan(X, min_cluster_size=5, min_samples=None, alpha=1.0,
401426
402427 return _tree_to_labels (X , single_linkage_tree , min_cluster_size ) + (result_min_span_tree ,)
403428
404-
429+ #Inherits from sklearn
405430class HDBSCAN (BaseEstimator , ClusterMixin ):
406431 """Perform HDBSCAN clustering from vector array or distance matrix.
407432
0 commit comments