Skip to content

Commit dbfcdce

Browse files
committed
Default approx_min_span_tree to on for speed reasons. This needs to be debugged in the long term.
1 parent 56afef2 commit dbfcdce

File tree

1 file changed

+15
-13
lines changed

1 file changed

+15
-13
lines changed

hdbscan/hdbscan_.py

Lines changed: 15 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,7 @@ def _hdbscan_prims_balltree(X, min_samples=5, alpha=1.0,
149149

150150
def _hdbscan_boruvka_kdtree(X, min_samples=5, alpha=1.0,
151151
metric='minkowski', p=2, leaf_size=40,
152-
approx_min_span_tree=False,
152+
approx_min_span_tree=True,
153153
gen_min_span_tree=False):
154154
if metric == 'minkowski':
155155
if p is None:
@@ -176,7 +176,7 @@ def _hdbscan_boruvka_kdtree(X, min_samples=5, alpha=1.0,
176176

177177
def _hdbscan_boruvka_balltree(X, min_samples=5, alpha=1.0,
178178
metric='minkowski', p=2, leaf_size=40,
179-
approx_min_span_tree=False,
179+
approx_min_span_tree=True,
180180
gen_min_span_tree=False):
181181
if metric == 'minkowski':
182182
if p is None:
@@ -204,7 +204,7 @@ def _hdbscan_boruvka_balltree(X, min_samples=5, alpha=1.0,
204204
def hdbscan(X, min_cluster_size=5, min_samples=None, alpha=1.0,
205205
metric='minkowski', p=2, leaf_size=40,
206206
algorithm='best', memory=Memory(cachedir=None, verbose=0),
207-
approx_min_span_tree=False, gen_min_span_tree=False):
207+
approx_min_span_tree=True, gen_min_span_tree=False):
208208
"""Perform HDBSCAN clustering from a vector array or distance matrix.
209209
210210
Parameters
@@ -259,12 +259,13 @@ def hdbscan(X, min_cluster_size=5, min_samples=None, alpha=1.0,
259259
By default, no caching is done. If a string is given, it is the
260260
path to the caching directory.
261261
262-
approx_min_span_tree : Bool (optional)
262+
approx_min_span_tree : bool, optional
263263
Whether to accept an only approximate minimum spanning tree.
264264
For some algorithms this can provide a significant speedup, but
265-
the resulting clustering is of lower quality, and may have issues.
266-
If you are willing to sacrifice correctness for speed you may want
267-
to explore this; in general this should be left at the default False.
265+
the resulting clustering may be of marginally lower quality.
266+
If you are willing to sacrifice speed for correctness you may want
267+
to set this to False; in general this should be left at the default True.
268+
(default True)
268269
269270
gen_min_span_tree : bool, optional
270271
Whether to generate the minimum spanning tree for later analysis.
@@ -276,7 +277,8 @@ def hdbscan(X, min_cluster_size=5, min_samples=None, alpha=1.0,
276277
Cluster labels for each point. Noisy samples are given the label -1.
277278
278279
probabilities : array [n_samples]
279-
Cluster membership strengths for each point. Noisy samples are assigned 0.
280+
Cluster membership strengths for each point. Noisy samples are assigned
281+
0.
280282
281283
condensed_tree : record array
282284
The condensed cluster hierarchy used to generate clusters.
@@ -459,10 +461,10 @@ class HDBSCAN(BaseEstimator, ClusterMixin):
459461
approx_min_span_tree : bool, optional
460462
Whether to accept an only approximate minimum spanning tree.
461463
For some algorithms this can provide a significant speedup, but
462-
the resulting clustering is of lower quality, and may have issues.
463-
If you are willing to sacrifice correctness for speed you may want
464-
to explore this; in general this should be left at the default False.
465-
(default False)
464+
the resulting clustering may be of marginally lower quality.
465+
If you are willing to sacrifice speed for correctness you may want
466+
to set this to False; in general this should be left at the default True.
467+
(default True)
466468
467469
gen_min_span_tree : bool, optional
468470
Whether to generate the minimum spanning tree with regard
@@ -516,7 +518,7 @@ def __init__(self, min_cluster_size=5, min_samples=None,
516518
metric='euclidean', alpha=1.0, p=None,
517519
algorithm='best', leaf_size=40,
518520
memory=Memory(cachedir=None, verbose=0),
519-
approx_min_span_tree=False,
521+
approx_min_span_tree=True,
520522
gen_min_span_tree=False):
521523
self.min_cluster_size = min_cluster_size
522524
self.min_samples = min_samples

0 commit comments

Comments
 (0)