Skip to content

Commit fdf2331

Browse files
committed
Fix for issue #74: supply n_jobs (core_dist_n_jobs) as a parameter for robust single linkage.
1 parent b53d164 commit fdf2331

File tree

2 files changed

+42
-11
lines changed

2 files changed

+42
-11
lines changed

hdbscan/hdbscan_.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -613,7 +613,8 @@ class HDBSCAN (BaseEstimator, ClusterMixin):
613613
614614
core_dist_n_jobs : int, optional
615615
Number of parallel jobs to run in core distance computations (if
616-
supported by the specific algorithm).
616+
supported by the specific algorithm). For ``core_dist_n_jobs``
617+
below -1, (n_cpus + 1 + core_dist_n_jobs) are used.
617618
(default 4)
618619
619620
allow_single_cluster : boolean

hdbscan/robust_single_linkage_.py

Lines changed: 40 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,11 @@ def _rsl_prims_balltree(X, k=5, alpha=1.4142135623730951, metric='euclidean', **
8787

8888

8989
def _rsl_boruvka_kdtree(X, k=5, alpha=1.0,
90-
metric='euclidean', leaf_size=40, **kwargs):
90+
metric='euclidean', leaf_size=40,
91+
core_dist_n_jobs=4, **kwargs):
92+
93+
if core_dist_n_jobs < 1:
94+
core_dist_n_jobs = max(cpu_count() + 1 + core_dist_n_jobs, 1)
9195

9296
dim = X.shape[0]
9397
min_samples = min(dim - 1, k)
@@ -104,9 +108,13 @@ def _rsl_boruvka_kdtree(X, k=5, alpha=1.0,
104108

105109

106110
def _rsl_boruvka_balltree(X, k=5, alpha=1.0,
107-
metric='euclidean', leaf_size=40, **kwargs):
111+
metric='euclidean', leaf_size=40,
112+
core_dist_n_jobs=4, **kwargs):
108113

109-
dim = X.shape[0]
114+
if core_dist_n_jobs < 1:
115+
core_dist_n_jobs = max(cpu_count() + 1 + core_dist_n_jobs, 1)
116+
117+
dim = X.shape[0]
110118
min_samples = min(dim - 1, k)
111119

112120
tree = BallTree(X, metric=metric, leaf_size=leaf_size, **kwargs)
@@ -122,7 +130,8 @@ def _rsl_boruvka_balltree(X, k=5, alpha=1.0,
122130

123131
def robust_single_linkage(X, cut, k=5, alpha=1.4142135623730951,
124132
gamma=5, metric='euclidean', algorithm='best',
125-
memory=Memory(cachedir=None, verbose=0), leaf_size=40, **kwargs):
133+
memory=Memory(cachedir=None, verbose=0), leaf_size=40,
134+
core_dist_n_jobs=4, **kwargs):
126135
"""Perform robust single linkage clustering from a vector array
127136
or distance matrix.
128137
@@ -180,6 +189,12 @@ def robust_single_linkage(X, cut, k=5, alpha=1.4142135623730951,
180189
Leaf size for trees responsible for fast nearest
181190
neighbour queries. (default 40)
182191
192+
core_dist_n_jobs : int, optional
193+
Number of parallel jobs to run in core distance computations (if
194+
supported by the specific algorithm). For ``core_dist_n_jobs``
195+
below 1, max(n_cpus + 1 + core_dist_n_jobs, 1) jobs are used.
196+
(default 4)
197+
183198
Returns
184199
-------
185200
labels : array [n_samples]
@@ -232,10 +247,12 @@ def robust_single_linkage(X, cut, k=5, alpha=1.4142135623730951,
232247
memory.cache(_rsl_prims_balltree)(X, k, alpha, metric, **kwargs)
233248
elif algorithm == 'boruvka_kdtree':
234249
single_linkage_tree = \
235-
memory.cache(_rsl_boruvka_kdtree)(X, k, alpha, metric, leaf_size, **kwargs)
250+
memory.cache(_rsl_boruvka_kdtree)(X, k, alpha, metric, leaf_size,
251+
core_dist_n_jobs, **kwargs)
236252
elif algorithm == 'boruvka_balltree':
237253
single_linkage_tree = \
238-
memory.cache(_rsl_boruvka_balltree)(X, k, alpha, metric, leaf_size, **kwargs)
254+
memory.cache(_rsl_boruvka_balltree)(X, k, alpha, metric, leaf_size,
255+
core_dist_n_jobs, **kwargs)
239256
else:
240257
raise TypeError('Unknown algorithm type %s specified' % algorithm)
241258
else:
@@ -249,21 +266,26 @@ def robust_single_linkage(X, cut, k=5, alpha=1.4142135623730951,
249266
memory.cache(_rsl_prims_kdtree)(X, k, alpha, metric, **kwargs)
250267
else:
251268
single_linkage_tree = \
252-
memory.cache(_rsl_boruvka_kdtree)(X, k, alpha, metric, **kwargs)
269+
memory.cache(_rsl_boruvka_kdtree)(X, k, alpha, metric,
270+
leaf_size,
271+
core_dist_n_jobs,
272+
**kwargs)
253273
else: # Metric is a valid BallTree metric
254274
# Need heuristic to decide when to go to boruvka; still debugging for now
255275
if X.shape[1] > 128:
256276
single_linkage_tree = \
257277
memory.cache(_rsl_prims_kdtree)(X, k, alpha, metric, **kwargs)
258278
else:
259279
single_linkage_tree = \
260-
memory.cache(_rsl_boruvka_balltree)(X, k, alpha, metric, **kwargs)
280+
memory.cache(_rsl_boruvka_balltree)(X, k, alpha, metric,
281+
leaf_size,
282+
core_dist_n_jobs,
283+
**kwargs)
261284

262285
labels = single_linkage_tree.get_clusters(cut, gamma)
263286

264287
return labels, single_linkage_tree
265288

266-
267289
class RobustSingleLinkage(BaseEstimator, ClusterMixin):
268290
"""Perform robust single linkage clustering from a vector array
269291
or distance matrix.
@@ -317,6 +339,13 @@ class RobustSingleLinkage(BaseEstimator, ClusterMixin):
317339
* ``large_kdtree``
318340
* ``large_kdtree_fastcluster``
319341
342+
343+
core_dist_n_jobs : int, optional
344+
Number of parallel jobs to run in core distance computations (if
345+
supported by the specific algorithm). For ``core_dist_n_jobs``
346+
below -1, (n_cpus + 1 + core_dist_n_jobs) are used.
347+
(default 4)
348+
320349
Attributes
321350
----------
322351
labels_ : array [n_samples]
@@ -339,14 +368,15 @@ class RobustSingleLinkage(BaseEstimator, ClusterMixin):
339368
"""
340369

341370
def __init__(self, cut=0.4, k=5, alpha=1.4142135623730951, gamma=5, metric='euclidean',
342-
algorithm='best', **kwargs):
371+
algorithm='best', core_dist_n_jobs=4, **kwargs):
343372

344373
self.cut = cut
345374
self.k = k
346375
self.alpha = alpha
347376
self.gamma = gamma
348377
self.metric = metric
349378
self.algorithm = algorithm
379+
self.core_dist_n_jobs = core_dist_n_jobs
350380

351381
self._metric_kwargs = kwargs
352382

0 commit comments

Comments
 (0)