Skip to content

Commit e53b1da

Browse files
author
cmalzer
committed
add epsilon parameter
1 parent a042ef9 commit e53b1da

File tree

2 files changed

+108
-37
lines changed

2 files changed

+108
-37
lines changed

hdbscan/_hdbscan_tree.pyx

Lines changed: 82 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -44,19 +44,19 @@ cpdef np.ndarray condense_tree(np.ndarray[np.double_t, ndim=2] hierarchy,
4444
np.intp_t min_cluster_size=10):
4545
"""Condense a tree according to a minimum cluster size. This is akin
4646
to the runt pruning procedure of Stuetzle. The result is a much simpler
47-
tree that is easier to visualize. We include extra information on the
47+
tree that is easier to visualize. We include extra information on the
4848
lambda value at which individual points depart clusters for later
4949
analysis and computation.
50-
50+
5151
Parameters
5252
----------
5353
hierarchy : ndarray (n_samples, 4)
5454
A single linkage hierarchy in scipy.cluster.hierarchy format.
55-
55+
5656
min_cluster_size : int, optional (default 10)
57-
The minimum size of clusters to consider. Smaller "runt"
57+
The minimum size of clusters to consider. Smaller "runt"
5858
clusters are pruned from the tree.
59-
59+
6060
Returns
6161
-------
6262
condensed_tree : numpy recarray
@@ -341,22 +341,22 @@ cpdef np.ndarray[np.intp_t, ndim=1] labelling_at_cut(
341341
np.double_t cut,
342342
np.intp_t min_cluster_size):
343343
"""Given a single linkage tree and a cut value, return the
344-
vector of cluster labels at that cut value. This is useful
344+
vector of cluster labels at that cut value. This is useful
345345
for Robust Single Linkage, and extracting DBSCAN results
346346
from a single HDBSCAN run.
347-
347+
348348
Parameters
349349
----------
350350
linkage : ndarray (n_samples, 4)
351351
The single linkage tree in scipy.cluster.hierarchy format.
352-
352+
353353
cut : double
354354
The cut value at which to find clusters.
355-
355+
356356
min_cluster_size : int
357357
The minimum cluster size; clusters below this size at
358358
the cut will be considered noise.
359-
359+
360360
Returns
361361
-------
362362
labels : ndarray (n_samples,)
@@ -519,12 +519,12 @@ cdef get_probabilities(np.ndarray tree, dict cluster_map, np.ndarray labels):
519519

520520
cpdef np.ndarray[np.double_t, ndim=1] outlier_scores(np.ndarray tree):
521521
"""Generate GLOSH outlier scores from a condensed tree.
522-
522+
523523
Parameters
524524
----------
525525
tree : numpy recarray
526526
The condensed tree to generate GLOSH outlier scores from
527-
527+
528528
Returns
529529
-------
530530
outlier_scores : ndarray (n_samples,)
@@ -609,43 +609,80 @@ cpdef list get_cluster_tree_leaves(np.ndarray cluster_tree):
609609
root = cluster_tree['parent'].min()
610610
return recurse_leaf_dfs(cluster_tree, root)
611611

612+
cpdef np.intp_t traverse_upwards(np.ndarray cluster_tree, np.double_t cut_value, np.intp_t leaf):
    """Walk upward from ``leaf`` toward the root of ``cluster_tree`` and
    return the first ancestor whose epsilon value (1 / lambda_val) is
    strictly greater than ``cut_value``.

    Parameters
    ----------
    cluster_tree : numpy recarray
        Condensed cluster tree with 'parent', 'child' and 'lambda_val'
        fields.

    cut_value : double
        The epsilon threshold governing cluster splits.

    leaf : int
        The cluster node to start the upward traversal from.

    Returns
    -------
    node : int
        The selected ancestor node, or ``leaf`` itself when its parent is
        the root.
    """
    root = cluster_tree['parent'].min()
    # Boolean-mask indexing yields a 1-element array; take the scalar with
    # [0] so the comparisons, the division, and the np.intp_t return value
    # do not rely on implicit (and deprecated) array->scalar coercion.
    parent = cluster_tree[cluster_tree['child'] == leaf]['parent'][0]
    if parent == root:
        return leaf  # return node closest to root

    parent_eps = 1 / cluster_tree[cluster_tree['child'] == parent]['lambda_val'][0]
    if parent_eps > cut_value:
        return parent
    else:
        return traverse_upwards(cluster_tree, cut_value, parent)
624+
625+
cpdef set epsilon_search(set leaves, np.ndarray cluster_tree, np.double_t cut_value):
    """Filter a candidate set of leaf clusters by the epsilon threshold.

    Leaves whose epsilon (1 / lambda_val) is below ``cut_value`` are
    replaced by the ancestor found via ``traverse_upwards``; leaves already
    above the threshold are kept as-is.

    Parameters
    ----------
    leaves : set
        Candidate cluster nodes (e.g. EOM or leaf clusters).

    cluster_tree : numpy recarray
        Condensed cluster tree with 'parent', 'child' and 'lambda_val'
        fields.

    cut_value : double
        The epsilon threshold governing cluster splits.

    Returns
    -------
    selected_clusters : set
        The epsilon-filtered set of cluster nodes.
    """
    selected_clusters = list()
    # Use a set: `leaf not in processed` is tested once per leaf, and the
    # original list made each membership check O(n).
    processed = set()

    for leaf in leaves:
        eps = 1 / cluster_tree['lambda_val'][cluster_tree['child'] == leaf][0]
        if eps < cut_value:
            if leaf not in processed:
                epsilon_child = traverse_upwards(cluster_tree, cut_value, leaf)
                selected_clusters.append(epsilon_child)

                # Mark every descendant of the chosen ancestor as handled so
                # a sub-cluster of an already-selected node is never added.
                for sub_node in bfs_from_cluster_tree(cluster_tree, epsilon_child):
                    if sub_node != epsilon_child:
                        processed.add(sub_node)
        else:
            selected_clusters.append(leaf)

    return set(selected_clusters)
644+
612645
cpdef tuple get_clusters(np.ndarray tree, dict stability,
613646
cluster_selection_method='eom',
614647
allow_single_cluster=False,
615-
match_reference_implementation=False):
616-
"""Given a tree and stability dict, produce the cluster labels
648+
match_reference_implementation=False,
649+
cut_value=None):
650+
"""Given a tree and stability dict, produce the cluster labels
617651
(and probabilities) for a flat clustering based on the chosen
618652
cluster selection method.
619-
653+
620654
Parameters
621655
----------
622656
tree : numpy recarray
623657
The condensed tree to extract flat clusters from
624-
658+
625659
stability : dict
626660
A dictionary mapping cluster_ids to stability values
627-
661+
628662
cluster_selection_method : string, optional (default 'eom')
629663
The method of selecting clusters. The default is the
630664
Excess of Mass algorithm specified by 'eom'. The alternate
631665
option is 'leaf'.
632-
666+
633667
allow_single_cluster : boolean, optional (default False)
634668
Whether to allow a single cluster to be selected by the
635669
Excess of Mass algorithm.
636-
670+
637671
match_reference_implementation : boolean, optional (default False)
638672
Whether to match the reference implementation in how to handle
639673
certain edge cases.
640-
674+
675+
cut_value : float, optional (default None)
676+
A threshold for cluster splits
677+
641678
Returns
642679
-------
643680
labels : ndarray (n_samples,)
644681
An integer array of cluster labels, with -1 denoting noise.
645-
682+
646683
probabilities : ndarray (n_samples,)
647684
The cluster membership strength of each sample.
648-
685+
649686
stabilities : ndarray (n_clusters,)
650687
The cluster coherence strengths of each cluster.
651688
"""
@@ -689,21 +726,40 @@ cpdef tuple get_clusters(np.ndarray tree, dict stability,
689726
for sub_node in bfs_from_cluster_tree(cluster_tree, node):
690727
if sub_node != node:
691728
is_cluster[sub_node] = False
729+
730+
if cut_value is not None and cut_value != 0.0:
731+
732+
eom_clusters = set([c for c in is_cluster if is_cluster[c]])
733+
selected_clusters = epsilon_search(eom_clusters, cluster_tree, cut_value)
734+
for c in is_cluster:
735+
if c in selected_clusters:
736+
is_cluster[c] = True
737+
else:
738+
is_cluster[c] = False
739+
740+
692741
elif cluster_selection_method == 'leaf':
693742
leaves = set(get_cluster_tree_leaves(cluster_tree))
694743
if len(leaves) == 0:
695744
for c in is_cluster:
696745
is_cluster[c] = False
697746
is_cluster[tree['parent'].min()] = True
747+
748+
if cut_value is not None and cut_value != 0.0:
749+
selected_clusters = epsilon_search(leaves, cluster_tree, cut_value)
750+
else:
751+
selected_clusters = leaves
752+
698753
for c in is_cluster:
699-
if c in leaves:
700-
is_cluster[c] = True
701-
else:
702-
is_cluster[c] = False
754+
if c in selected_clusters:
755+
is_cluster[c] = True
756+
else:
757+
is_cluster[c] = False
703758
else:
704759
raise ValueError('Invalid Cluster Selection Method: %s\n'
705760
'Should be one of: "eom", "leaf"\n')
706761

762+
707763
clusters = set([c for c in is_cluster if is_cluster[c]])
708764
cluster_map = {c: n for n, c in enumerate(sorted(list(clusters)))}
709765
reverse_cluster_map = {n: c for c, n in cluster_map.items()}

hdbscan/hdbscan_.py

Lines changed: 26 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,8 @@
4848
def _tree_to_labels(X, single_linkage_tree, min_cluster_size=10,
4949
cluster_selection_method='eom',
5050
allow_single_cluster=False,
51-
match_reference_implementation=False):
51+
match_reference_implementation=False,
52+
epsilon=None):
5253
"""Converts a pretrained tree and cluster size into a
5354
set of labels and probabilities.
5455
"""
@@ -59,7 +60,8 @@ def _tree_to_labels(X, single_linkage_tree, min_cluster_size=10,
5960
stability_dict,
6061
cluster_selection_method,
6162
allow_single_cluster,
62-
match_reference_implementation)
63+
match_reference_implementation,
64+
epsilon)
6365

6466
return (labels, probabilities, stabilities, condensed_tree,
6567
single_linkage_tree)
@@ -210,9 +212,9 @@ def _hdbscan_prims_kdtree(X, min_samples=5, alpha=1.0,
210212

211213
# Convert edge list into standard hierarchical clustering format
212214
single_linkage_tree = label(min_spanning_tree)
213-
215+
214216
if gen_min_span_tree:
215-
warn('Cannot generate Minimum Spanning Tree; '
217+
warn('Cannot generate Minimum Spanning Tree; '
216218
'the implemented Prim\'s does not produce '
217219
'the full minimum spanning tree ', UserWarning)
218220

@@ -246,9 +248,9 @@ def _hdbscan_prims_balltree(X, min_samples=5, alpha=1.0,
246248
:]
247249
# Convert edge list into standard hierarchical clustering format
248250
single_linkage_tree = label(min_spanning_tree)
249-
251+
250252
if gen_min_span_tree:
251-
warn('Cannot generate Minimum Spanning Tree; '
253+
warn('Cannot generate Minimum Spanning Tree; '
252254
'the implemented Prim\'s does not produce '
253255
'the full minimum spanning tree ', UserWarning)
254256

@@ -327,7 +329,7 @@ def check_precomputed_distance_matrix(X):
327329
check_array(tmp)
328330

329331

330-
def hdbscan(X, min_cluster_size=5, min_samples=None, alpha=1.0,
332+
def hdbscan(X, min_cluster_size=5, min_samples=None, alpha=1.0, epsilon=None,
331333
metric='minkowski', p=2, leaf_size=40,
332334
algorithm='best', memory=Memory(cachedir=None, verbose=0),
333335
approx_min_span_tree=True, gen_min_span_tree=False,
@@ -353,6 +355,9 @@ def hdbscan(X, min_cluster_size=5, min_samples=None, alpha=1.0,
353355
to be considered as a core point. This includes the point itself.
354356
defaults to the min_cluster_size.
355357
358+
epsilon : float, optional (default=None)
359+
A threshold for cluster splits.
360+
356361
alpha : float, optional (default=1.0)
357362
A distance scaling parameter as used in robust single linkage.
358363
See [2]_ for more information.
@@ -485,6 +490,15 @@ def hdbscan(X, min_cluster_size=5, min_samples=None, alpha=1.0,
485490
if min_cluster_size == 1:
486491
raise ValueError('Min cluster size must be greater than one')
487492

493+
if epsilon is None:
494+
epsilon = 0.0
495+
496+
if type(epsilon) is int:
497+
epsilon = float(epsilon)
498+
499+
if type(epsilon) is not float or epsilon < 0.0:
500+
raise ValueError('Epsilon must be a float value greater than or equal to 0!')
501+
488502
if not isinstance(alpha, float) or alpha <= 0.0:
489503
raise ValueError('Alpha must be a positive float value greater than'
490504
' 0!')
@@ -528,7 +542,7 @@ def hdbscan(X, min_cluster_size=5, min_samples=None, alpha=1.0,
528542
if algorithm != 'best':
529543
if metric != 'precomputed' and issparse(X) and metric != 'generic':
530544
raise ValueError("Sparse data matrices only support algorithm 'generic'.")
531-
545+
532546
if algorithm == 'generic':
533547
(single_linkage_tree,
534548
result_min_span_tree) = memory.cache(
@@ -616,7 +630,8 @@ def hdbscan(X, min_cluster_size=5, min_samples=None, alpha=1.0,
616630
min_cluster_size,
617631
cluster_selection_method,
618632
allow_single_cluster,
619-
match_reference_implementation) + \
633+
match_reference_implementation,
634+
epsilon) + \
620635
(result_min_span_tree,)
621636

622637

@@ -815,7 +830,7 @@ class HDBSCAN(BaseEstimator, ClusterMixin):
815830
816831
"""
817832

818-
def __init__(self, min_cluster_size=5, min_samples=None,
833+
def __init__(self, min_cluster_size=5, min_samples=None, epsilon=None,
819834
metric='euclidean', alpha=1.0, p=None,
820835
algorithm='best', leaf_size=40,
821836
memory=Memory(cachedir=None, verbose=0),
@@ -829,7 +844,7 @@ def __init__(self, min_cluster_size=5, min_samples=None,
829844
self.min_cluster_size = min_cluster_size
830845
self.min_samples = min_samples
831846
self.alpha = alpha
832-
847+
self.epsilon = epsilon
833848
self.metric = metric
834849
self.p = p
835850
self.algorithm = algorithm

0 commit comments

Comments
 (0)