Merge pull request #329 from cmalzer/master

lmcinnes · web-flow · commit 6c1a6d4a214d · 2019-11-10T16:49:19.000-05:00
add epsilon parameter to merge clusters based on distance
diff --git a/hdbscan/_hdbscan_tree.pyx b/hdbscan/_hdbscan_tree.pyx
@@ -44,19 +44,19 @@ cpdef np.ndarray condense_tree(np.ndarray[np.double_t, ndim=2] hierarchy,
                                np.intp_t min_cluster_size=10):
     """Condense a tree according to a minimum cluster size. This is akin
     to the runt pruning procedure of Stuetzle. The result is a much simpler
-    tree that is easier to visualize. We include extra information on the 
+    tree that is easier to visualize. We include extra information on the
     lambda value at which individual points depart clusters for later
     analysis and computation.
-    
+
     Parameters
     ----------
     hierarchy : ndarray (n_samples, 4)
         A single linkage hierarchy in scipy.cluster.hierarchy format.
-        
+
     min_cluster_size : int, optional (default 10)
-        The minimum size of clusters to consider. Smaller "runt" 
+        The minimum size of clusters to consider. Smaller "runt"
         clusters are pruned from the tree.
-        
+
     Returns
     -------
     condensed_tree : numpy recarray
@@ -341,22 +341,22 @@ cpdef np.ndarray[np.intp_t, ndim=1] labelling_at_cut(
         np.double_t cut,
         np.intp_t min_cluster_size):
     """Given a single linkage tree and a cut value, return the
-    vector of cluster labels at that cut value. This is useful 
+    vector of cluster labels at that cut value. This is useful
     for Robust Single Linkage, and extracting DBSCAN results
     from a single HDBSCAN run.
-    
+
     Parameters
     ----------
     linkage : ndarray (n_samples, 4)
         The single linkage tree in scipy.cluster.hierarchy format.
-        
+
     cut : double
         The cut value at which to find clusters.
-        
+
     min_cluster_size : int
         The minimum cluster size; clusters below this size at
         the cut will be considered noise.
-        
+
     Returns
     -------
     labels : ndarray (n_samples,)
@@ -519,12 +519,12 @@ cdef get_probabilities(np.ndarray tree, dict cluster_map, np.ndarray labels):
 
 cpdef np.ndarray[np.double_t, ndim=1] outlier_scores(np.ndarray tree):
     """Generate GLOSH outlier scores from a condensed tree.
-    
+
     Parameters
     ----------
     tree : numpy recarray
         The condensed tree to generate GLOSH outlier scores from
-        
+
     Returns
     -------
     outlier_scores : ndarray (n_samples,)
@@ -609,43 +609,80 @@ cpdef list get_cluster_tree_leaves(np.ndarray cluster_tree):
     root = cluster_tree['parent'].min()
     return recurse_leaf_dfs(cluster_tree, root)
 
+cpdef np.intp_t traverse_upwards(np.ndarray cluster_tree, np.double_t cluster_selection_epsilon, np.intp_t leaf):
+    # Climb from `leaf` towards the root and return the first ancestor whose 1/lambda_val exceeds cluster_selection_epsilon, or the node directly below the root if none does.
+    root = cluster_tree['parent'].min()  # the smallest parent id is treated as the root
+    parent = cluster_tree[cluster_tree['child'] == leaf]['parent']  # boolean-mask lookup, so this is an array (presumably one element -- confirm)
+    if parent == root:
+        return leaf # the parent is the root: stop and return the node just below it
+
+    parent_eps = 1/cluster_tree[cluster_tree['child'] == parent]['lambda_val']  # 1/lambda_val of the row whose child is `parent`
+    if parent_eps > cluster_selection_epsilon:
+        return parent
+    else:
+        return traverse_upwards(cluster_tree, cluster_selection_epsilon, parent)  # not above the threshold yet: keep climbing from `parent`
+
+cpdef set epsilon_search(set leaves, np.ndarray cluster_tree, np.double_t cluster_selection_epsilon):
+    # For each node in `leaves`: keep it if its 1/lambda_val is not below cluster_selection_epsilon, otherwise select the node traverse_upwards returns for it.
+    selected_clusters = list()
+    processed = list()  # nodes found under an already chosen replacement; the below-threshold branch skips them
+
+    for leaf in leaves:
+        eps = 1/cluster_tree['lambda_val'][cluster_tree['child'] == leaf][0]  # 1/lambda_val of the first row whose child is `leaf`
+        if eps < cluster_selection_epsilon:
+                if leaf not in processed:
+                    epsilon_child = traverse_upwards(cluster_tree, cluster_selection_epsilon, leaf)
+                    selected_clusters.append(epsilon_child)
+                    # Record every other node bfs_from_cluster_tree yields from epsilon_child so it is not expanded again.
+                    for sub_node in bfs_from_cluster_tree(cluster_tree, epsilon_child):
+                        if sub_node != epsilon_child:
+                            processed.append(sub_node)
+        else:
+                selected_clusters.append(leaf)  # not below the threshold: keep the node as it is
+
+    return set(selected_clusters)  # the set conversion drops duplicate picks
+
 cpdef tuple get_clusters(np.ndarray tree, dict stability,
                          cluster_selection_method='eom',
                          allow_single_cluster=False,
-                         match_reference_implementation=False):
-    """Given a tree and stability dict, produce the cluster labels 
+                         match_reference_implementation=False,
+                         cluster_selection_epsilon=0.0):
+    """Given a tree and stability dict, produce the cluster labels
     (and probabilities) for a flat clustering based on the chosen
     cluster selection method.
-    
+
     Parameters
     ----------
     tree : numpy recarray
         The condensed tree to extract flat clusters from
-        
+
     stability : dict
         A dictionary mapping cluster_ids to stability values
-        
+
     cluster_selection_method : string, optional (default 'eom')
         The method of selecting clusters. The default is the
         Excess of Mass algorithm specified by 'eom'. The alternate
         option is 'leaf'.
-        
+
     allow_single_cluster : boolean, optional (default False)
         Whether to allow a single cluster to be selected by the
         Excess of Mass algorithm.
-        
+
     match_reference_implementation : boolean, optional (default False)
         Whether to match the reference implementation in how to handle
         certain edge cases.
-        
+
+    cluster_selection_epsilon : float, optional (default 0.0)
+        A distance threshold. Clusters below this value will be merged.
+
     Returns
     -------
     labels : ndarray (n_samples,)
         An integer array of cluster labels, with -1 denoting noise.
-        
+
     probabilities : ndarray (n_samples,)
         The cluster membership strength of each sample.
-        
+
     stabilities : ndarray (n_clusters,)
         The cluster coherence strengths of each cluster.
     """
@@ -689,21 +726,40 @@ cpdef tuple get_clusters(np.ndarray tree, dict stability,
                 for sub_node in bfs_from_cluster_tree(cluster_tree, node):
                     if sub_node != node:
                         is_cluster[sub_node] = False
+
+        if cluster_selection_epsilon != 0.0:
+
+            eom_clusters = set([c for c in is_cluster if is_cluster[c]])
+            selected_clusters = epsilon_search(eom_clusters, cluster_tree, cluster_selection_epsilon)
+            for c in is_cluster:
+                if c in selected_clusters:
+                    is_cluster[c] = True
+                else:
+                    is_cluster[c] = False
+
+
     elif cluster_selection_method == 'leaf':
         leaves = set(get_cluster_tree_leaves(cluster_tree))
         if len(leaves) == 0:
             for c in is_cluster:
                 is_cluster[c] = False
             is_cluster[tree['parent'].min()] = True
+
+        if cluster_selection_epsilon != 0.0:
+            selected_clusters = epsilon_search(leaves, cluster_tree, cluster_selection_epsilon)
+        else:
+            selected_clusters = leaves
+
         for c in is_cluster:
-            if c in leaves:
-                is_cluster[c] = True
-            else:
-                is_cluster[c] = False
+                if c in selected_clusters:
+                    is_cluster[c] = True
+                else:
+                    is_cluster[c] = False
     else:
         raise ValueError('Invalid Cluster Selection Method: %s\n'
                          'Should be one of: "eom", "leaf"\n')
 
+
     clusters = set([c for c in is_cluster if is_cluster[c]])
     cluster_map = {c: n for n, c in enumerate(sorted(list(clusters)))}
     reverse_cluster_map = {n: c for c, n in cluster_map.items()}
diff --git a/hdbscan/hdbscan_.py b/hdbscan/hdbscan_.py
@@ -48,7 +48,8 @@
 def _tree_to_labels(X, single_linkage_tree, min_cluster_size=10,
                     cluster_selection_method='eom',
                     allow_single_cluster=False,
-                    match_reference_implementation=False):
+                    match_reference_implementation=False,
+					cluster_selection_epsilon=0.0):
     """Converts a pretrained tree and cluster size into a
     set of labels and probabilities.
     """
@@ -59,7 +60,8 @@ def _tree_to_labels(X, single_linkage_tree, min_cluster_size=10,
                                                       stability_dict,
                                                       cluster_selection_method,
                                                       allow_single_cluster,
-                                                      match_reference_implementation)
+                                                      match_reference_implementation,
+													  cluster_selection_epsilon)
 
     return (labels, probabilities, stabilities, condensed_tree,
             single_linkage_tree)
@@ -210,9 +212,9 @@ def _hdbscan_prims_kdtree(X, min_samples=5, alpha=1.0,
 
     # Convert edge list into standard hierarchical clustering format
     single_linkage_tree = label(min_spanning_tree)
-         
+
     if gen_min_span_tree:
-        warn('Cannot generate Minimum Spanning Tree; ' 
+        warn('Cannot generate Minimum Spanning Tree; '
              'the implemented Prim\'s does not produce '
              'the full minimum spanning tree ', UserWarning)
 
@@ -246,9 +248,9 @@ def _hdbscan_prims_balltree(X, min_samples=5, alpha=1.0,
                         :]
     # Convert edge list into standard hierarchical clustering format
     single_linkage_tree = label(min_spanning_tree)
-    
+
     if gen_min_span_tree:
-        warn('Cannot generate Minimum Spanning Tree; ' 
+        warn('Cannot generate Minimum Spanning Tree; '
              'the implemented Prim\'s does not produce '
              'the full minimum spanning tree ', UserWarning)
 
@@ -327,7 +329,7 @@ def check_precomputed_distance_matrix(X):
     check_array(tmp)
 
 
-def hdbscan(X, min_cluster_size=5, min_samples=None, alpha=1.0,
+def hdbscan(X, min_cluster_size=5, min_samples=None, alpha=1.0, cluster_selection_epsilon=0.0,
             metric='minkowski', p=2, leaf_size=40,
             algorithm='best', memory=Memory(cachedir=None, verbose=0),
             approx_min_span_tree=True, gen_min_span_tree=False,
@@ -353,6 +355,10 @@ def hdbscan(X, min_cluster_size=5, min_samples=None, alpha=1.0,
         to be considered as a core point. This includes the point itself.
         defaults to the min_cluster_size.
 
+    cluster_selection_epsilon : float, optional (default=0.0)
+        A distance threshold. Clusters below this value will be merged.
+        See [3]_ for more information.
+
     alpha : float, optional (default=1.0)
         A distance scaling parameter as used in robust single linkage.
         See [2]_ for more information.
@@ -471,6 +477,8 @@ def hdbscan(X, min_cluster_size=5, min_samples=None, alpha=1.0,
        cluster tree. In Advances in Neural Information Processing Systems
        (pp. 343-351).
 
+    .. [3] Malzer, C., & Baum, M. (2019). HDBSCAN(ε^): An Alternative Cluster
+       Extraction Method for HDBSCAN. arXiv preprint 1911.02282.
     """
     if min_samples is None:
         min_samples = min_cluster_size
@@ -485,6 +493,12 @@ def hdbscan(X, min_cluster_size=5, min_samples=None, alpha=1.0,
     if min_cluster_size == 1:
         raise ValueError('Min cluster size must be greater than one')
 
+    if type(cluster_selection_epsilon) is int:
+        cluster_selection_epsilon = float(cluster_selection_epsilon)
+
+    if type(cluster_selection_epsilon) is not float or cluster_selection_epsilon < 0.0:
+        raise ValueError('Epsilon must be a float value greater than or equal to 0!')
+
     if not isinstance(alpha, float) or alpha <= 0.0:
         raise ValueError('Alpha must be a positive float value greater than'
                          ' 0!')
@@ -528,7 +542,7 @@ def hdbscan(X, min_cluster_size=5, min_samples=None, alpha=1.0,
     if algorithm != 'best':
         if metric != 'precomputed' and issparse(X) and metric != 'generic':
             raise ValueError("Sparse data matrices only support algorithm 'generic'.")
-                  
+
         if algorithm == 'generic':
             (single_linkage_tree,
              result_min_span_tree) = memory.cache(
@@ -616,7 +630,8 @@ def hdbscan(X, min_cluster_size=5, min_samples=None, alpha=1.0,
                            min_cluster_size,
                            cluster_selection_method,
                            allow_single_cluster,
-                           match_reference_implementation) + \
+                           match_reference_implementation,
+						   cluster_selection_epsilon) + \
             (result_min_span_tree,)
 
 
@@ -656,6 +671,10 @@ class HDBSCAN(BaseEstimator, ClusterMixin):
         A distance scaling parameter as used in robust single linkage.
         See [3]_ for more information.
 
+    cluster_selection_epsilon : float, optional (default=0.0)
+        A distance threshold. Clusters below this value will be merged.
+        See [5]_ for more information.
+
     algorithm : string, optional (default='best')
         Exactly which algorithm to use; hdbscan has variants specialised
         for different characteristics of the data. By default this is set
@@ -813,9 +832,12 @@ class HDBSCAN(BaseEstimator, ClusterMixin):
        Sander, J., 2014. Density-Based Clustering Validation. In SDM
        (pp. 839-847).
 
+    .. [5] Malzer, C., & Baum, M. (2019). HDBSCAN(ε^): An Alternative Cluster
+       Extraction Method for HDBSCAN. arXiv preprint 1911.02282.
+
     """
 
-    def __init__(self, min_cluster_size=5, min_samples=None,
+    def __init__(self, min_cluster_size=5, min_samples=None, cluster_selection_epsilon=0.0,
                  metric='euclidean', alpha=1.0, p=None,
                  algorithm='best', leaf_size=40,
                  memory=Memory(cachedir=None, verbose=0),
@@ -829,7 +851,7 @@ def __init__(self, min_cluster_size=5, min_samples=None,
         self.min_cluster_size = min_cluster_size
         self.min_samples = min_samples
         self.alpha = alpha
-
+        self.cluster_selection_epsilon = cluster_selection_epsilon
         self.metric = metric
         self.p = p
         self.algorithm = algorithm