@@ -44,19 +44,19 @@ cpdef np.ndarray condense_tree(np.ndarray[np.double_t, ndim=2] hierarchy,
4444 np.intp_t min_cluster_size = 10 ):
4545 """ Condense a tree according to a minimum cluster size. This is akin
4646 to the runt pruning procedure of Stuetzle. The result is a much simpler
47- tree that is easier to visualize. We include extra information on the
47+ tree that is easier to visualize. We include extra information on the
4848 lambda value at which individual points depart clusters for later
4949 analysis and computation.
50-
50+
5151 Parameters
5252 ----------
5353 hierarchy : ndarray (n_samples, 4)
5454 A single linkage hierarchy in scipy.cluster.hierarchy format.
55-
55+
5656 min_cluster_size : int, optional (default 10)
57- The minimum size of clusters to consider. Smaller "runt"
57+ The minimum size of clusters to consider. Smaller "runt"
5858 clusters are pruned from the tree.
59-
59+
6060 Returns
6161 -------
6262 condensed_tree : numpy recarray
@@ -341,22 +341,22 @@ cpdef np.ndarray[np.intp_t, ndim=1] labelling_at_cut(
341341 np.double_t cut,
342342 np.intp_t min_cluster_size):
343343 """ Given a single linkage tree and a cut value, return the
344- vector of cluster labels at that cut value. This is useful
344+ vector of cluster labels at that cut value. This is useful
345345 for Robust Single Linkage, and extracting DBSCAN results
346346 from a single HDBSCAN run.
347-
347+
348348 Parameters
349349 ----------
350350 linkage : ndarray (n_samples, 4)
351351 The single linkage tree in scipy.cluster.hierarchy format.
352-
352+
353353 cut : double
354354 The cut value at which to find clusters.
355-
355+
356356 min_cluster_size : int
357357 The minimum cluster size; clusters below this size at
358358 the cut will be considered noise.
359-
359+
360360 Returns
361361 -------
362362 labels : ndarray (n_samples,)
@@ -519,12 +519,12 @@ cdef get_probabilities(np.ndarray tree, dict cluster_map, np.ndarray labels):
519519
520520cpdef np.ndarray[np.double_t, ndim= 1 ] outlier_scores(np.ndarray tree):
521521 """ Generate GLOSH outlier scores from a condensed tree.
522-
522+
523523 Parameters
524524 ----------
525525 tree : numpy recarray
526526 The condensed tree to generate GLOSH outlier scores from
527-
527+
528528 Returns
529529 -------
530530 outlier_scores : ndarray (n_samples,)
@@ -609,43 +609,80 @@ cpdef list get_cluster_tree_leaves(np.ndarray cluster_tree):
609609 root = cluster_tree[' parent' ].min()
610610 return recurse_leaf_dfs(cluster_tree, root)
611611
cpdef np.intp_t traverse_upwards(np.ndarray cluster_tree, np.double_t cut_value, np.intp_t leaf):
    """Climb from ``leaf`` towards the root until an ancestor whose
    epsilon (``1 / lambda_val``) exceeds ``cut_value`` is found.

    Parameters
    ----------
    cluster_tree : numpy recarray
        Cluster tree rows exposing the fields 'parent', 'child' and
        'lambda_val'. The root is taken to be the smallest 'parent' id.

    cut_value : double
        Threshold compared against ``1 / lambda_val`` of each ancestor.

    leaf : int
        Id of the node to start climbing from.

    Returns
    -------
    node : int
        The first ancestor of ``leaf`` whose epsilon is strictly greater
        than ``cut_value``. If no such ancestor exists below the root, the
        node on the path that is a direct child of the root is returned.
        If ``leaf`` has no parent row (it is the root), ``leaf`` itself
        is returned.
    """
    cdef np.intp_t root = cluster_tree['parent'].min()
    cdef np.intp_t node = leaf
    cdef np.intp_t parent

    children = cluster_tree['child']
    parents = cluster_tree['parent']
    lambda_vals = cluster_tree['lambda_val']

    # Iterative climb: avoids recursion-depth limits on deep trees.
    while True:
        parent_rows = parents[children == node]
        if parent_rows.shape[0] == 0:
            # No incoming edge: ``node`` is the root, there is nothing
            # above it to climb to.
            return node

        # Extract a scalar; comparing a length-1 array against ``root``
        # relies on array truthiness.
        parent = parent_rows[0]
        if parent == root:
            return node  # return node closest to root

        # ``parent`` is not the root here, so it has its own child row.
        parent_eps = 1 / lambda_vals[children == parent][0]
        if parent_eps > cut_value:
            return parent
        node = parent
624+
cpdef set epsilon_search(set leaves, np.ndarray cluster_tree, np.double_t cut_value):
    """Replace candidate clusters that split off below ``cut_value`` by
    the ancestor returned from ``traverse_upwards``.

    Parameters
    ----------
    leaves : set
        Ids of the candidate clusters to examine.

    cluster_tree : numpy recarray
        Cluster tree rows exposing the fields 'parent', 'child' and
        'lambda_val'.

    cut_value : double
        Threshold compared against ``1 / lambda_val`` of each candidate.

    Returns
    -------
    selected_clusters : set
        Candidates whose epsilon is at least ``cut_value`` are kept as they
        are. Every other candidate is replaced by the node returned from
        ``traverse_upwards``. Nodes that lie underneath a replacement node
        are never part of the result.
    """
    cdef set selected_clusters = set()
    # Nodes already covered by a selected ancestor. A set keeps the
    # membership test O(1).
    cdef set processed = set()

    # Field lookups are loop-invariant.
    children = cluster_tree['child']
    lambda_vals = cluster_tree['lambda_val']

    for leaf in leaves:
        birth_lambdas = lambda_vals[children == leaf]
        if birth_lambdas.shape[0] == 0:
            # ``leaf`` has no incoming edge (it is the root), so it has no
            # epsilon to compare and nothing above it; keep it unchanged
            # instead of failing on the ``[0]`` lookup.
            selected_clusters.add(leaf)
            continue

        eps = 1 / birth_lambdas[0]
        if eps < cut_value:
            if leaf not in processed:
                epsilon_child = traverse_upwards(cluster_tree, cut_value, leaf)
                selected_clusters.add(epsilon_child)

                # Everything below the chosen node is absorbed by it.
                for sub_node in bfs_from_cluster_tree(cluster_tree, epsilon_child):
                    if sub_node != epsilon_child:
                        processed.add(sub_node)
        else:
            selected_clusters.add(leaf)

    # A candidate with eps == cut_value is kept by the ``else`` branch but can
    # also sit underneath a node chosen for one of its siblings. Dropping
    # absorbed nodes keeps the selection free of nested clusters regardless
    # of set iteration order.
    return selected_clusters - processed
644+
612645cpdef tuple get_clusters(np.ndarray tree, dict stability,
613646 cluster_selection_method = ' eom' ,
614647 allow_single_cluster = False ,
615- match_reference_implementation = False ):
616- """ Given a tree and stability dict, produce the cluster labels
648+ match_reference_implementation = False ,
649+ cut_value = None ):
650+ """ Given a tree and stability dict, produce the cluster labels
617651 (and probabilities) for a flat clustering based on the chosen
618652 cluster selection method.
619-
653+
620654 Parameters
621655 ----------
622656 tree : numpy recarray
623657 The condensed tree to extract flat clusters from
624-
658+
625659 stability : dict
626660 A dictionary mapping cluster_ids to stability values
627-
661+
628662 cluster_selection_method : string, optional (default 'eom')
629663 The method of selecting clusters. The default is the
630664 Excess of Mass algorithm specified by 'eom'. The alternate
631665 option is 'leaf'.
632-
666+
633667 allow_single_cluster : boolean, optional (default False)
634668 Whether to allow a single cluster to be selected by the
635669 Excess of Mass algorithm.
636-
670+
637671 match_reference_implementation : boolean, optional (default False)
638672 Whether to match the reference implementation in how to handle
639673 certain edge cases.
640-
674+
675+ cut_value: float, optional (default None)
676+ A threshold for cluster splits
677+
641678 Returns
642679 -------
643680 labels : ndarray (n_samples,)
644681 An integer array of cluster labels, with -1 denoting noise.
645-
682+
646683 probabilities : ndarray (n_samples,)
647684 The cluster membership strength of each sample.
648-
685+
649686 stabilities : ndarray (n_clusters,)
650687 The cluster coherence strengths of each cluster.
651688 """
@@ -689,21 +726,40 @@ cpdef tuple get_clusters(np.ndarray tree, dict stability,
689726 for sub_node in bfs_from_cluster_tree(cluster_tree, node):
690727 if sub_node != node:
691728 is_cluster[sub_node] = False
729+
730+ if cut_value is not None and cut_value != 0.0 :
731+
732+ eom_clusters = set ([c for c in is_cluster if is_cluster[c]])
733+ selected_clusters = epsilon_search(eom_clusters, cluster_tree, cut_value)
734+ for c in is_cluster:
735+ if c in selected_clusters:
736+ is_cluster[c] = True
737+ else :
738+ is_cluster[c] = False
739+
740+
692741 elif cluster_selection_method == ' leaf' :
693742 leaves = set (get_cluster_tree_leaves(cluster_tree))
694743 if len (leaves) == 0 :
695744 for c in is_cluster:
696745 is_cluster[c] = False
697746 is_cluster[tree[' parent' ].min()] = True
747+
748+ if cut_value is not None and cut_value != 0.0 :
749+ selected_clusters = epsilon_search(leaves, cluster_tree, cut_value)
750+ else :
751+ selected_clusters = leaves
752+
698753 for c in is_cluster:
699- if c in leaves :
700- is_cluster[c] = True
701- else :
702- is_cluster[c] = False
754+ if c in selected_clusters :
755+ is_cluster[c] = True
756+ else :
757+ is_cluster[c] = False
703758 else :
704759 raise ValueError (' Invalid Cluster Selection Method: %s \n '
705760 ' Should be one of: "eom", "leaf"\n ' )
706761
762+
707763 clusters = set ([c for c in is_cluster if is_cluster[c]])
708764 cluster_map = {c: n for n, c in enumerate (sorted (list (clusters)))}
709765 reverse_cluster_map = {n: c for c, n in cluster_map.items()}
0 commit comments