1- # cython: boundscheck=False, nonecheck=False, initializedcheck=False
1+ # cython: boundscheck=False
2+ # cython: nonecheck=False
3+ # cython: initializedcheck=False
24# Tree handling (condensing, finding stable clusters) for hdbscan
35# Authors: Leland McInnes
46# License: 3-clause BSD
@@ -8,8 +10,9 @@ cimport numpy as np
810
911cdef np.double_t INFTY = np.inf
1012
11- cdef list bfs_from_hierarchy(np.ndarray[np.double_t, ndim= 2 ] hierarchy, np.intp_t bfs_root):
1213
14+ cdef list bfs_from_hierarchy(np.ndarray[np.double_t, ndim= 2 ] hierarchy,
15+ np.intp_t bfs_root):
1316 """
1417 Perform a breadth first search on a tree in scipy hclust format.
1518 """
@@ -29,12 +32,14 @@ cdef list bfs_from_hierarchy(np.ndarray[np.double_t, ndim=2] hierarchy, np.intp_
2932 while to_process:
3033 result.extend(to_process)
3134 to_process = [x - num_points for x in
32- to_process if x >= num_points]
35+ to_process if x >= num_points]
3336 if to_process:
34- to_process = hierarchy[to_process,:2 ].flatten().astype(np.intp).tolist()
37+ to_process = hierarchy[to_process,
38+ :2 ].flatten().astype(np.intp).tolist()
3539
3640 return result
3741
42+
3843cpdef np.ndarray condense_tree(np.ndarray[np.double_t, ndim= 2 ] hierarchy,
3944 np.intp_t min_cluster_size = 10 ):
4045
@@ -92,43 +97,48 @@ cpdef np.ndarray condense_tree(np.ndarray[np.double_t, ndim=2] hierarchy,
9297 if left_count >= min_cluster_size and right_count >= min_cluster_size:
9398 relabel[left] = next_label
9499 next_label += 1
95- result_list.append((relabel[node], relabel[left], lambda_value, left_count))
100+ result_list.append((relabel[node], relabel[left], lambda_value,
101+ left_count))
96102
97103 relabel[right] = next_label
98104 next_label += 1
99- result_list.append((relabel[node], relabel[right], lambda_value, right_count))
105+ result_list.append((relabel[node], relabel[right], lambda_value,
106+ right_count))
100107
101108 elif left_count < min_cluster_size and right_count < min_cluster_size:
102109 for sub_node in bfs_from_hierarchy(hierarchy, left):
103110 if sub_node < num_points:
104- result_list.append((relabel[node], sub_node, lambda_value, 1 ))
111+ result_list.append((relabel[node], sub_node,
112+ lambda_value, 1 ))
105113 ignore[sub_node] = True
106114
107115 for sub_node in bfs_from_hierarchy(hierarchy, right):
108116 if sub_node < num_points:
109- result_list.append((relabel[node], sub_node, lambda_value, 1 ))
117+ result_list.append((relabel[node], sub_node,
118+ lambda_value, 1 ))
110119 ignore[sub_node] = True
111120
112121 elif left_count < min_cluster_size:
113122 relabel[right] = relabel[node]
114123 for sub_node in bfs_from_hierarchy(hierarchy, left):
115124 if sub_node < num_points:
116- result_list.append((relabel[node], sub_node, lambda_value, 1 ))
125+ result_list.append((relabel[node], sub_node,
126+ lambda_value, 1 ))
117127 ignore[sub_node] = True
118128
119129 else :
120130 relabel[left] = relabel[node]
121131 for sub_node in bfs_from_hierarchy(hierarchy, right):
122132 if sub_node < num_points:
123- result_list.append((relabel[node], sub_node, lambda_value, 1 ))
133+ result_list.append((relabel[node], sub_node,
134+ lambda_value, 1 ))
124135 ignore[sub_node] = True
125136
126- return np.array(result_list, dtype = [
127- (' parent' , np.intp),
137+ return np.array(result_list, dtype = [(' parent' , np.intp),
128138 (' child' , np.intp),
129139 (' lambda_val' , float ),
130- (' child_size' , np.intp)
131- ])
140+ (' child_size' , np.intp)])
141+
132142
133143cpdef dict compute_stability(np.ndarray condensed_tree):
134144
@@ -154,12 +164,14 @@ cpdef dict compute_stability(np.ndarray condensed_tree):
154164
155165 cdef np.intp_t largest_child = condensed_tree[' child' ].max()
156166 cdef np.intp_t smallest_cluster = condensed_tree[' parent' ].min()
157- cdef np.intp_t num_clusters = condensed_tree[' parent' ].max() - smallest_cluster + 1
167+ cdef np.intp_t num_clusters = (condensed_tree[' parent' ].max() -
168+ smallest_cluster + 1 )
158169
159170 if largest_child < smallest_cluster:
160171 largest_child = smallest_cluster
161172
162- sorted_child_data = np.sort(condensed_tree[[' child' , ' lambda_val' ]], axis = 0 )
173+ sorted_child_data = np.sort(condensed_tree[[' child' , ' lambda_val' ]],
174+ axis = 0 )
163175 births_arr = np.nan * np.ones(largest_child + 1 , dtype = np.double)
164176 births = (< np.double_t * > births_arr.data)
165177 sorted_children = sorted_child_data[' child' ].copy()
@@ -201,10 +213,13 @@ cpdef dict compute_stability(np.ndarray condensed_tree):
201213
202214 result_arr[result_index] += (lambda_ - births[parent]) * child_size
203215
204- result_pre_dict = np.vstack((np.arange(smallest_cluster, condensed_tree[' parent' ].max() + 1 ), result_arr)).T
216+ result_pre_dict = np.vstack((np.arange(smallest_cluster,
217+ condensed_tree[' parent' ].max() + 1 ),
218+ result_arr)).T
205219
206220 return dict (result_pre_dict)
207221
222+
208223cdef list bfs_from_cluster_tree(np.ndarray tree, np.intp_t bfs_root):
209224
210225 cdef list result
@@ -219,6 +234,7 @@ cdef list bfs_from_cluster_tree(np.ndarray tree, np.intp_t bfs_root):
219234
220235 return result
221236
237+
222238cdef max_lambdas(np.ndarray tree):
223239
224240 cdef np.ndarray sorted_parent_data
@@ -261,16 +277,18 @@ cdef max_lambdas(np.ndarray tree):
261277
262278 return deaths_arr
263279
280+
264281cdef class TreeUnionFind (object ):
265282
266283 cdef np.ndarray _data_arr
267- cdef np.intp_t[:,::1 ] _data
284+ cdef np.intp_t[:, ::1 ] _data
268285 cdef np.ndarray is_component
269286
270287 def __init__ (self , size ):
271288 self ._data_arr = np.zeros((size, 2 ), dtype = np.intp)
272289 self ._data_arr.T[0 ] = np.arange(size)
273- self ._data = (< np.intp_t[:size, :2 :1 ]> (< np.intp_t * > self ._data_arr.data))
290+ self ._data = (< np.intp_t[:size, :2 :1 ]> (
291+ < np.intp_t * > self ._data_arr.data))
274292 self .is_component = np.ones(size, dtype = np.bool)
275293
276294 cdef union_(self , np.intp_t x, np.intp_t y):
@@ -296,9 +314,11 @@ cdef class TreeUnionFind (object):
296314 cdef np.ndarray[np.intp_t, ndim= 1 ] components(self ):
297315 return self .is_component.nonzero()[0 ]
298316
299- cpdef np.ndarray[np.intp_t, ndim= 1 ] labelling_at_cut(np.ndarray linkage,
300- np.double_t cut,
301- np.intp_t min_cluster_size):
317+
318+ cpdef np.ndarray[np.intp_t, ndim= 1 ] labelling_at_cut(
319+ np.ndarray linkage,
320+ np.double_t cut,
321+ np.intp_t min_cluster_size):
302322
303323 cdef np.intp_t root
304324 cdef np.intp_t num_points
@@ -311,7 +331,6 @@ cpdef np.ndarray[np.intp_t, ndim=1] labelling_at_cut(np.ndarray linkage,
311331 cdef np.intp_t cluster
312332 cdef np.intp_t cluster_id
313333
314-
315334 root = 2 * linkage.shape[0 ]
316335 num_points = root // 2 + 1
317336
@@ -327,14 +346,13 @@ cpdef np.ndarray[np.intp_t, ndim=1] labelling_at_cut(np.ndarray linkage,
327346 union_find.union_(< np.intp_t> row[1 ], cluster)
328347 cluster += 1
329348
330-
331349 cluster_size = np.zeros(cluster, dtype = np.intp)
332350 for n in range (num_points):
333351 cluster = union_find.find(n)
334352 cluster_size[cluster] += 1
335353 result[n] = cluster
336354
337- cluster_label_map = {- 1 :- 1 }
355+ cluster_label_map = {- 1 : - 1 }
338356 cluster_label = 0
339357 unique_labels = np.unique(result_arr)
340358
@@ -350,10 +368,12 @@ cpdef np.ndarray[np.intp_t, ndim=1] labelling_at_cut(np.ndarray linkage,
350368
351369 return result_arr
352370
353- cdef np.ndarray[np.intp_t, ndim= 1 ] do_labelling(np.ndarray tree,
354- set clusters,
355- dict cluster_label_map,
356- np.intp_t allow_single_cluster):
371+
372+ cdef np.ndarray[np.intp_t, ndim= 1 ] do_labelling(
373+ np.ndarray tree,
374+ set clusters,
375+ dict cluster_label_map,
376+ np.intp_t allow_single_cluster):
357377
358378 cdef np.intp_t root_cluster
359379 cdef np.ndarray[np.intp_t, ndim= 1 ] result_arr
@@ -373,7 +393,6 @@ cdef np.ndarray[np.intp_t, ndim=1] do_labelling(np.ndarray tree,
373393 result_arr = np.empty(root_cluster, dtype = np.intp)
374394 result = (< np.intp_t * > result_arr.data)
375395
376-
377396 union_find = TreeUnionFind(parent_array.max() + 1 )
378397
379398 for n in range (tree.shape[0 ]):
@@ -398,6 +417,7 @@ cdef np.ndarray[np.intp_t, ndim=1] do_labelling(np.ndarray tree,
398417
399418 return result_arr
400419
420+
401421cdef get_probabilities(np.ndarray tree, dict cluster_map, np.ndarray labels):
402422
403423 cdef np.ndarray[np.double_t, ndim= 1 ] result
@@ -441,6 +461,7 @@ cdef get_probabilities(np.ndarray tree, dict cluster_map, np.ndarray labels):
441461
442462 return result
443463
464+
444465cpdef np.ndarray[np.double_t, ndim= 1 ] outlier_scores(np.ndarray tree):
445466
446467 cdef np.ndarray[np.double_t, ndim= 1 ] result
@@ -463,7 +484,7 @@ cpdef np.ndarray[np.double_t, ndim=1] outlier_scores(np.ndarray tree):
463484 result = np.zeros(root_cluster, dtype = np.double)
464485
465486 topological_sort_order = np.argsort(parent_array)
466- # topologically_sorted_tree = tree[topological_sort_order]
487+ # topologically_sorted_tree = tree[topological_sort_order]
467488
468489 for n in topological_sort_order:
469490 cluster = child_array[n]
@@ -489,6 +510,7 @@ cpdef np.ndarray[np.double_t, ndim=1] outlier_scores(np.ndarray tree):
489510
490511 return result
491512
513+
492514cpdef np.ndarray get_stability_scores(np.ndarray labels, set clusters,
493515 dict stability, np.double_t max_lambda):
494516
@@ -501,7 +523,9 @@ cpdef np.ndarray get_stability_scores(np.ndarray labels, set clusters,
501523
502524 return result
503525
504- cpdef tuple get_clusters(np.ndarray tree, dict stability, allow_single_cluster = False ):
526+
527+ cpdef tuple get_clusters(np.ndarray tree, dict stability,
528+ allow_single_cluster = False ):
505529 """
506530 The tree is assumed to have numeric node ids such that a reverse numeric
507531 sort is equivalent to a topological sort.
@@ -525,17 +549,19 @@ cpdef tuple get_clusters(np.ndarray tree, dict stability, allow_single_cluster=F
525549 if allow_single_cluster:
526550 node_list = sorted (stability.keys(), reverse = True )
527551 else :
528- node_list = sorted (stability.keys(), reverse = True )[:- 1 ] # (exclude root)
552+ node_list = sorted (stability.keys(), reverse = True )[:- 1 ]
553+ # (exclude root)
529554
530555 cluster_tree = tree[tree[' child_size' ] > 1 ]
531- is_cluster = {cluster:True for cluster in node_list}
556+ is_cluster = {cluster: True for cluster in node_list}
532557 num_points = np.max(tree[tree[' child_size' ] == 1 ][' child' ]) + 1
533558 max_lambda = np.max(tree[' lambda_val' ])
534559
535560 for node in node_list:
536561 child_selection = (cluster_tree[' parent' ] == node)
537- subtree_stability = np.sum([stability[child] for
538- child in cluster_tree[' child' ][child_selection]])
562+ subtree_stability = np.sum([
563+ stability[child] for
564+ child in cluster_tree[' child' ][child_selection]])
539565 if subtree_stability > stability[node]:
540566 is_cluster[node] = False
541567 stability[node] = subtree_stability
@@ -545,15 +571,11 @@ cpdef tuple get_clusters(np.ndarray tree, dict stability, allow_single_cluster=F
545571 is_cluster[sub_node] = False
546572
547573 clusters = set ([c for c in is_cluster if is_cluster[c]])
548- cluster_map = {c:n for n, c in enumerate (clusters)}
549- reverse_cluster_map = {n:c for n, c in enumerate (clusters)}
574+ cluster_map = {c: n for n, c in enumerate (clusters)}
575+ reverse_cluster_map = {n: c for n, c in enumerate (clusters)}
550576
551577 labels = do_labelling(tree, clusters, cluster_map, allow_single_cluster)
552578 probs = get_probabilities(tree, reverse_cluster_map, labels)
553579 stabilities = get_stability_scores(labels, clusters, stability, max_lambda)
554580
555581 return (labels, probs, stabilities)
556-
557-
558-
559-
0 commit comments