Skip to content

Commit a1d0d0f

Browse files
author
Guillaume Lemaitre
committed
PEP8 tree
1 parent 633b1b9 commit a1d0d0f

File tree

1 file changed

+64
-42
lines changed

1 file changed

+64
-42
lines changed

hdbscan/_hdbscan_tree.pyx

Lines changed: 64 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1-
#cython: boundscheck=False, nonecheck=False, initializedcheck=False
1+
# cython: boundscheck=False
2+
# cython: nonecheck=False
3+
# cython: initializedcheck=False
24
# Tree handling (condensing, finding stable clusters) for hdbscan
35
# Authors: Leland McInnes
46
# License: 3-clause BSD
@@ -8,8 +10,9 @@ cimport numpy as np
810

911
cdef np.double_t INFTY = np.inf
1012

11-
cdef list bfs_from_hierarchy(np.ndarray[np.double_t, ndim=2] hierarchy, np.intp_t bfs_root):
1213

14+
cdef list bfs_from_hierarchy(np.ndarray[np.double_t, ndim=2] hierarchy,
15+
np.intp_t bfs_root):
1316
"""
1417
Perform a breadth first search on a tree in scipy hclust format.
1518
"""
@@ -29,12 +32,14 @@ cdef list bfs_from_hierarchy(np.ndarray[np.double_t, ndim=2] hierarchy, np.intp_
2932
while to_process:
3033
result.extend(to_process)
3134
to_process = [x - num_points for x in
32-
to_process if x >= num_points]
35+
to_process if x >= num_points]
3336
if to_process:
34-
to_process = hierarchy[to_process,:2].flatten().astype(np.intp).tolist()
37+
to_process = hierarchy[to_process,
38+
:2].flatten().astype(np.intp).tolist()
3539

3640
return result
3741

42+
3843
cpdef np.ndarray condense_tree(np.ndarray[np.double_t, ndim=2] hierarchy,
3944
np.intp_t min_cluster_size=10):
4045

@@ -92,43 +97,48 @@ cpdef np.ndarray condense_tree(np.ndarray[np.double_t, ndim=2] hierarchy,
9297
if left_count >= min_cluster_size and right_count >= min_cluster_size:
9398
relabel[left] = next_label
9499
next_label += 1
95-
result_list.append((relabel[node], relabel[left], lambda_value, left_count))
100+
result_list.append((relabel[node], relabel[left], lambda_value,
101+
left_count))
96102

97103
relabel[right] = next_label
98104
next_label += 1
99-
result_list.append((relabel[node], relabel[right], lambda_value, right_count))
105+
result_list.append((relabel[node], relabel[right], lambda_value,
106+
right_count))
100107

101108
elif left_count < min_cluster_size and right_count < min_cluster_size:
102109
for sub_node in bfs_from_hierarchy(hierarchy, left):
103110
if sub_node < num_points:
104-
result_list.append((relabel[node], sub_node, lambda_value, 1))
111+
result_list.append((relabel[node], sub_node,
112+
lambda_value, 1))
105113
ignore[sub_node] = True
106114

107115
for sub_node in bfs_from_hierarchy(hierarchy, right):
108116
if sub_node < num_points:
109-
result_list.append((relabel[node], sub_node, lambda_value, 1))
117+
result_list.append((relabel[node], sub_node,
118+
lambda_value, 1))
110119
ignore[sub_node] = True
111120

112121
elif left_count < min_cluster_size:
113122
relabel[right] = relabel[node]
114123
for sub_node in bfs_from_hierarchy(hierarchy, left):
115124
if sub_node < num_points:
116-
result_list.append((relabel[node], sub_node, lambda_value, 1))
125+
result_list.append((relabel[node], sub_node,
126+
lambda_value, 1))
117127
ignore[sub_node] = True
118128

119129
else:
120130
relabel[left] = relabel[node]
121131
for sub_node in bfs_from_hierarchy(hierarchy, right):
122132
if sub_node < num_points:
123-
result_list.append((relabel[node], sub_node, lambda_value, 1))
133+
result_list.append((relabel[node], sub_node,
134+
lambda_value, 1))
124135
ignore[sub_node] = True
125136

126-
return np.array(result_list, dtype=[
127-
('parent', np.intp),
137+
return np.array(result_list, dtype=[('parent', np.intp),
128138
('child', np.intp),
129139
('lambda_val', float),
130-
('child_size', np.intp)
131-
])
140+
('child_size', np.intp)])
141+
132142

133143
cpdef dict compute_stability(np.ndarray condensed_tree):
134144

@@ -154,12 +164,14 @@ cpdef dict compute_stability(np.ndarray condensed_tree):
154164

155165
cdef np.intp_t largest_child = condensed_tree['child'].max()
156166
cdef np.intp_t smallest_cluster = condensed_tree['parent'].min()
157-
cdef np.intp_t num_clusters = condensed_tree['parent'].max() - smallest_cluster + 1
167+
cdef np.intp_t num_clusters = (condensed_tree['parent'].max() -
168+
smallest_cluster + 1)
158169

159170
if largest_child < smallest_cluster:
160171
largest_child = smallest_cluster
161172

162-
sorted_child_data = np.sort(condensed_tree[['child', 'lambda_val']], axis=0)
173+
sorted_child_data = np.sort(condensed_tree[['child', 'lambda_val']],
174+
axis=0)
163175
births_arr = np.nan * np.ones(largest_child + 1, dtype=np.double)
164176
births = (<np.double_t *> births_arr.data)
165177
sorted_children = sorted_child_data['child'].copy()
@@ -201,10 +213,13 @@ cpdef dict compute_stability(np.ndarray condensed_tree):
201213

202214
result_arr[result_index] += (lambda_ - births[parent]) * child_size
203215

204-
result_pre_dict = np.vstack((np.arange(smallest_cluster, condensed_tree['parent'].max() + 1), result_arr)).T
216+
result_pre_dict = np.vstack((np.arange(smallest_cluster,
217+
condensed_tree['parent'].max() + 1),
218+
result_arr)).T
205219

206220
return dict(result_pre_dict)
207221

222+
208223
cdef list bfs_from_cluster_tree(np.ndarray tree, np.intp_t bfs_root):
209224

210225
cdef list result
@@ -219,6 +234,7 @@ cdef list bfs_from_cluster_tree(np.ndarray tree, np.intp_t bfs_root):
219234

220235
return result
221236

237+
222238
cdef max_lambdas(np.ndarray tree):
223239

224240
cdef np.ndarray sorted_parent_data
@@ -261,16 +277,18 @@ cdef max_lambdas(np.ndarray tree):
261277

262278
return deaths_arr
263279

280+
264281
cdef class TreeUnionFind (object):
265282

266283
cdef np.ndarray _data_arr
267-
cdef np.intp_t[:,::1] _data
284+
cdef np.intp_t[:, ::1] _data
268285
cdef np.ndarray is_component
269286

270287
def __init__(self, size):
271288
self._data_arr = np.zeros((size, 2), dtype=np.intp)
272289
self._data_arr.T[0] = np.arange(size)
273-
self._data = (<np.intp_t[:size, :2:1]> (<np.intp_t *> self._data_arr.data))
290+
self._data = (<np.intp_t[:size, :2:1]> (
291+
<np.intp_t *> self._data_arr.data))
274292
self.is_component = np.ones(size, dtype=np.bool)
275293

276294
cdef union_(self, np.intp_t x, np.intp_t y):
@@ -296,9 +314,11 @@ cdef class TreeUnionFind (object):
296314
cdef np.ndarray[np.intp_t, ndim=1] components(self):
297315
return self.is_component.nonzero()[0]
298316

299-
cpdef np.ndarray[np.intp_t, ndim=1] labelling_at_cut(np.ndarray linkage,
300-
np.double_t cut,
301-
np.intp_t min_cluster_size):
317+
318+
cpdef np.ndarray[np.intp_t, ndim=1] labelling_at_cut(
319+
np.ndarray linkage,
320+
np.double_t cut,
321+
np.intp_t min_cluster_size):
302322

303323
cdef np.intp_t root
304324
cdef np.intp_t num_points
@@ -311,7 +331,6 @@ cpdef np.ndarray[np.intp_t, ndim=1] labelling_at_cut(np.ndarray linkage,
311331
cdef np.intp_t cluster
312332
cdef np.intp_t cluster_id
313333

314-
315334
root = 2 * linkage.shape[0]
316335
num_points = root // 2 + 1
317336

@@ -327,14 +346,13 @@ cpdef np.ndarray[np.intp_t, ndim=1] labelling_at_cut(np.ndarray linkage,
327346
union_find.union_(<np.intp_t> row[1], cluster)
328347
cluster += 1
329348

330-
331349
cluster_size = np.zeros(cluster, dtype=np.intp)
332350
for n in range(num_points):
333351
cluster = union_find.find(n)
334352
cluster_size[cluster] += 1
335353
result[n] = cluster
336354

337-
cluster_label_map = {-1:-1}
355+
cluster_label_map = {-1: -1}
338356
cluster_label = 0
339357
unique_labels = np.unique(result_arr)
340358

@@ -350,10 +368,12 @@ cpdef np.ndarray[np.intp_t, ndim=1] labelling_at_cut(np.ndarray linkage,
350368

351369
return result_arr
352370

353-
cdef np.ndarray[np.intp_t, ndim=1] do_labelling(np.ndarray tree,
354-
set clusters,
355-
dict cluster_label_map,
356-
np.intp_t allow_single_cluster):
371+
372+
cdef np.ndarray[np.intp_t, ndim=1] do_labelling(
373+
np.ndarray tree,
374+
set clusters,
375+
dict cluster_label_map,
376+
np.intp_t allow_single_cluster):
357377

358378
cdef np.intp_t root_cluster
359379
cdef np.ndarray[np.intp_t, ndim=1] result_arr
@@ -373,7 +393,6 @@ cdef np.ndarray[np.intp_t, ndim=1] do_labelling(np.ndarray tree,
373393
result_arr = np.empty(root_cluster, dtype=np.intp)
374394
result = (<np.intp_t *> result_arr.data)
375395

376-
377396
union_find = TreeUnionFind(parent_array.max() + 1)
378397

379398
for n in range(tree.shape[0]):
@@ -398,6 +417,7 @@ cdef np.ndarray[np.intp_t, ndim=1] do_labelling(np.ndarray tree,
398417

399418
return result_arr
400419

420+
401421
cdef get_probabilities(np.ndarray tree, dict cluster_map, np.ndarray labels):
402422

403423
cdef np.ndarray[np.double_t, ndim=1] result
@@ -441,6 +461,7 @@ cdef get_probabilities(np.ndarray tree, dict cluster_map, np.ndarray labels):
441461

442462
return result
443463

464+
444465
cpdef np.ndarray[np.double_t, ndim=1] outlier_scores(np.ndarray tree):
445466

446467
cdef np.ndarray[np.double_t, ndim=1] result
@@ -463,7 +484,7 @@ cpdef np.ndarray[np.double_t, ndim=1] outlier_scores(np.ndarray tree):
463484
result = np.zeros(root_cluster, dtype=np.double)
464485

465486
topological_sort_order = np.argsort(parent_array)
466-
#topologically_sorted_tree = tree[topological_sort_order]
487+
# topologically_sorted_tree = tree[topological_sort_order]
467488

468489
for n in topological_sort_order:
469490
cluster = child_array[n]
@@ -489,6 +510,7 @@ cpdef np.ndarray[np.double_t, ndim=1] outlier_scores(np.ndarray tree):
489510

490511
return result
491512

513+
492514
cpdef np.ndarray get_stability_scores(np.ndarray labels, set clusters,
493515
dict stability, np.double_t max_lambda):
494516

@@ -501,7 +523,9 @@ cpdef np.ndarray get_stability_scores(np.ndarray labels, set clusters,
501523

502524
return result
503525

504-
cpdef tuple get_clusters(np.ndarray tree, dict stability, allow_single_cluster=False):
526+
527+
cpdef tuple get_clusters(np.ndarray tree, dict stability,
528+
allow_single_cluster=False):
505529
"""
506530
The tree is assumed to have numeric node ids such that a reverse numeric
507531
sort is equivalent to a topological sort.
@@ -525,17 +549,19 @@ cpdef tuple get_clusters(np.ndarray tree, dict stability, allow_single_cluster=F
525549
if allow_single_cluster:
526550
node_list = sorted(stability.keys(), reverse=True)
527551
else:
528-
node_list = sorted(stability.keys(), reverse=True)[:-1] # (exclude root)
552+
node_list = sorted(stability.keys(), reverse=True)[:-1]
553+
# (exclude root)
529554

530555
cluster_tree = tree[tree['child_size'] > 1]
531-
is_cluster = {cluster:True for cluster in node_list}
556+
is_cluster = {cluster: True for cluster in node_list}
532557
num_points = np.max(tree[tree['child_size'] == 1]['child']) + 1
533558
max_lambda = np.max(tree['lambda_val'])
534559

535560
for node in node_list:
536561
child_selection = (cluster_tree['parent'] == node)
537-
subtree_stability = np.sum([stability[child] for
538-
child in cluster_tree['child'][child_selection]])
562+
subtree_stability = np.sum([
563+
stability[child] for
564+
child in cluster_tree['child'][child_selection]])
539565
if subtree_stability > stability[node]:
540566
is_cluster[node] = False
541567
stability[node] = subtree_stability
@@ -545,15 +571,11 @@ cpdef tuple get_clusters(np.ndarray tree, dict stability, allow_single_cluster=F
545571
is_cluster[sub_node] = False
546572

547573
clusters = set([c for c in is_cluster if is_cluster[c]])
548-
cluster_map = {c:n for n, c in enumerate(clusters)}
549-
reverse_cluster_map = {n:c for n, c in enumerate(clusters)}
574+
cluster_map = {c: n for n, c in enumerate(clusters)}
575+
reverse_cluster_map = {n: c for n, c in enumerate(clusters)}
550576

551577
labels = do_labelling(tree, clusters, cluster_map, allow_single_cluster)
552578
probs = get_probabilities(tree, reverse_cluster_map, labels)
553579
stabilities = get_stability_scores(labels, clusters, stability, max_lambda)
554580

555581
return (labels, probs, stabilities)
556-
557-
558-
559-

0 commit comments

Comments
 (0)