Skip to content

Commit 1a695d4

Browse files
committed
Merge remote-tracking branch 'origin/master'
# Conflicts: # hdbscan/hdbscan_.py # hdbscan/robust_single_linkage_.py
2 parents fdf2331 + 1f0ae04 commit 1a695d4

File tree

9 files changed

+978
-676
lines changed

9 files changed

+978
-676
lines changed

hdbscan/_hdbscan_boruvka.pyx

Lines changed: 302 additions & 196 deletions
Large diffs are not rendered by default.

hdbscan/_hdbscan_linkage.pyx

Lines changed: 30 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
#cython: boundscheck=False, nonecheck=False
1+
# cython: boundscheck=False
2+
# cython: nonecheck=False
23
# Minimum spanning tree single linkage implementation for hdbscan
34
# Authors: Leland McInnes, Steve Astels
45
# License: 3-clause BSD
@@ -10,8 +11,10 @@ from libc.float cimport DBL_MAX
1011

1112
from dist_metrics cimport DistanceMetric
1213

14+
1315
cpdef np.ndarray[np.double_t, ndim=2] mst_linkage_core(
14-
np.ndarray[np.double_t, ndim=2] distance_matrix):
16+
np.ndarray[np.double_t,
17+
ndim=2] distance_matrix):
1518

1619
cdef np.ndarray[np.intp_t, ndim=1] node_labels
1720
cdef np.ndarray[np.intp_t, ndim=1] current_labels
@@ -32,7 +35,7 @@ cpdef np.ndarray[np.double_t, ndim=2] mst_linkage_core(
3235
current_node = 0
3336
current_distances = np.infty * np.ones(distance_matrix.shape[0])
3437
current_labels = node_labels
35-
for i in range(1,node_labels.shape[0]):
38+
for i in range(1, node_labels.shape[0]):
3639
label_filter = current_labels != current_node
3740
current_labels = current_labels[label_filter]
3841
left = current_distances[label_filter]
@@ -48,12 +51,14 @@ cpdef np.ndarray[np.double_t, ndim=2] mst_linkage_core(
4851

4952
return result
5053

54+
5155
cpdef np.ndarray[np.double_t, ndim=2] mst_linkage_core_vector(
52-
np.ndarray[np.double_t, ndim=2, mode='c'] raw_data,
53-
np.ndarray[np.double_t, ndim=1, mode='c'] core_distances,
54-
DistanceMetric dist_metric,
55-
np.double_t alpha=1.0):
56+
np.ndarray[np.double_t, ndim=2, mode='c'] raw_data,
57+
np.ndarray[np.double_t, ndim=1, mode='c'] core_distances,
58+
DistanceMetric dist_metric,
59+
np.double_t alpha=1.0):
5660

61+
# Typed array declarations for the working buffers used below
5762
cdef np.ndarray[np.double_t, ndim=1] current_distances_arr
5863
cdef np.ndarray[np.int8_t, ndim=1] in_tree_arr
5964
cdef np.ndarray[np.double_t, ndim=2] result_arr
@@ -83,15 +88,16 @@ cpdef np.ndarray[np.double_t, ndim=2] mst_linkage_core_vector(
8388
dim = raw_data.shape[0]
8489
num_features = raw_data.shape[1]
8590

86-
raw_data_view = (<np.double_t [:raw_data.shape[0], :raw_data.shape[1]:1]> (<np.double_t *> raw_data.data))
91+
raw_data_view = (<np.double_t[:raw_data.shape[0], :raw_data.shape[1]:1]> (
92+
<np.double_t *> raw_data.data))
8793
raw_data_ptr = (<np.double_t *> &raw_data_view[0, 0])
8894

8995
result_arr = np.zeros((dim - 1, 3))
9096
in_tree_arr = np.zeros(dim, dtype=np.int8)
9197
current_node = 0
9298
current_distances_arr = np.infty * np.ones(dim)
9399

94-
result = (<np.double_t [:dim - 1, :3:1]> (<np.double_t *> result_arr.data))
100+
result = (<np.double_t[:dim - 1, :3:1]> (<np.double_t *> result_arr.data))
95101
in_tree = (<np.int8_t *> in_tree_arr.data)
96102
current_distances = (<np.double_t *> current_distances_arr.data)
97103
current_core_distances = (<np.double_t *> core_distances.data)
@@ -110,15 +116,18 @@ cpdef np.ndarray[np.double_t, ndim=2] mst_linkage_core_vector(
110116
continue
111117

112118
right_value = current_distances[j]
113-
left_value = dist_metric.dist(&raw_data_ptr[num_features * current_node],
119+
left_value = dist_metric.dist(&raw_data_ptr[num_features *
120+
current_node],
114121
&raw_data_ptr[num_features * j],
115122
num_features)
116123

117124
if alpha != 1.0:
118125
left_value /= alpha
119126

120127
core_value = core_distances[j]
121-
if current_node_core_distance > right_value or core_value > right_value or left_value > right_value:
128+
if (current_node_core_distance > right_value or
129+
core_value > right_value or
130+
left_value > right_value):
122131
if right_value < new_distance:
123132
new_distance = right_value
124133
new_node = j
@@ -148,6 +157,7 @@ cpdef np.ndarray[np.double_t, ndim=2] mst_linkage_core_vector(
148157

149158
return result_arr
150159

160+
151161
cdef class UnionFind (object):
152162

153163
cdef np.ndarray parent_arr
@@ -183,6 +193,7 @@ cdef class UnionFind (object):
183193
p, self.parent_arr[p] = self.parent_arr[p], n
184194
return n
185195

196+
186197
cpdef np.ndarray[np.double_t, ndim=2] label(np.ndarray[np.double_t, ndim=2] L):
187198

188199
cdef np.ndarray[np.double_t, ndim=2] result_arr
@@ -192,7 +203,8 @@ cpdef np.ndarray[np.double_t, ndim=2] label(np.ndarray[np.double_t, ndim=2] L):
192203
cdef np.double_t delta
193204

194205
result_arr = np.zeros((L.shape[0], L.shape[1] + 1))
195-
result = (<np.double_t[:L.shape[0], :4:1]> (<np.double_t *> result_arr.data))
206+
result = (<np.double_t[:L.shape[0], :4:1]> (
207+
<np.double_t *> result_arr.data))
196208
N = L.shape[0] + 1
197209
U = UnionFind(N)
198210

@@ -208,18 +220,18 @@ cpdef np.ndarray[np.double_t, ndim=2] label(np.ndarray[np.double_t, ndim=2] L):
208220
result[index][1] = bb
209221
result[index][2] = delta
210222
result[index][3] = U.size[aa] + U.size[bb]
211-
223+
212224
U.union(aa, bb)
213-
225+
214226
return result_arr
215227

228+
216229
cpdef np.ndarray[np.double_t, ndim=2] single_linkage(distance_matrix):
217-
230+
218231
cdef np.ndarray[np.double_t, ndim=2] hierarchy
219232
cdef np.ndarray[np.double_t, ndim=2] for_labelling
220-
233+
221234
hierarchy = mst_linkage_core(distance_matrix)
222235
for_labelling = hierarchy[np.argsort(hierarchy.T[2]), :]
223-
return label(for_labelling)
224236

225-
237+
return label(for_labelling)

hdbscan/_hdbscan_reachability.pyx

Lines changed: 40 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1-
#cython: boundscheck=False, nonecheck=False, initializedcheck=False
1+
# cython: boundscheck=False
2+
# cython: nonecheck=False
3+
# cython: initializedcheck=False
24
# mutual reachability distance computations
35
# Authors: Leland McInnes
46
# License: 3-clause BSD
@@ -11,51 +13,54 @@ from scipy.sparse import lil_matrix as sparse_matrix
1113
from sklearn.neighbors import KDTree, BallTree
1214
import gc
1315

16+
1417
def mutual_reachability(distance_matrix, min_points=5, alpha=1.0):
1518
"""Compute the weighted adjacency matrix of the mutual reachability
1619
graph of a distance matrix.
17-
20+
1821
Parameters
1922
----------
20-
distance_matrix : array [n_samples, n_samples]
23+
distance_matrix : ndarray, shape (n_samples, n_samples)
2124
Array of distances between samples.
22-
23-
min_points : int optional
25+
26+
min_points : int, optional (default=5)
2427
The number of points in a neighbourhood for a point to be considered
25-
a core point. (defaults to 5)
28+
a core point.
2629
2730
Returns
2831
-------
29-
mututal_reachability: array [n_samples, n_samples]
32+
mutual_reachability : ndarray, shape (n_samples, n_samples)
3033
Weighted adjacency matrix of the mutual reachability graph.
31-
34+
3235
References
3336
----------
34-
R. Campello, D. Moulavi, and J. Sander, "Density-Based Clustering Based on
35-
Hierarchical Density Estimates"
36-
In: Advances in Knowledge Discovery and Data Mining, Springer, pp 160-172.
37-
2013
37+
.. [1] Campello, R. J., Moulavi, D., & Sander, J. (2013, April).
38+
Density-based clustering based on hierarchical density estimates.
39+
In Pacific-Asia Conference on Knowledge Discovery and Data Mining
40+
(pp. 160-172). Springer Berlin Heidelberg.
3841
"""
3942
size = distance_matrix.shape[0]
4043
min_points = min(size - 1, min_points)
4144
try:
42-
core_distances = np.partition(distance_matrix,
43-
min_points,
45+
core_distances = np.partition(distance_matrix,
46+
min_points,
4447
axis=0)[min_points]
4548
except AttributeError:
4649
core_distances = np.sort(distance_matrix,
4750
axis=0)[min_points]
4851

4952
if alpha != 1.0:
5053
distance_matrix = distance_matrix / alpha
51-
52-
stage1 = np.where(core_distances > distance_matrix,
54+
55+
stage1 = np.where(core_distances > distance_matrix,
5356
core_distances, distance_matrix)
5457
result = np.where(core_distances > stage1.T,
5558
core_distances.T, stage1.T).T
5659
return result
5760

58-
cpdef sparse_mutual_reachability(object lil_matrix, np.intp_t min_points=5, float alpha=1.0):
61+
62+
cpdef sparse_mutual_reachability(object lil_matrix, np.intp_t min_points=5,
63+
float alpha=1.0):
5964

6065
cdef np.intp_t i
6166
cdef np.intp_t j
@@ -88,7 +93,9 @@ cpdef sparse_mutual_reachability(object lil_matrix, np.intp_t min_points=5, floa
8893

8994
return result.tocsr()
9095

91-
def kdtree_mutual_reachability(X, distance_matrix, metric, p=2, min_points=5, alpha=1.0, **kwargs):
96+
97+
def kdtree_mutual_reachability(X, distance_matrix, metric, p=2, min_points=5,
98+
alpha=1.0, **kwargs):
9299
dim = distance_matrix.shape[0]
93100
min_points = min(dim - 1, min_points)
94101

@@ -97,7 +104,7 @@ def kdtree_mutual_reachability(X, distance_matrix, metric, p=2, min_points=5, al
97104
else:
98105
tree = KDTree(X, metric=metric, **kwargs)
99106

100-
core_distances = tree.query(X, k=min_points)[0][:,-1]
107+
core_distances = tree.query(X, k=min_points)[0][:, -1]
101108

102109
if alpha != 1.0:
103110
distance_matrix = distance_matrix / alpha
@@ -108,13 +115,15 @@ def kdtree_mutual_reachability(X, distance_matrix, metric, p=2, min_points=5, al
108115
core_distances.T, stage1.T).T
109116
return result
110117

111-
def balltree_mutual_reachability(X, distance_matrix, metric, p=2, min_points=5, alpha=1.0, **kwargs):
118+
119+
def balltree_mutual_reachability(X, distance_matrix, metric, p=2, min_points=5,
120+
alpha=1.0, **kwargs):
112121
dim = distance_matrix.shape[0]
113122
min_points = min(dim - 1, min_points)
114123

115124
tree = BallTree(X, metric=metric, **kwargs)
116125

117-
core_distances = tree.query(X, k=min_points)[0][:,-1]
126+
core_distances = tree.query(X, k=min_points)[0][:, -1]
118127

119128
if alpha != 1.0:
120129
distance_matrix = distance_matrix / alpha
@@ -125,8 +134,10 @@ def balltree_mutual_reachability(X, distance_matrix, metric, p=2, min_points=5,
125134
core_distances.T, stage1.T).T
126135
return result
127136

137+
128138
cdef np.ndarray[np.double_t, ndim=1] mutual_reachability_from_pdist(
129-
np.ndarray[np.double_t, ndim=1] core_distances, np.ndarray[np.double_t, ndim=1] dists, np.intp_t dim):
139+
np.ndarray[np.double_t, ndim=1] core_distances,
140+
np.ndarray[np.double_t, ndim=1] dists, np.intp_t dim):
130141

131142
cdef np.intp_t i
132143
cdef np.intp_t j
@@ -148,7 +159,8 @@ cdef np.ndarray[np.double_t, ndim=1] mutual_reachability_from_pdist(
148159
return dists
149160

150161

151-
def kdtree_pdist_mutual_reachability(X, metric, p=2, min_points=5, alpha=1.0, **kwargs):
162+
def kdtree_pdist_mutual_reachability(X, metric, p=2, min_points=5, alpha=1.0,
163+
**kwargs):
152164

153165
dim = X.shape[0]
154166
min_points = min(dim - 1, min_points)
@@ -158,7 +170,7 @@ def kdtree_pdist_mutual_reachability(X, metric, p=2, min_points=5, alpha=1.0, *
158170
else:
159171
tree = KDTree(X, metric=metric, **kwargs)
160172

161-
core_distances = tree.query(X, k=min_points)[0][:,-1]
173+
core_distances = tree.query(X, k=min_points)[0][:, -1]
162174

163175
del tree
164176
gc.collect()
@@ -172,14 +184,16 @@ def kdtree_pdist_mutual_reachability(X, metric, p=2, min_points=5, alpha=1.0, *
172184

173185
return dists
174186

175-
def balltree_pdist_mutual_reachability(X, metric, p=2, min_points=5, alpha=1.0, **kwargs):
187+
188+
def balltree_pdist_mutual_reachability(X, metric, p=2, min_points=5, alpha=1.0,
189+
**kwargs):
176190

177191
dim = X.shape[0]
178192
min_points = min(dim - 1, min_points)
179193

180194
tree = BallTree(X, metric=metric, **kwargs)
181195

182-
core_distances = tree.query(X, k=min_points)[0][:,-1]
196+
core_distances = tree.query(X, k=min_points)[0][:, -1]
183197

184198
del tree
185199
gc.collect()

0 commit comments

Comments
 (0)