Skip to content

Commit a525f04

Browse files
committed
Start documenting boruvka code more.
1 parent ed6743f commit a525f04

File tree

1 file changed

+199
-3
lines changed

1 file changed

+199
-3
lines changed

hdbscan/_hdbscan_boruvka.pyx

Lines changed: 199 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,9 @@ cdef inline np.double_t kdtree_min_dist_dual(dist_metrics.DistanceMetric metric,
122122

123123
return metric._rdist_to_dist(rdist)
124124

125+
# As above, but this time we use the rdist as per the kdtree
126+
# implementation. This allows us to release the GIL over
127+
# larger sections of code
125128
cdef inline np.double_t kdtree_min_rdist_dual(dist_metrics.DistanceMetric metric,
126129
np.intp_t node1,
127130
np.intp_t node2,
@@ -154,8 +157,24 @@ cdef inline np.double_t kdtree_min_rdist_dual(dist_metrics.DistanceMetric metric
154157

155158
return rdist
156159

157-
158160
cdef class BoruvkaUnionFind (object):
161+
"""Efficient union find implementation.
162+
163+
Parameters
164+
----------
165+
166+
size : int
167+
The total size of the set of objects to
168+
track via the union find structure.
169+
170+
Attributes
171+
----------
172+
173+
is_component : array of bool; shape (size, 1)
174+
Array specifying whether each element of the
175+
set is the root node, or identifier for
176+
a component.
177+
"""
159178

160179
cdef np.ndarray _data_arr
161180
cdef np.intp_t[:,::1] _data
@@ -168,6 +187,7 @@ cdef class BoruvkaUnionFind (object):
168187
self.is_component = np.ones(size, dtype=np.bool)
169188

170189
cdef int union_(self, np.intp_t x, np.intp_t y) except -1:
190+
"""Union together elements x and y"""
171191
cdef np.intp_t x_root = self.find(x)
172192
cdef np.intp_t y_root = self.find(y)
173193

@@ -182,12 +202,14 @@ cdef class BoruvkaUnionFind (object):
182202
return 0
183203

184204
cdef np.intp_t find(self, np.intp_t x) except -1:
205+
"""Find the root or identifier for the component that x is in"""
185206
if self._data[x, 0] != x:
186207
self._data[x, 0] = self.find(self._data[x, 0])
187208
self.is_component[x] = False
188209
return self._data[x, 0]
189210

190211
cdef np.ndarray[np.intp_t, ndim=1] components(self):
212+
"""Return an array of all component roots/identifiers"""
191213
return self.is_component.nonzero()[0]
192214

193215
def _core_dist_query(tree, data, min_samples):
@@ -196,6 +218,32 @@ def _core_dist_query(tree, data, min_samples):
196218
cdef class KDTreeBoruvkaAlgorithm (object):
197219
"""A Dual Tree Boruvka Algorithm implemented for the sklearn
198220
KDTree space tree implementation.
221+
222+
Parameters
223+
----------
224+
225+
tree : KDTree
226+
The kd-tree to run Dual Tree Boruvka over.
227+
228+
min_samples : int (default 5)
229+
The min_samples parameter of HDBSCAN used to
230+
determine core distances.
231+
232+
metric : string (default 'euclidean')
233+
The metric used to compute distances for the tree
234+
235+
leaf_size : int (default 20)
236+
The Boruvka algorithm benefits from a smaller leaf size than
237+
standard kd-tree nearest neighbor searches. The tree passed in
238+
is used for a kNN search for core distance. A second tree is
239+
constructed with a smaller leaf size for Boruvka; this is that
240+
leaf size.
241+
242+
alpha : float (default 1.0)
243+
The alpha distance scaling parameter as per Robust Single Linkage.
244+
245+
**kwargs :
246+
Keyword args passed to the metric.
199247
"""
200248

201249
cdef object tree
@@ -295,8 +343,8 @@ cdef class KDTreeBoruvkaAlgorithm (object):
295343
self.core_distance_ptr = <np.double_t *> &self.core_distance[0]
296344
self.bounds_ptr = <np.double_t *> &self.bounds[0]
297345

298-
@cython.profile(True)
299346
cdef _compute_bounds(self):
347+
"""Initialize core distances"""
300348

301349
cdef np.intp_t n
302350

@@ -329,6 +377,8 @@ cdef class KDTreeBoruvkaAlgorithm (object):
329377
self.bounds_arr[n] = <np.double_t> DBL_MAX
330378

331379
cdef _initialize_components(self):
380+
"""Initialize components of the min spanning tree (eventually there
381+
is only one component; initially each point is its own component)"""
332382

333383
cdef np.intp_t n
334384

@@ -342,6 +392,10 @@ cdef class KDTreeBoruvkaAlgorithm (object):
342392
self.component_of_node[n] = -(n+1)
343393

344394
cdef int update_components(self) except -1:
395+
"""Having found the nearest neighbor not in the same component for
396+
each current component (via tree traversal), run through adding
397+
edges to the min spanning tree and recomputing components via
398+
union find."""
345399

346400
cdef np.intp_t source
347401
cdef np.intp_t sink
@@ -412,6 +466,10 @@ cdef class KDTreeBoruvkaAlgorithm (object):
412466
return self.components.shape[0]
413467

414468
cdef int dual_tree_traversal(self, np.intp_t node1, np.intp_t node2) nogil except -1:
469+
"""Perform a dual tree traversal, pruning wherever possible, to find the nearest
470+
neighbor not in the same component for each component. This is akin to a
471+
standard dual tree NN search, but we also prune whenever all points in query
472+
and reference nodes are in the same component."""
415473

416474
cdef np.intp_t[::1] point_indices1, point_indices2
417475

@@ -452,9 +510,13 @@ cdef class KDTreeBoruvkaAlgorithm (object):
452510
cdef np.double_t left_dist
453511
cdef np.double_t right_dist
454512

513+
# Compute the distance between the query and reference nodes
455514
node_dist = kdtree_min_rdist_dual(self.dist,
456515
node1, node2, self.node_bounds, self.num_features)
457516

517+
# If the distance between the nodes is less than the current bound for the query
518+
# and the nodes are not in the same component continue; otherwise we get to prune
519+
# this branch and return early.
458520
if node_dist < self.bounds_ptr[node1]:
459521
if self.component_of_node_ptr[node1] == self.component_of_node_ptr[node2] and \
460522
self.component_of_node_ptr[node1] >= 0:
@@ -463,6 +525,32 @@ cdef class KDTreeBoruvkaAlgorithm (object):
463525
return 0
464526

465527

528+
# Case 1: Both nodes are leaves
529+
# for each pair of points in node1 x node2 we need
530+
# to compute the distance and see if it better than
531+
# the current nearest neighbor for the component of
532+
# the point in the query node.
533+
#
534+
# We get to take some shortcuts:
535+
# - if the core distance for a point is larger than
536+
# the distance to the nearst neighbor of the
537+
# component of the point ... then we can't get
538+
# a better mutual reachability distance and we
539+
# can skip computing anything for that point
540+
# - if the points are in the same component we
541+
# don't have to compute the distance.
542+
#
543+
# We also have some catches:
544+
# - we need to compute mutual reachability distance
545+
# not just the ordinary distance; this involves
546+
# fiddling with core distances.
547+
# - We need to scale distances according to alpha,
548+
# but don't want to lose performance in the case
549+
# that alpha is 1.0.
550+
#
551+
# Finally we can compute new bounds for the query node
552+
# based on the distances found here, so do that and
553+
# propagate the results up the tree.
466554
if node1_info.is_leaf and node2_info.is_leaf:
467555

468556
new_upper_bound = 0.0
@@ -506,6 +594,9 @@ cdef class KDTreeBoruvkaAlgorithm (object):
506594
new_upper_bound = max(new_upper_bound, self.candidate_distance_ptr[component1])
507595
new_lower_bound = min(new_lower_bound, self.candidate_distance_ptr[component1])
508596

597+
# Compute new bounds for the query node, and
598+
# then propagate the results of that computation
599+
# up the tree.
509600
new_bound = min(new_upper_bound, new_lower_bound + 2 * node1_info.radius)
510601
if new_bound < self.bounds_ptr[node1]:
511602
self.bounds_ptr[node1] = new_bound
@@ -536,7 +627,13 @@ cdef class KDTreeBoruvkaAlgorithm (object):
536627
else:
537628
break
538629

539-
630+
# Case 2a: The query node is a leaf, or is smaller than
631+
# the reference node.
632+
#
633+
# We descend in the reference tree. We first
634+
# compute distances between nodes to determine
635+
# whether we should prioritise the left or
636+
# right branch in the reference tree.
540637
elif node1_info.is_leaf or (not node2_info.is_leaf
541638
and node2_info.radius > node1_info.radius):
542639

@@ -559,6 +656,14 @@ cdef class KDTreeBoruvkaAlgorithm (object):
559656
else:
560657
self.dual_tree_traversal(node1, right)
561658
self.dual_tree_traversal(node1, left)
659+
660+
# Case 2b: The reference node is a leaf, or is smaller than
661+
# the query node.
662+
#
663+
# We descend in the query tree. We first
664+
# compute distances between nodes to determine
665+
# whether we should prioritise the left or
666+
# right branch in the query tree.
562667
else:
563668
left = 2 * node1 + 1
564669
right = 2 * node1 + 2
@@ -584,6 +689,8 @@ cdef class KDTreeBoruvkaAlgorithm (object):
584689
return 0
585690

586691
def spanning_tree(self):
692+
"""Compute the minimum spanning tree of the data held by
693+
the tree passed in at construction"""
587694

588695
#cdef np.intp_t num_components
589696
#cdef np.intp_t num_nodes
@@ -597,6 +704,35 @@ cdef class KDTreeBoruvkaAlgorithm (object):
597704
return self.edges
598705

599706
cdef class BallTreeBoruvkaAlgorithm (object):
707+
"""A Dual Tree Boruvka Algorithm implemented for the sklearn
708+
BallTree space tree implementation.
709+
710+
Parameters
711+
----------
712+
713+
tree : BallTree
714+
The ball-tree to run Dual Tree Boruvka over.
715+
716+
min_samples : int (default 5)
717+
The min_samples parameter of HDBSCAN used to
718+
determine core distances.
719+
720+
metric : string (default 'euclidean')
721+
The metric used to compute distances for the tree
722+
723+
leaf_size : int (default 20)
724+
The Boruvka algorithm benefits from a smaller leaf size than
725+
standard kd-tree nearest neighbor searches. The tree passed in
726+
is used for a kNN search for core distance. A second tree is
727+
constructed with a smaller leaf size for Boruvka; this is that
728+
leaf size.
729+
730+
alpha : float (default 1.0)
731+
The alpha distance scaling parameter as per Robust Single Linkage.
732+
733+
**kwargs :
734+
Keyword args passed to the metric.
735+
"""
600736

601737
cdef object tree
602738
cdef object core_dist_tree
@@ -695,6 +831,7 @@ cdef class BallTreeBoruvkaAlgorithm (object):
695831
self.bounds_ptr = <np.double_t *> &self.bounds[0]
696832

697833
cdef _compute_bounds(self):
834+
"""Initialize core distances"""
698835

699836
cdef np.intp_t n
700837

@@ -724,6 +861,8 @@ cdef class BallTreeBoruvkaAlgorithm (object):
724861
self.bounds_arr[n] = <np.double_t> DBL_MAX
725862

726863
cdef _initialize_components(self):
864+
"""Initialize components of the min spanning tree (eventually there
865+
is only one component; initially each point is its own component)"""
727866

728867
cdef np.intp_t n
729868

@@ -737,6 +876,10 @@ cdef class BallTreeBoruvkaAlgorithm (object):
737876
self.component_of_node[n] = -(n+1)
738877

739878
cdef update_components(self):
879+
"""Having found the nearest neighbor not in the same component for
880+
each current component (via tree traversal), run through adding
881+
edges to the min spanning tree and recomputing components via
882+
union find."""
740883

741884
cdef np.intp_t source
742885
cdef np.intp_t sink
@@ -807,6 +950,10 @@ cdef class BallTreeBoruvkaAlgorithm (object):
807950
return self.components.shape[0]
808951

809952
cdef int dual_tree_traversal(self, np.intp_t node1, np.intp_t node2) except -1:
953+
"""Perform a dual tree traversal, pruning wherever possible, to find the nearest
954+
neighbor not in the same component for each component. This is akin to a
955+
standard dual tree NN search, but we also prune whenever all points in query
956+
and reference nodes are in the same component."""
810957

811958
cdef np.intp_t[::1] point_indices1, point_indices2
812959

@@ -850,6 +997,9 @@ cdef class BallTreeBoruvkaAlgorithm (object):
850997
node_dist = balltree_min_dist_dual(node1_info.radius, node2_info.radius,
851998
node1, node2, self.centroid_distances)
852999

1000+
# If the distance between the nodes is less than the current bound for the query
1001+
# and the nodes are not in the same component continue; otherwise we get to prune
1002+
# this branch and return early.
8531003
if node_dist < self.bounds_ptr[node1]:
8541004
if self.component_of_node_ptr[node1] == self.component_of_node_ptr[node2] and \
8551005
self.component_of_node_ptr[node1] >= 0:
@@ -858,6 +1008,32 @@ cdef class BallTreeBoruvkaAlgorithm (object):
8581008
return 0
8591009

8601010

1011+
# Case 1: Both nodes are leaves
1012+
# for each pair of points in node1 x node2 we need
1013+
# to compute the distance and see if it better than
1014+
# the current nearest neighbor for the component of
1015+
# the point in the query node.
1016+
#
1017+
# We get to take some shortcuts:
1018+
# - if the core distance for a point is larger than
1019+
# the distance to the nearst neighbor of the
1020+
# component of the point ... then we can't get
1021+
# a better mutual reachability distance and we
1022+
# can skip computing anything for that point
1023+
# - if the points are in the same component we
1024+
# don't have to compute the distance.
1025+
#
1026+
# We also have some catches:
1027+
# - we need to compute mutual reachability distance
1028+
# not just the ordinary distance; this involves
1029+
# fiddling with core distances.
1030+
# - We need to scale distances according to alpha,
1031+
# but don't want to lose performance in the case
1032+
# that alpha is 1.0.
1033+
#
1034+
# Finally we can compute new bounds for the query node
1035+
# based on the distances found here, so do that and
1036+
# propagate the results up the tree.
8611037
if node1_info.is_leaf and node2_info.is_leaf:
8621038

8631039
new_bound = 0.0
@@ -900,6 +1076,9 @@ cdef class BallTreeBoruvkaAlgorithm (object):
9001076
new_upper_bound = max(new_upper_bound, self.candidate_distance_ptr[component1])
9011077
new_lower_bound = min(new_lower_bound, self.candidate_distance_ptr[component1])
9021078

1079+
# Compute new bounds for the query node, and
1080+
# then propagate the results of that computation
1081+
# up the tree.
9031082
new_bound = min(new_upper_bound, new_lower_bound + 2 * node1_info.radius)
9041083
if new_bound < self.bounds_ptr[node1]:
9051084
self.bounds_ptr[node1] = new_bound
@@ -931,6 +1110,13 @@ cdef class BallTreeBoruvkaAlgorithm (object):
9311110
break
9321111

9331112

1113+
# Case 2a: The query node is a leaf, or is smaller than
1114+
# the reference node.
1115+
#
1116+
# We descend in the reference tree. We first
1117+
# compute distances between nodes to determine
1118+
# whether we should prioritise the left or
1119+
# right branch in the reference tree.
9341120
elif node1_info.is_leaf or (not node2_info.is_leaf
9351121
and node2_info.radius > node1_info.radius):
9361122

@@ -953,6 +1139,14 @@ cdef class BallTreeBoruvkaAlgorithm (object):
9531139
else:
9541140
self.dual_tree_traversal(node1, right)
9551141
self.dual_tree_traversal(node1, left)
1142+
1143+
# Case 2b: The reference node is a leaf, or is smaller than
1144+
# the query node.
1145+
#
1146+
# We descend in the query tree. We first
1147+
# compute distances between nodes to determine
1148+
# whether we should prioritise the left or
1149+
# right branch in the query tree.
9561150
else:
9571151
left = 2 * node1 + 1
9581152
right = 2 * node1 + 2
@@ -978,6 +1172,8 @@ cdef class BallTreeBoruvkaAlgorithm (object):
9781172
return 0
9791173

9801174
cpdef spanning_tree(self):
1175+
"""Compute the minimum spanning tree of the data held by
1176+
the tree passed in at construction"""
9811177

9821178
cdef np.intp_t num_components
9831179
cdef np.intp_t num_nodes

0 commit comments

Comments
 (0)