@@ -122,6 +122,9 @@ cdef inline np.double_t kdtree_min_dist_dual(dist_metrics.DistanceMetric metric,
122122
123123 return metric._rdist_to_dist(rdist)
124124
125+ # As above, but this time we use the rdist as per the kdtree
126+ # implementation. This allows us to release the GIL over
127+ # larger sections of code
125128cdef inline np.double_t kdtree_min_rdist_dual(dist_metrics.DistanceMetric metric,
126129 np.intp_t node1,
127130 np.intp_t node2,
@@ -154,8 +157,24 @@ cdef inline np.double_t kdtree_min_rdist_dual(dist_metrics.DistanceMetric metric
154157
155158 return rdist
156159
157-
158160cdef class BoruvkaUnionFind (object ):
161+ """ Efficient union find implementation.
162+
163+ Parameters
164+ ----------
165+
166+ size : int
167+ The total size of the set of objects to
168+ track via the union find structure.
169+
170+ Attributes
171+ ----------
172+
173+ is_component : array of bool; shape (size, 1)
174+ Array specifying whether each element of the
175+ set is the root node, or identifier for
176+ a component.
177+ """
159178
160179 cdef np.ndarray _data_arr
161180 cdef np.intp_t[:,::1 ] _data
@@ -168,6 +187,7 @@ cdef class BoruvkaUnionFind (object):
168187 self .is_component = np.ones(size, dtype = np.bool)
169188
170189 cdef int union_(self , np.intp_t x, np.intp_t y) except - 1 :
190+ """ Union together elements x and y"""
171191 cdef np.intp_t x_root = self .find(x)
172192 cdef np.intp_t y_root = self .find(y)
173193
@@ -182,12 +202,14 @@ cdef class BoruvkaUnionFind (object):
182202 return 0
183203
184204 cdef np.intp_t find(self , np.intp_t x) except - 1 :
205+ """ Find the root or identifier for the component that x is in"""
185206 if self ._data[x, 0 ] != x:
186207 self ._data[x, 0 ] = self .find(self ._data[x, 0 ])
187208 self .is_component[x] = False
188209 return self ._data[x, 0 ]
189210
190211 cdef np.ndarray[np.intp_t, ndim= 1 ] components(self ):
212+ """ Return an array of all component roots/identifiers"""
191213 return self .is_component.nonzero()[0 ]
192214
193215def _core_dist_query (tree , data , min_samples ):
@@ -196,6 +218,32 @@ def _core_dist_query(tree, data, min_samples):
196218cdef class KDTreeBoruvkaAlgorithm (object ):
197219 """ A Dual Tree Boruvka Algorithm implemented for the sklearn
198220 KDTree space tree implementation.
221+
222+ Parameters
223+ ----------
224+
225+ tree : KDTree
226+ The kd-tree to run Dual Tree Boruvka over.
227+
228+ min_samples : int (default 5)
229+ The min_samples parameter of HDBSCAN used to
230+ determine core distances.
231+
232+ metric : string (default 'euclidean')
233+ The metric used to compute distances for the tree
234+
235+ leaf_size : int (default 20)
236+ The Boruvka algorithm benefits from a smaller leaf size than
237+ standard kd-tree nearest neighbor searches. The tree passed in
238+ is used for a kNN search for core distance. A second tree is
239+ constructed with a smaller leaf size for Boruvka; this is that
240+ leaf size.
241+
242+ alpha : float (default 1.0)
243+ The alpha distance scaling parameter as per Robust Single Linkage.
244+
245+ **kwargs :
246+ Keyword args passed to the metric.
199247 """
200248
201249 cdef object tree
@@ -295,8 +343,8 @@ cdef class KDTreeBoruvkaAlgorithm (object):
295343 self .core_distance_ptr = < np.double_t * > & self .core_distance[0 ]
296344 self .bounds_ptr = < np.double_t * > & self .bounds[0 ]
297345
298- @ cython.profile (True )
299346 cdef _compute_bounds(self ):
347+ """ Initialize core distances"""
300348
301349 cdef np.intp_t n
302350
@@ -329,6 +377,8 @@ cdef class KDTreeBoruvkaAlgorithm (object):
329377 self .bounds_arr[n] = < np.double_t> DBL_MAX
330378
331379 cdef _initialize_components(self ):
380+ """ Initialize components of the min spanning tree (eventually there
381+ is only one component; initially each point is its own component)"""
332382
333383 cdef np.intp_t n
334384
@@ -342,6 +392,10 @@ cdef class KDTreeBoruvkaAlgorithm (object):
342392 self .component_of_node[n] = - (n+ 1 )
343393
344394 cdef int update_components(self ) except - 1 :
395+ """ Having found the nearest neighbor not in the same component for
396+ each current component (via tree traversal), run through adding
397+ edges to the min spanning tree and recomputing components via
398+ union find."""
345399
346400 cdef np.intp_t source
347401 cdef np.intp_t sink
@@ -412,6 +466,10 @@ cdef class KDTreeBoruvkaAlgorithm (object):
412466 return self .components.shape[0 ]
413467
414468 cdef int dual_tree_traversal(self , np.intp_t node1, np.intp_t node2) nogil except - 1 :
469+ """ Perform a dual tree traversal, pruning wherever possible, to find the nearest
470+ neighbor not in the same component for each component. This is akin to a
471+ standard dual tree NN search, but we also prune whenever all points in query
472+ and reference nodes are in the same component."""
415473
416474 cdef np.intp_t[::1 ] point_indices1, point_indices2
417475
@@ -452,9 +510,13 @@ cdef class KDTreeBoruvkaAlgorithm (object):
452510 cdef np.double_t left_dist
453511 cdef np.double_t right_dist
454512
513+ # Compute the distance between the query and reference nodes
455514 node_dist = kdtree_min_rdist_dual(self .dist,
456515 node1, node2, self .node_bounds, self .num_features)
457516
517+ # If the distance between the nodes is less than the current bound for the query
518+ # and the nodes are not in the same component continue; otherwise we get to prune
519+ # this branch and return early.
458520 if node_dist < self .bounds_ptr[node1]:
459521 if self .component_of_node_ptr[node1] == self .component_of_node_ptr[node2] and \
460522 self .component_of_node_ptr[node1] >= 0 :
@@ -463,6 +525,32 @@ cdef class KDTreeBoruvkaAlgorithm (object):
463525 return 0
464526
465527
528+ # Case 1: Both nodes are leaves
529+ # for each pair of points in node1 x node2 we need
530+ # to compute the distance and see if it better than
531+ # the current nearest neighbor for the component of
532+ # the point in the query node.
533+ #
534+ # We get to take some shortcuts:
535+ # - if the core distance for a point is larger than
536+ # the distance to the nearst neighbor of the
537+ # component of the point ... then we can't get
538+ # a better mutual reachability distance and we
539+ # can skip computing anything for that point
540+ # - if the points are in the same component we
541+ # don't have to compute the distance.
542+ #
543+ # We also have some catches:
544+ # - we need to compute mutual reachability distance
545+ # not just the ordinary distance; this involves
546+ # fiddling with core distances.
547+ # - We need to scale distances according to alpha,
548+ # but don't want to lose performance in the case
549+ # that alpha is 1.0.
550+ #
551+ # Finally we can compute new bounds for the query node
552+ # based on the distances found here, so do that and
553+ # propagate the results up the tree.
466554 if node1_info.is_leaf and node2_info.is_leaf:
467555
468556 new_upper_bound = 0.0
@@ -506,6 +594,9 @@ cdef class KDTreeBoruvkaAlgorithm (object):
506594 new_upper_bound = max (new_upper_bound, self .candidate_distance_ptr[component1])
507595 new_lower_bound = min (new_lower_bound, self .candidate_distance_ptr[component1])
508596
597+ # Compute new bounds for the query node, and
598+ # then propagate the results of that computation
599+ # up the tree.
509600 new_bound = min (new_upper_bound, new_lower_bound + 2 * node1_info.radius)
510601 if new_bound < self .bounds_ptr[node1]:
511602 self .bounds_ptr[node1] = new_bound
@@ -536,7 +627,13 @@ cdef class KDTreeBoruvkaAlgorithm (object):
536627 else :
537628 break
538629
539-
630+ # Case 2a: The query node is a leaf, or is smaller than
631+ # the reference node.
632+ #
633+ # We descend in the reference tree. We first
634+ # compute distances between nodes to determine
635+ # whether we should prioritise the left or
636+ # right branch in the reference tree.
540637 elif node1_info.is_leaf or (not node2_info.is_leaf
541638 and node2_info.radius > node1_info.radius):
542639
@@ -559,6 +656,14 @@ cdef class KDTreeBoruvkaAlgorithm (object):
559656 else :
560657 self .dual_tree_traversal(node1, right)
561658 self .dual_tree_traversal(node1, left)
659+
660+ # Case 2b: The reference node is a leaf, or is smaller than
661+ # the query node.
662+ #
663+ # We descend in the query tree. We first
664+ # compute distances between nodes to determine
665+ # whether we should prioritise the left or
666+ # right branch in the query tree.
562667 else :
563668 left = 2 * node1 + 1
564669 right = 2 * node1 + 2
@@ -584,6 +689,8 @@ cdef class KDTreeBoruvkaAlgorithm (object):
584689 return 0
585690
586691 def spanning_tree (self ):
692+ """ Compute the minimum spanning tree of the data held by
693+ the tree passed in at construction"""
587694
588695 # cdef np.intp_t num_components
589696 # cdef np.intp_t num_nodes
@@ -597,6 +704,35 @@ cdef class KDTreeBoruvkaAlgorithm (object):
597704 return self .edges
598705
599706cdef class BallTreeBoruvkaAlgorithm (object ):
707+ """ A Dual Tree Boruvka Algorithm implemented for the sklearn
708+ BallTree space tree implementation.
709+
710+ Parameters
711+ ----------
712+
713+ tree : BallTree
714+ The ball-tree to run Dual Tree Boruvka over.
715+
716+ min_samples : int (default 5)
717+ The min_samples parameter of HDBSCAN used to
718+ determine core distances.
719+
720+ metric : string (default 'euclidean')
721+ The metric used to compute distances for the tree
722+
723+ leaf_size : int (default 20)
724+ The Boruvka algorithm benefits from a smaller leaf size than
725+ standard kd-tree nearest neighbor searches. The tree passed in
726+ is used for a kNN search for core distance. A second tree is
727+ constructed with a smaller leaf size for Boruvka; this is that
728+ leaf size.
729+
730+ alpha : float (default 1.0)
731+ The alpha distance scaling parameter as per Robust Single Linkage.
732+
733+ **kwargs :
734+ Keyword args passed to the metric.
735+ """
600736
601737 cdef object tree
602738 cdef object core_dist_tree
@@ -695,6 +831,7 @@ cdef class BallTreeBoruvkaAlgorithm (object):
695831 self .bounds_ptr = < np.double_t * > & self .bounds[0 ]
696832
697833 cdef _compute_bounds(self ):
834+ """ Initialize core distances"""
698835
699836 cdef np.intp_t n
700837
@@ -724,6 +861,8 @@ cdef class BallTreeBoruvkaAlgorithm (object):
724861 self .bounds_arr[n] = < np.double_t> DBL_MAX
725862
726863 cdef _initialize_components(self ):
864+ """ Initialize components of the min spanning tree (eventually there
865+ is only one component; initially each point is its own component)"""
727866
728867 cdef np.intp_t n
729868
@@ -737,6 +876,10 @@ cdef class BallTreeBoruvkaAlgorithm (object):
737876 self .component_of_node[n] = - (n+ 1 )
738877
739878 cdef update_components(self ):
879+ """ Having found the nearest neighbor not in the same component for
880+ each current component (via tree traversal), run through adding
881+ edges to the min spanning tree and recomputing components via
882+ union find."""
740883
741884 cdef np.intp_t source
742885 cdef np.intp_t sink
@@ -807,6 +950,10 @@ cdef class BallTreeBoruvkaAlgorithm (object):
807950 return self .components.shape[0 ]
808951
809952 cdef int dual_tree_traversal(self , np.intp_t node1, np.intp_t node2) except - 1 :
953+ """ Perform a dual tree traversal, pruning wherever possible, to find the nearest
954+ neighbor not in the same component for each component. This is akin to a
955+ standard dual tree NN search, but we also prune whenever all points in query
956+ and reference nodes are in the same component."""
810957
811958 cdef np.intp_t[::1 ] point_indices1, point_indices2
812959
@@ -850,6 +997,9 @@ cdef class BallTreeBoruvkaAlgorithm (object):
850997 node_dist = balltree_min_dist_dual(node1_info.radius, node2_info.radius,
851998 node1, node2, self .centroid_distances)
852999
1000+ # If the distance between the nodes is less than the current bound for the query
1001+ # and the nodes are not in the same component continue; otherwise we get to prune
1002+ # this branch and return early.
8531003 if node_dist < self .bounds_ptr[node1]:
8541004 if self .component_of_node_ptr[node1] == self .component_of_node_ptr[node2] and \
8551005 self .component_of_node_ptr[node1] >= 0 :
@@ -858,6 +1008,32 @@ cdef class BallTreeBoruvkaAlgorithm (object):
8581008 return 0
8591009
8601010
1011+ # Case 1: Both nodes are leaves
1012+ # for each pair of points in node1 x node2 we need
1013+ # to compute the distance and see if it better than
1014+ # the current nearest neighbor for the component of
1015+ # the point in the query node.
1016+ #
1017+ # We get to take some shortcuts:
1018+ # - if the core distance for a point is larger than
1019+ # the distance to the nearst neighbor of the
1020+ # component of the point ... then we can't get
1021+ # a better mutual reachability distance and we
1022+ # can skip computing anything for that point
1023+ # - if the points are in the same component we
1024+ # don't have to compute the distance.
1025+ #
1026+ # We also have some catches:
1027+ # - we need to compute mutual reachability distance
1028+ # not just the ordinary distance; this involves
1029+ # fiddling with core distances.
1030+ # - We need to scale distances according to alpha,
1031+ # but don't want to lose performance in the case
1032+ # that alpha is 1.0.
1033+ #
1034+ # Finally we can compute new bounds for the query node
1035+ # based on the distances found here, so do that and
1036+ # propagate the results up the tree.
8611037 if node1_info.is_leaf and node2_info.is_leaf:
8621038
8631039 new_bound = 0.0
@@ -900,6 +1076,9 @@ cdef class BallTreeBoruvkaAlgorithm (object):
9001076 new_upper_bound = max (new_upper_bound, self .candidate_distance_ptr[component1])
9011077 new_lower_bound = min (new_lower_bound, self .candidate_distance_ptr[component1])
9021078
1079+ # Compute new bounds for the query node, and
1080+ # then propagate the results of that computation
1081+ # up the tree.
9031082 new_bound = min (new_upper_bound, new_lower_bound + 2 * node1_info.radius)
9041083 if new_bound < self .bounds_ptr[node1]:
9051084 self .bounds_ptr[node1] = new_bound
@@ -931,6 +1110,13 @@ cdef class BallTreeBoruvkaAlgorithm (object):
9311110 break
9321111
9331112
1113+ # Case 2a: The query node is a leaf, or is smaller than
1114+ # the reference node.
1115+ #
1116+ # We descend in the reference tree. We first
1117+ # compute distances between nodes to determine
1118+ # whether we should prioritise the left or
1119+ # right branch in the reference tree.
9341120 elif node1_info.is_leaf or (not node2_info.is_leaf
9351121 and node2_info.radius > node1_info.radius):
9361122
@@ -953,6 +1139,14 @@ cdef class BallTreeBoruvkaAlgorithm (object):
9531139 else :
9541140 self .dual_tree_traversal(node1, right)
9551141 self .dual_tree_traversal(node1, left)
1142+
1143+ # Case 2b: The reference node is a leaf, or is smaller than
1144+ # the query node.
1145+ #
1146+ # We descend in the query tree. We first
1147+ # compute distances between nodes to determine
1148+ # whether we should prioritise the left or
1149+ # right branch in the query tree.
9561150 else :
9571151 left = 2 * node1 + 1
9581152 right = 2 * node1 + 2
@@ -978,6 +1172,8 @@ cdef class BallTreeBoruvkaAlgorithm (object):
9781172 return 0
9791173
9801174 cpdef spanning_tree(self ):
1175+ """ Compute the minimum spanning tree of the data held by
1176+ the tree passed in at construction"""
9811177
9821178 cdef np.intp_t num_components
9831179 cdef np.intp_t num_nodes
0 commit comments