|
| 1 | +import numba |
| 2 | +import numpy as np |
| 3 | +from collections import namedtuple |
| 4 | + |
| 5 | +from .disjoint_set import ds_rank_create |
| 6 | +from .hdbscan import clusters_from_spanning_tree |
| 7 | +from .cluster_trees import empty_condensed_tree |
| 8 | +from .boruvka import merge_components, update_point_components |
| 9 | + |
# CSR-style container for a flattened graph: per-edge `weights`, `distances`
# and child `indices`, plus the per-point offsets `indptr` into those arrays.
CoreGraph = namedtuple(
    "CoreGraph",
    ("weights", "distances", "indices", "indptr"),
)
| 11 | + |
| 12 | + |
@numba.njit(parallel=True)
def knn_mst_union(neighbors, core_distances, min_spanning_tree, lens_values):
    """
    Combine kNN edges and spanning-tree edges into a list-of-dicts graph.

    Parameters
    ----------
    neighbors : 2D array
        Row `i` lists the neighbor indices of point `i`; negative entries
        are skipped.
    core_distances : 1D array
        One value per point, indexed by point and by neighbor.
    min_spanning_tree : 2D array
        Rows unpack as `(parent, child, distance)`.
    lens_values : 1D array
        One value per point, indexed by point and by neighbor.

    Returns
    -------
    list of dict
        One dictionary per row of `neighbors`, mapping a child index to a
        `(weight, distance)` pair. For kNN edges the weight is the larger of
        the two lens values and the distance is the larger of the two core
        distances. Spanning-tree edges that are not already present in the
        parent's dictionary are added with the same weight rule and the
        tree's own distance.
    """
    # One empty dictionary per point. The template entry is never inserted
    # (`range(0)`); presumably it is there so numba can infer the types.
    graph = [
        {np.int32(0): (np.float64(0.0), np.float64(0.0)) for _ in range(0)}
        for _ in range(neighbors.shape[0])
    ]

    # kNN edges: each point only writes to its own dictionary.
    for point in numba.prange(len(core_distances)):
        edges = graph[point]
        point_lens = lens_values[point]
        point_core = core_distances[point]
        for neighbor in neighbors[point]:
            if neighbor >= 0:
                edges[neighbor] = (
                    max(point_lens, lens_values[neighbor]),
                    max(point_core, core_distances[neighbor]),
                )

    # Spanning-tree edges that the kNN pass did not already add.
    for raw_parent, raw_child, distance in min_spanning_tree:
        source = np.int32(raw_parent)
        target = np.int32(raw_child)
        edges = graph[source]
        if target not in edges:
            edges[target] = (
                max(lens_values[source], lens_values[target]),
                distance,
            )

    return graph
| 44 | + |
| 45 | + |
@numba.njit(parallel=True)
def sort_by_lens(graph):
    """
    Reorder every point's children by ascending edge weight.

    Each dictionary in `graph` is replaced by a new dictionary whose entries
    are inserted in order of the first element of their value tuple. The
    list is modified in place and also returned.
    """
    for point in numba.prange(len(graph)):
        ordered = sorted(graph[point].items(), key=lambda entry: entry[1][0])
        graph[point] = {child: edge for child, edge in ordered}
    return graph
| 53 | + |
| 54 | + |
@numba.njit(parallel=True)
def apply_lens(core_graph, lens_values):
    """
    Re-weight every edge of `core_graph` with a new lens and re-sort it.

    The weight of an edge becomes the larger of the lens values of its two
    endpoints; the stored distance (second tuple element) is kept. The
    dictionaries are updated in place and the result of `sort_by_lens` on
    the graph is returned.
    """
    for point in numba.prange(len(lens_values)):
        edges = core_graph[point]
        own_lens = lens_values[point]
        # Only values are overwritten, so the set of keys does not change
        # while the dictionary is being iterated.
        for child, edge in edges.items():
            weight = max(own_lens, lens_values[child])
            edges[child] = (weight, edge[1])
    return sort_by_lens(core_graph)
| 64 | + |
| 65 | + |
@numba.njit()
def flatten_to_csr(graph):
    """
    Flatten a list-of-dicts graph into a `CoreGraph` of CSR-style arrays.

    Returns
    -------
    CoreGraph
        `indptr` (int32, length `len(graph) + 1`) holds the cumulative child
        counts; `indices` (int32), `weights` and `distances` (both float32)
        hold, for point `p`, its children and their value tuples in
        dictionary order at positions `indptr[p]:indptr[p + 1]`.
    """
    num_points = len(graph)

    # Cumulative number of children gives the row offsets.
    indptr = np.empty(num_points + 1, dtype=np.int32)
    indptr[0] = 0
    running_total = 0
    for point in range(num_points):
        running_total += len(graph[point])
        indptr[point + 1] = running_total

    num_edges = indptr[-1]
    weights = np.empty(num_edges, dtype=np.float32)
    distances = np.empty(num_edges, dtype=np.float32)
    indices = np.empty(num_edges, dtype=np.int32)

    # Every point fills its own slice of the flat arrays.
    for point in numba.prange(num_points):
        first = indptr[point]
        for position, (child, edge) in enumerate(graph[point].items()):
            slot = first + position
            weights[slot] = edge[0]
            distances[slot] = edge[1]
            indices[slot] = child

    return CoreGraph(weights, distances, indices, indptr)
| 89 | + |
| 90 | + |
@numba.njit(locals={"parent": numba.types.int32})
def select_components(graph, point_components):
    """
    Pick one candidate outgoing edge per component.

    For every point with at least one child, the first child in its
    dictionary is taken as the candidate (this is the lowest-weight child
    only when the dictionary is ordered by weight, as `sort_by_lens`
    produces). Per component, the candidate with the strictly smallest
    weight wins; on ties the earlier point is kept.

    Returns
    -------
    dict
        Maps a component label to `(parent, neighbor, weight)`.
    """
    # Empty dictionary; the template entry is never inserted (`range(0)`).
    best_edges = {
        np.int64(0): (np.int32(0), np.int32(1), np.float32(0.0)) for _ in range(0)
    }

    for parent, (children, from_component) in enumerate(zip(graph, point_components)):
        if len(children) > 0:
            neighbor = next(iter(children.keys()))
            weight = np.float32(children[neighbor][0])
            if (
                from_component not in best_edges
                or weight < best_edges[from_component][2]
            ):
                best_edges[from_component] = (parent, neighbor, weight)

    return best_edges
| 110 | + |
| 111 | + |
@numba.njit()  # enabling parallel breaks this function
def update_graph_components(graph, point_components):
    """
    Drop every edge whose two endpoints share a component.

    Each dictionary in `graph` is replaced, in place, by a filtered copy
    that keeps only children whose component label differs from the
    point's own. Nothing is returned.
    """
    # A filtered copy is built because deleting from a dictionary during
    # iteration breaks in numba.
    for point in numba.prange(len(graph)):
        own_component = point_components[point]
        graph[point] = {
            child: edge
            for child, edge in graph[point].items()
            if point_components[child] != own_component
        }
| 121 | + |
| 122 | + |
@numba.njit()
def minimum_spanning_tree(graph, overwrite=False):
    """
    Implements Boruvka on lod-style (list-of-dicts) graph with multiple
    connected components.

    Parameters
    ----------
    graph : list of dict
        One dictionary per point, mapping child to a value tuple.
    overwrite : bool
        When False, the list is shallow-copied first, so the caller's list
        keeps its original dictionaries while entries of the copy are
        replaced during the run.

    Returns
    -------
    n_components : int
        Number of components left when the loop stops; greater than one if
        no further merging edges were found.
    point_components : 1D array
        Component label of every point.
    result : 2D float64 array with 3 columns
        All edge rows returned by `merge_components`, in the order found.
    """
    if not overwrite:
        graph = [edges for edges in graph]

    num_points = len(graph)
    disjoint_set = ds_rank_create(num_points)
    point_components = np.arange(num_points)
    n_components = num_points

    # Empty list of edge batches; the template array is never inserted.
    batches = [np.empty((0, 3), dtype=np.float64) for _ in range(0)]
    total_edges = 0
    while n_components > 1:
        candidates = select_components(graph, point_components)
        merged = merge_components(disjoint_set, candidates)
        if merged.shape[0] == 0:
            # Nothing left to merge: the remaining components are disconnected.
            break

        batches.append(merged)
        total_edges += merged.shape[0]
        update_point_components(disjoint_set, point_components)
        update_graph_components(graph, point_components)
        n_components -= merged.shape[0]

    # Concatenate the per-round batches into one edge array.
    result = np.empty((total_edges, 3), dtype=np.float64)
    cursor = 0
    for batch in batches:
        stop = cursor + batch.shape[0]
        result[cursor:stop] = batch
        cursor = stop
    return n_components, point_components, result
| 156 | + |
| 157 | + |
@numba.njit()
def core_graph_spanning_tree(neighbors, core_distances, min_spanning_tree, lens):
    """
    Build the lens-weighted core graph and its spanning tree.

    Returns `(n_components, point_components, edges, core_graph)`: the three
    outputs of `minimum_spanning_tree` on the weight-sorted kNN/MST union
    graph, followed by that graph flattened with `flatten_to_csr`.
    """
    union_graph = knn_mst_union(neighbors, core_distances, min_spanning_tree, lens)
    graph = sort_by_lens(union_graph)
    n_components, point_components, edges = minimum_spanning_tree(graph)
    return n_components, point_components, edges, flatten_to_csr(graph)
| 164 | + |
| 165 | + |
def core_graph_clusters(
    lens,
    neighbors,
    core_distances,
    min_spanning_tree,
    **kwargs,
):
    """
    Cluster points from the lens-weighted core graph.

    Parameters
    ----------
    lens, neighbors, core_distances, min_spanning_tree
        Forwarded to `core_graph_spanning_tree` (note that `lens` is passed
        last there).
    **kwargs
        Forwarded unchanged to `clusters_from_spanning_tree`.

    Returns
    -------
    tuple
        If the core graph has more than one connected component, the
        components themselves are returned as clusters:
        `(labels, ones, empty (0, 4) array, empty_condensed_tree(),
        lensed_mst, graph)`, where `labels` are consecutive integers from 0
        and `ones` is a float32 array of ones with one entry per point.
        Otherwise, the outputs of `clusters_from_spanning_tree(lensed_mst)`
        followed by `graph`.
    """
    num_components, component_labels, lensed_mst, graph = core_graph_spanning_tree(
        neighbors, core_distances, min_spanning_tree, lens
    )
    if num_components > 1:
        # Map the labels to 0..k-1 in sorted order in one pass. The inverse
        # index from np.unique is that mapping, so there is no need to
        # rescan the whole array once per component.
        component_labels[:] = np.unique(component_labels, return_inverse=True)[1]
        return (
            component_labels,
            np.ones(len(component_labels), dtype=np.float32),
            np.empty((0, 4)),
            empty_condensed_tree(),
            lensed_mst,
            graph,
        )

    return (
        *clusters_from_spanning_tree(lensed_mst, **kwargs),
        graph,
    )
| 192 | + |
| 193 | + |
def core_graph_to_rec_array(graph):
    """
    Convert a CSR-style core graph to a structured array of edges.

    `graph` must expose `indptr`, `indices`, `weights` and `distances`. The
    result has one record per edge with fields `parent` and `child` (int32)
    and `weight` and `distance` (float32); the parent of each edge is
    recovered from the row offsets in `indptr`.
    """
    offsets = graph.indptr
    edge_dtype = [
        ("parent", np.int32),
        ("child", np.int32),
        ("weight", np.float32),
        ("distance", np.float32),
    ]
    edges = np.empty(offsets[-1], dtype=edge_dtype)

    # Row `p` owns `offsets[p + 1] - offsets[p]` edges, so repeat `p` that often.
    parents = np.repeat(np.arange(len(offsets) - 1), np.diff(offsets))
    columns = {
        "parent": parents,
        "child": graph.indices,
        "weight": graph.weights,
        "distance": graph.distances,
    }
    for field, values in columns.items():
        edges[field] = values
    return edges
| 211 | + |
| 212 | + |
def core_graph_to_edge_list(graph):
    """
    Convert a CSR-style core graph to a float64 edge table.

    `graph` must expose `indptr`, `indices`, `weights` and `distances`. The
    result has one row per edge with columns
    `(parent, child, weight, distance)`; the parent of each edge is
    recovered from the row offsets in `indptr`.
    """
    offsets = graph.indptr
    # Row `p` owns `offsets[p + 1] - offsets[p]` edges, so repeat `p` that often.
    parents = np.repeat(np.arange(len(offsets) - 1), np.diff(offsets))
    columns = (parents, graph.indices, graph.weights, graph.distances)

    edges = np.empty((offsets[-1], len(columns)), dtype=np.float64)
    for position, values in enumerate(columns):
        edges[:, position] = values
    return edges
0 commit comments