Skip to content

Commit 7b9214f

Browse files
Big speed-up in DBSCAN-based cluster breaking using parallel + no GIL
1 parent f6bafd9 commit 7b9214f

File tree

2 files changed

+26
-27
lines changed

2 files changed

+26
-27
lines changed

spine/utils/cluster/label.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ def __init__(self, break_eps=1.1, break_metric='chebyshev',
4747
Distance scale used in the break up procedure
4848
break_metric : str, default 'chebyshev'
4949
Distance metric used in the break up procedure
50-
break_classes : List[int], default
50+
break_classes : List[int], default
5151
[SHOWR_SHP, TRACK_SHP, MICHL_SHP, DELTA_SHP]
5252
Classes to run DBSCAN on to break up
5353
"""
@@ -141,12 +141,12 @@ def _process(self, clust_label, seg_label, seg_pred, ghost_pred=None):
141141
if not len(clust_label):
142142
if ghost_pred is None:
143143
shape = (len(coords), num_cols)
144-
dummy_labels = -1 * self._ones(shape)
144+
dummy_labels = -self._ones(shape)
145145
dummy_labels[:, :VALUE_COL] = coords
146146

147147
else:
148148
shape = (len(deghost_index), num_cols)
149-
dummy_labels = -1 * self._ones(shape)
149+
dummy_labels = -self._ones(shape)
150150
dummy_labels[:, :VALUE_COL] = coords[deghost_index]
151151

152152
return dummy_labels
@@ -159,7 +159,7 @@ def _process(self, clust_label, seg_label, seg_pred, ghost_pred=None):
159159
seg_pred = seg_pred_long
160160

161161
# Prepare new labels
162-
new_label = -1. * self._ones((len(coords), num_cols))
162+
new_label = -self._ones((len(coords), num_cols))
163163
new_label[:, :VALUE_COL] = coords
164164

165165
# Check if the segment labels and predictions are compatible. If they are
@@ -172,7 +172,7 @@ def _process(self, clust_label, seg_label, seg_pred, ghost_pred=None):
172172
true_deghost = seg_label < GHOST_SHP
173173
seg_mismatch = ~compat_mat[(seg_pred, seg_label)]
174174
new_label[true_deghost] = clust_label
175-
new_label[seg_mismatch & true_deghost, VALUE_COL:] = -self._ones(1)
175+
new_label[true_deghost & seg_mismatch, VALUE_COL:] = -self._ones(1)
176176

177177
# For mismatched predictions, attempt to find a touching instance of the
178178
# same class to assign it sensible cluster labels.
@@ -182,7 +182,7 @@ def _process(self, clust_label, seg_label, seg_pred, ghost_pred=None):
182182
continue
183183

184184
# Restrict to points in this class that have incompatible segment
185-
# labels. Track points do not mix, EM points are allowed to.
185+
# labels. Track points do not mix, EM points are allowed to.
186186
bad_index = self._where(
187187
(seg_pred == s) & (~true_deghost | seg_mismatch))[0]
188188
if len(bad_index) == 0:
@@ -211,7 +211,7 @@ def _process(self, clust_label, seg_label, seg_pred, ghost_pred=None):
211211
if tagged_voxels_count > 0:
212212
# Use the label of the touching true voxel
213213
additional_clust_label = self._cat(
214-
[X_pred[select_index],
214+
[X_pred[select_index],
215215
X_true[closest_ids[select_index], VALUE_COL:]], 1)
216216
new_label[bad_index[select_index]] = additional_clust_label
217217

spine/utils/gnn/cluster.py

Lines changed: 19 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -364,39 +364,38 @@ def break_clusters(data, clusts, eps, metric):
364364
if not len(clusts):
365365
return np.copy(data[:, CLUST_COL])
366366

367-
return _break_clusters(data, clusts, eps, metric)
367+
# Break labels
368+
break_labels = _break_clusters(data, clusts, eps, metric)
368369

369-
@nb.njit(cache=True)
370+
# Offset individual broken labels to prevent overlap
371+
labels = np.copy(data[:, CLUST_COL])
372+
offset = np.max(labels) + 1
373+
for k, clust in enumerate(clusts):
374+
# Update IDs, offset
375+
ids = break_labels[clust]
376+
labels[clust] = offset + ids
377+
offset += len(np.unique(ids))
378+
379+
return labels
380+
381+
@nb.njit(cache=True, parallel=True, nogil=True)
370382
def _break_clusters(data: nb.float64[:,:],
371383
clusts: nb.types.List(nb.int64[:]),
372384
eps: nb.float64,
373385
metric: str) -> nb.float64[:]:
374-
# Get the relevant data products
375-
points = data[:, COORD_COLS]
376-
labels = data[:, CLUST_COL]
377-
378386
# Loop over clusters to break, run DBSCAN
379-
break_ids = np.full_like(labels, -1)
380-
ids = np.arange(len(clusts)).astype(np.int64)
381-
for k in range(len(clusts)):
387+
break_labels = np.full(len(data), -1, dtype=data.dtype)
388+
points = data[:, COORD_COLS]
389+
for k in nb.prange(len(clusts)):
382390
# Restrict the points to those in the cluster
383-
clust = clusts[ids[k]]
391+
clust = clusts[k]
384392
points_c = points[clust]
385393

386394
# Run DBSCAN on the cluster, update labels
387395
clust_ids = nbl.dbscan(points_c, eps=eps, metric=metric)
388396

389397
# Store the breaking IDs
390-
break_ids[clust] = clust_ids
391-
392-
# Update the break IDs to ensure no overlap (has to be sequential)
393-
break_labels = np.copy(labels)
394-
offset = np.max(labels) + 1
395-
for k, clust in enumerate(clusts):
396-
# Update IDs, offset
397-
ids = break_ids[clust]
398-
break_labels[clust] = offset + ids
399-
offset += len(np.unique(ids))
398+
break_labels[clust] = clust_ids
400399

401400
return break_labels
402401

0 commit comments

Comments
 (0)