Skip to content

Commit c6e2b04

Browse files
committed
Added graph connected components for postprocessing
1 parent 2fa29b1 commit c6e2b04

File tree

1 file changed

+153
-28
lines changed

1 file changed

+153
-28
lines changed

flamingo_tools/segmentation/postprocessing.py

Lines changed: 153 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
1+
import math
12
import multiprocessing as mp
2-
import os
33
from concurrent import futures
4-
from typing import Callable, Tuple, Optional
4+
from typing import Callable, List, Optional, Tuple
55

66
import elf.parallel as parallel
77
import numpy as np
88
import nifty.tools as nt
9+
import networkx as nx
910
import pandas as pd
1011

1112
from elf.io import open_file
@@ -258,16 +259,16 @@ def erode_subset(
258259
def downscaled_centroids(
259260
table: pd.DataFrame,
260261
scale_factor: int,
261-
ref_dimensions: Optional[Tuple[float,float,float]] = None,
262-
capped: Optional[bool] = True,
262+
ref_dimensions: Optional[Tuple[float, float, float]] = None,
263+
downsample_mode: Optional[str] = "accumulated",
263264
) -> np.typing.NDArray:
264265
"""Downscale centroids in dataframe.
265266
266267
Args:
267268
table: Dataframe of segmentation table.
268269
scale_factor: Factor for downscaling coordinates.
269270
ref_dimensions: Reference dimensions for downscaling. Taken from centroids if not supplied.
270-
capped: Flag for capping output of array at 1 for the creation of a binary mask.
271+
downsample_mode: Flag for downsampling, either 'accumulated', 'capped', or 'components'
271272
272273
Returns:
273274
The downscaled array
@@ -284,23 +285,35 @@ def downscaled_centroids(
284285
bounding_dimensions_scaled = tuple([round(b // scale_factor + 1) for b in ref_dimensions])
285286
new_array = np.zeros(bounding_dimensions_scaled)
286287

287-
for c in centroids_scaled:
288-
new_array[int(c[0]), int(c[1]), int(c[2])] += 1
288+
if downsample_mode == "accumulated":
289+
for c in centroids_scaled:
290+
new_array[int(c[0]), int(c[1]), int(c[2])] += 1
289291

290-
array_downscaled = np.round(new_array).astype(int)
292+
elif downsample_mode == "capped":
293+
new_array = np.round(new_array).astype(int)
294+
new_array[new_array >= 1] = 1
291295

292-
if capped:
293-
array_downscaled[array_downscaled >= 1] = 1
296+
elif downsample_mode == "components":
297+
if "component_labels" not in table.columns:
298+
raise KeyError("Dataframe must continue key 'component_labels' for downsampling with mode 'components'.")
299+
component_labels = list(table["component_labels"])
300+
for comp, centr in zip(component_labels, centroids_scaled):
301+
if comp != 0:
302+
new_array[int(centr[0]), int(centr[1]), int(centr[2])] = comp
303+
new_array = np.round(new_array).astype(int)
294304

295-
return array_downscaled
305+
else:
306+
raise ValueError("Choose one of the downsampling modes 'accumulated', 'capped', or 'components'.")
307+
308+
return new_array
296309

297310

298311
def coordinates_in_downscaled_blocks(
299312
table: pd.DataFrame,
300313
down_array: np.typing.NDArray,
301314
scale_factor: float,
302315
distance_component: Optional[int] = 0,
303-
) -> list:
316+
) -> List[int]:
304317
"""Checking if coordinates are within the downscaled array.
305318
306319
Args:
@@ -318,12 +331,12 @@ def coordinates_in_downscaled_blocks(
318331

319332
# check if input coordinates are within down-sampled blocks
320333
centroids = list(zip(table["anchor_x"], table["anchor_y"], table["anchor_z"]))
321-
centroids_scaled = [np.floor(np.array([c[0]/scale_factor, c[1]/scale_factor, c[2]/scale_factor])) for c in centroids]
334+
centroids = [np.floor(np.array([c[0]/scale_factor, c[1]/scale_factor, c[2]/scale_factor])) for c in centroids]
322335

323336
distance_map = distance_transform_edt(down_array == 0)
324337

325338
centroids_binary = []
326-
for c in centroids_scaled:
339+
for c in centroids:
327340
coord = (int(c[0]), int(c[1]), int(c[2]))
328341
if down_array[coord] != 0:
329342
centroids_binary.append(1)
@@ -335,13 +348,81 @@ def coordinates_in_downscaled_blocks(
335348
return centroids_binary
336349

337350

338-
def erode_sgn_seg(
351+
def erode_sgn_seg_graph(
    table: pd.DataFrame,
    keyword: Optional[str] = "distance_nn100",
    threshold_erode: Optional[float] = None,
    edge_threshold: float = 30,
    min_component_length: int = 100,
) -> List[List[int]]:
    """Eroding the SGN segmentation.

    The table is first thinned out with `erode_subset` based on the spatial-statistics
    column given by `keyword`. The surviving centroids are connected in a graph whenever
    they lie within `edge_threshold` of each other, and connected components with fewer
    than `min_component_length` nodes are discarded.

    Args:
        table: Dataframe of segmentation table.
        keyword: Keyword of the dataframe column for erosion.
        threshold_erode: Threshold of column value after erosion step with spatial statistics.
        edge_threshold: Maximal distance (in micrometer) between two centroids for a graph edge.
        min_component_length: Minimal number of nodes for a connected component to be kept.

    Returns:
        Subgraph components as lists of label_ids of dataframe.
    """
    # Local import: scipy is only needed for the KD-tree based edge construction.
    from scipy.spatial import cKDTree

    print("initial length", len(table))
    distance_nn = sorted(table[keyword])

    if len(table) < 20000:
        iterations = 1
        min_cells = None
        # Default threshold: 80th percentile of the nearest-neighbor distances.
        average_dist = int(distance_nn[int(len(table) * 0.8)])
        threshold = threshold_erode if threshold_erode is not None else average_dist
    else:
        iterations = 15
        min_cells = 20000
        threshold = threshold_erode if threshold_erode is not None else 40

    print(f"Using threshold of {threshold} micrometer for eroding segmentation with keyword {keyword}.")

    new_subset = erode_subset(table.copy(), iterations=iterations,
                              threshold=threshold, min_cells=min_cells, keyword=keyword)

    # create graph from coordinates of eroded subset
    centroids_subset = list(zip(new_subset["anchor_x"], new_subset["anchor_y"], new_subset["anchor_z"]))
    labels_subset = [int(i) for i in new_subset["label_id"]]

    graph = nx.Graph()
    for num, pos in zip(labels_subset, centroids_subset):
        graph.add_node(num, pos=pos)

    # Create edges between points whose distance is at most edge_threshold.
    # A KD-tree query replaces the previous quadratic all-pairs distance loop.
    tree = cKDTree(centroids_subset)
    for a, b in tree.query_pairs(r=edge_threshold):
        dist = math.dist(centroids_subset[a], centroids_subset[b])
        graph.add_edge(labels_subset[a], labels_subset[b], weight=dist)

    # remove connected components with fewer nodes than min_component_length
    for component in list(nx.connected_components(graph)):
        if len(component) < min_component_length:
            graph.remove_nodes_from(component)

    components = list(nx.connected_components(graph))

    return components
417+
418+
419+
def erode_sgn_seg_downscaling(
339420
table: pd.DataFrame,
340421
keyword: Optional[str] = "distance_nn100",
341422
filter_small_components: Optional[int] = None,
342423
scale_factor: Optional[float] = 20,
343424
threshold_erode: Optional[float] = None,
344-
) -> Tuple[pd.DataFrame,np.typing.NDArray,np.typing.NDArray,np.typing.NDArray]:
425+
) -> Tuple[np.typing.NDArray, np.typing.NDArray]:
345426
"""Eroding the SGN segmentation.
346427
347428
Args:
@@ -355,7 +436,6 @@ def erode_sgn_seg(
355436
The labeled components of the downscaled, eroded coordinates.
356437
The larget connected component of the labeled components.
357438
"""
358-
359439
ref_dimensions = (max(table["anchor_x"]), max(table["anchor_y"]), max(table["anchor_z"]))
360440
print("initial length", len(table))
361441
distance_nn = list(table[keyword])
@@ -375,7 +455,9 @@ def erode_sgn_seg(
375455

376456
new_subset = erode_subset(table.copy(), iterations=iterations,
377457
threshold=threshold, min_cells=min_cells, keyword=keyword)
458+
378459
eroded_arr = downscaled_centroids(new_subset, scale_factor=scale_factor, ref_dimensions=ref_dimensions)
460+
379461
# Label connected components
380462
labeled, num_features = label(eroded_arr)
381463

@@ -387,7 +469,7 @@ def erode_sgn_seg(
387469
largest_component = (labeled == largest_label).astype(np.uint8)
388470
largest_component_filtered = binary_fill_holes(largest_component).astype(np.uint8)
389471

390-
#filter small sizes
472+
# filter small sizes
391473
if filter_small_components is not None:
392474
for (size, feature) in zip(sizes, range(1, num_features + 1)):
393475
if size < filter_small_components:
@@ -396,11 +478,12 @@ def erode_sgn_seg(
396478
return labeled, largest_component_filtered
397479

398480

399-
def get_components(table: pd.DataFrame,
481+
def get_components(
482+
table: pd.DataFrame,
400483
labeled: np.typing.NDArray,
401484
scale_factor: float,
402485
distance_component: Optional[int] = 0,
403-
) -> list:
486+
) -> List[int]:
404487
"""Indexing coordinates according to labeled array.
405488
406489
Args:
@@ -423,29 +506,71 @@ def get_components(table: pd.DataFrame,
423506
for label_index, l in enumerate(unique_labels):
424507
label_arr = (labeled == l).astype(np.uint8)
425508
centroids_binary = coordinates_in_downscaled_blocks(table, label_arr,
426-
scale_factor, distance_component = distance_component)
509+
scale_factor, distance_component=distance_component)
427510
for num, c in enumerate(centroids_binary):
428511
if c != 0:
429512
component_labels[num] = label_index + 1
430513

431514
return component_labels
432515

433516

434-
def postprocess_sgn_seg(table: pd.DataFrame, scale_factor: Optional[float] = 20) -> pd.DataFrame:
517+
def component_labels_graph(table: pd.DataFrame) -> List[int]:
    """Label components using graph connected components.

    Args:
        table: Dataframe of segmentation table.

    Returns:
        List of component label for each point in dataframe. Labels are 1-based and
        ordered by component size (1 = largest); 0 marks points in no kept component.
    """
    components = erode_sgn_seg_graph(table)

    # Sort by size only. The previous zip/sort compared the component *sets*
    # on size ties (an ill-defined partial order) and crashed via
    # zip(*...) unpacking when no component survived.
    components = sorted(components, key=len, reverse=True)

    # Components contain label_ids, which are not necessarily the positional
    # row indices of the dataframe — map them explicitly to row positions.
    row_of_label = {int(label_id): row for row, label_id in enumerate(table["label_id"])}

    component_labels = [0] * len(table)
    for lab, comp in enumerate(components):
        for label_id in comp:
            component_labels[row_of_label[int(label_id)]] = lab + 1

    return component_labels
537+
538+
539+
def component_labels_downscaling(table: pd.DataFrame, scale_factor: float = 20) -> List[int]:
    """Label components using downscaling and connected components.

    Args:
        table: Dataframe of segmentation table.
        scale_factor: Factor for downscaling.

    Returns:
        List of component label for each point in dataframe.
    """
    # Erode the segmentation on a downscaled grid; only the labeled component
    # array is needed here, the largest-component mask is discarded.
    labeled_array, _ = erode_sgn_seg_downscaling(
        table,
        filter_small_components=10,
        scale_factor=scale_factor,
        threshold_erode=None,
    )
    # Assign each centroid of the table to its connected component.
    return get_components(table, labeled_array, scale_factor, distance_component=1)
554+
555+
556+
def postprocess_sgn_seg(
    table: pd.DataFrame,
    postprocess_type: Optional[str] = "downsampling",
) -> pd.DataFrame:
    """Postprocessing SGN segmentation of cochlea.

    Args:
        table: Dataframe of segmentation table.
        postprocess_type: Postprocessing method, either 'downsampling' or 'graph'.

    Returns:
        Dataframe with component labels.

    Raises:
        ValueError: If `postprocess_type` is not a supported method.
    """
    if postprocess_type == "downsampling":
        component_labels = component_labels_downscaling(table)
    elif postprocess_type == "graph":
        component_labels = component_labels_graph(table)
    else:
        # Previously an unknown type fell through and crashed later with a
        # NameError on the unbound `component_labels`.
        raise ValueError("Choose one of the postprocessing types 'downsampling' or 'graph'.")

    table.loc[:, "component_labels"] = component_labels

    return table

0 commit comments

Comments
 (0)