- import numpy as np
- import vigra
import multiprocessing as mp
from concurrent import futures
+ from typing import Callable, Tuple, Optional

- from skimage import measure
+ import elf.parallel as parallel
+ import numpy as np
+ import nifty.tools as nt
+ import pandas as pd
+ import vigra
+
+ from elf.io import open_file
from scipy.spatial import distance
from scipy.sparse import csr_matrix
- from tqdm import tqdm
+ from scipy.spatial import cKDTree, ConvexHull
+ from skimage import measure
from sklearn.neighbors import NearestNeighbors
+ from tqdm import tqdm

- import elf.parallel as parallel
- from elf.io import open_file
- import nifty.tools as nt

+ #
+ # Spatial statistics:
+ # Three different spatial statistics implementations that
+ # can be used as the basis of a filtering criterion.
+ #

- def distance_nearest_neighbors(tsv_table, n_neighbors=10, expand_table=True):
-     """Calculate average distance of n nearest neighbors.

-     :param DataFrame tsv_table:
-     :param int n_neighbors: Number of nearest neighbors
-     :param bool expand_table: Flag for expanding DataFrame
-     :returns: List of average distances
-     :rtype: list
-     """
-     centroids = list(zip(tsv_table["anchor_x"], tsv_table["anchor_y"], tsv_table["anchor_z"]))
+ def nearest_neighbor_distance(table: pd.DataFrame, n_neighbors: int = 10) -> np.ndarray:
+     """Compute the average distance to the n nearest neighbors.
+
+     Args:
+         table: The table with the centroid coordinates.
+         n_neighbors: The number of neighbors to take into account for the distance computation.

-     coordinates = np.array(centroids)
+     Returns:
+         The average distances to the n nearest neighbors.
+     """
+     centroids = list(zip(table["anchor_x"], table["anchor_y"], table["anchor_z"]))
+     centroids = np.array(centroids)

-     # nearest neighbor is always itself, so n_neighbors+=1
-     nbrs = NearestNeighbors(n_neighbors=n_neighbors + 1).fit(coordinates)
-     distances, indices = nbrs.kneighbors(coordinates)
+     # Nearest neighbor is always itself, so n_neighbors+=1.
+     nbrs = NearestNeighbors(n_neighbors=n_neighbors + 1).fit(centroids)
+     distances, indices = nbrs.kneighbors(centroids)

    # Average distance to nearest neighbors
-     distance_avg = [sum(d) / len(d) for d in distances[:, 1:]]
+     distance_avg = np.array([sum(d) / len(d) for d in distances[:, 1:]])
+     return distance_avg

-     if expand_table:
-         tsv_table['distance_nn' + str(n_neighbors)] = distance_avg

-     return distance_avg
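
# An illustrative usage sketch for nearest_neighbor_distance: it only assumes a table with
# MoBIE-style "anchor_x", "anchor_y", "anchor_z" columns, so synthetic centroids (made up here)
# are enough to try it out.
import numpy as np
import pandas as pd

rng = np.random.default_rng(42)
toy_table = pd.DataFrame(
    rng.uniform(0, 100, size=(200, 3)), columns=["anchor_x", "anchor_y", "anchor_z"]
)
avg_dist = nearest_neighbor_distance(toy_table, n_neighbors=5)  # one average distance per object
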
+ def local_ripleys_k(table: pd.DataFrame, radius: float = 15, volume: Optional[float] = None) -> np.ndarray:
+     """Compute the local Ripley's K function for each point in a 2D or 3D point set.

+     Args:
+         table: The table with the centroid coordinates.
+         radius: The radius within which to count neighboring points.
+         volume: The area (2D) or volume (3D) of the study region. If None, it is estimated from the convex hull.
+
+     Returns:
+         An array containing the local K values for each point.
+     """
+     points = list(zip(table["anchor_x"], table["anchor_y"], table["anchor_z"]))
+     points = np.array(points)
+     n_points, dim = points.shape
+
+     if dim not in (2, 3):
+         raise ValueError("Points array must be of shape (n_points, 2) or (n_points, 3).")
+
+     # Estimate area/volume if not provided.
+     if volume is None:
+         hull = ConvexHull(points)
+         volume = hull.volume  # For 2D, 'volume' is area; for 3D, it's volume.
+
+     # Compute point density.
+     density = n_points / volume
+
+     # Build a KD-tree for efficient neighbor search.
+     tree = cKDTree(points)
+
+     # Count neighbors within the specified radius for each point.
+     counts = tree.query_ball_point(points, r=radius)
+     local_counts = np.array([len(c) - 1 for c in counts])  # Exclude the point itself.
+
+     # Normalize by density to get local K values.
+     local_k = local_counts / density
+     return local_k

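# An illustrative sketch for local_ripleys_k on synthetic data (all values made up): points inside
# a dense cluster receive larger local K values than scattered points at the same radius.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
cluster = rng.normal(loc=50, scale=2, size=(100, 3))
scattered = rng.uniform(0, 100, size=(20, 3))
coords = np.concatenate([cluster, scattered])
toy_table = pd.DataFrame(coords, columns=["anchor_x", "anchor_y", "anchor_z"])
local_k = local_ripleys_k(toy_table, radius=10)  # larger values inside the cluster
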
- def filter_isolated_objects(
-     segmentation, output_path, tsv_table=None,
-     distance_threshold=15, neighbor_threshold=5, min_size=1000,
-     output_key="segmentation_postprocessed",
- ):
-     """Postprocessing step to filter isolated objects from a segmentation.

-     Instance segmentations are filtered if they have fewer neighbors
-     than a given threshold in a given distance around them.
-     Additionally, size filtering is possible if a TSV file is supplied.
-
-     :param dataset segmentation: Dataset containing the segmentation
-     :param str out_path: Output path for postprocessed segmentation
-     :param str tsv_file: Optional TSV file containing segmentation parameters in MoBIE format
-     :param int distance_threshold: Distance in micrometer to check for neighbors
-     :param int neighbor_threshold: Minimal number of neighbors for filtering
-     :param int min_size: Minimal number of pixels for filtering small instances
-     :param str output_key: Output key for postprocessed segmentation
+ def neighbors_in_radius(table: pd.DataFrame, radius: float = 15) -> np.ndarray:
+     """Compute the number of neighbors within a given radius.
+
+     Args:
+         table: The table with the centroid coordinates.
+         radius: The radius within which to count neighboring points.
+
+     Returns:
+         An array containing the number of neighbors within the given radius.
    """
-     if tsv_table is not None:
-         n_pixels = tsv_table["n_pixels"].to_list()
-         label_ids = tsv_table["label_id"].to_list()
-         centroids = list(zip(tsv_table["anchor_x"], tsv_table["anchor_y"], tsv_table["anchor_z"]))
-         n_ids = len(label_ids)
-
-         # filter out cells smaller than min_size
-         if min_size is not None:
-             min_size_label_ids = [l for (l, n) in zip(label_ids, n_pixels) if n <= min_size]
-             centroids = [c for (c, l) in zip(centroids, label_ids) if l not in min_size_label_ids]
-             label_ids = [int(lid) for lid in label_ids if lid not in min_size_label_ids]
-
-         coordinates = np.array(centroids)
-         label_ids = np.array(label_ids)
-
-     else:
-         segmentation, n_ids, _ = vigra.analysis.relabelConsecutive(segmentation[:], start_label=1, keep_zeros=True)
-         props = measure.regionprops(segmentation)
-         coordinates = np.array([prop.centroid for prop in props])
-         label_ids = np.unique(segmentation)[1:]
-
-     # Calculate pairwise distances and convert to a square matrix
-     dist_matrix = distance.pdist(coordinates)
+     points = list(zip(table["anchor_x"], table["anchor_y"], table["anchor_z"]))
+     points = np.array(points)
+
+     dist_matrix = distance.pdist(points)
    dist_matrix = distance.squareform(dist_matrix)

-     # Create sparse matrix of connections within the threshold distance
-     sparse_matrix = csr_matrix(dist_matrix < distance_threshold, dtype=int)
+     # Create sparse matrix of connections within the threshold distance.
+     sparse_matrix = csr_matrix(dist_matrix < radius, dtype=int)

-     # Sum each row to count neighbors
+     # Sum each row to count neighbors.
    neighbor_counts = sparse_matrix.sum(axis=1)
+     return np.array(neighbor_counts)
+
+
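# An illustrative sketch for neighbors_in_radius (synthetic data, values made up): these per-object
# counts are what filter_segmentation below compares against its threshold when this statistic is used.
import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
toy_table = pd.DataFrame(
    rng.uniform(0, 50, size=(100, 3)), columns=["anchor_x", "anchor_y", "anchor_z"]
)
counts = neighbors_in_radius(toy_table, radius=15)
isolated = counts.squeeze() < 5  # e.g. objects with fewer than 5 neighbors within the radius
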
+ #
+ # Filtering function:
+ # Filter the segmentation based on one of the spatial statistics from above.
+ #
+
+
+ def _compute_table(segmentation):
+     segmentation, n_ids, _ = vigra.analysis.relabelConsecutive(segmentation[:], start_label=1, keep_zeros=True)
+     props = measure.regionprops(segmentation)
+     # Note: regionprops already excludes the background label, so the properties are not sliced here.
+     coordinates = np.array([prop.centroid for prop in props])
+     label_ids = np.unique(segmentation)[1:]
+     sizes = np.array([prop.area for prop in props])
+     table = pd.DataFrame({
+         "label_id": label_ids,
+         "n_pixels": sizes,
+         "anchor_x": coordinates[:, 2],
+         "anchor_y": coordinates[:, 1],
+         "anchor_z": coordinates[:, 0],
+     })
+     return table
+
+
+ def filter_segmentation(
+     segmentation: np.typing.ArrayLike,
+     output_path: str,
+     spatial_statistics: Callable,
+     threshold: float,
+     min_size: int = 1000,
+     table: Optional[pd.DataFrame] = None,
+     output_key: str = "segmentation_postprocessed",
+ ) -> Tuple[int, int]:
+     """Postprocessing step to filter isolated objects from a segmentation.

-     filter_mask = np.array(neighbor_counts < neighbor_threshold).squeeze()
-     filter_ids = label_ids[filter_mask]
+     Instance segmentations are filtered based on spatial statistics and a threshold.
+     In addition, objects smaller than a given size are filtered out.
+
+     Args:
+         segmentation: Dataset containing the segmentation.
+         output_path: Output path for the postprocessed segmentation.
+         spatial_statistics: Callable that computes the per-object spatial statistic from the table,
+             e.g. one of the functions above.
+         threshold: Threshold applied to the spatial statistic; objects with values above it are kept.
+         min_size: Minimal number of pixels; smaller instances are filtered out.
+         table: Optional table with the per-object measurements in MoBIE format.
+             If None, it is computed from the segmentation.
+         output_key: Output key for the postprocessed segmentation.
+
+     Returns:
+         The number of objects before filtering.
+         The number of objects after filtering.
159+ """
160+ # Compute the table on the fly.
161+ # NOTE: this currently doesn't work for large segmentations.
162+ if table is None :
163+ table = _compute_table (segmentation )
164+ n_ids = len (table )
165+
166+ # First apply the size filter.
167+ table = table [table .n_pixels > min_size ]
168+ stat_values = spatial_statistics (table )
169+
170+ keep_mask = np .array (stat_values > threshold ).squeeze ()
171+ keep_ids = table .label_id .values [keep_mask ]
95172
96173 shape = segmentation .shape
97174 block_shape = (128 , 128 , 128 )
@@ -100,7 +177,6 @@ def filter_isolated_objects(
    blocking = nt.blocking([0] * len(shape), shape, block_shape)

    output = open_file(output_path, mode="a")
-
    output_dataset = output.create_dataset(
        output_key, shape=shape, dtype=segmentation.dtype,
        chunks=chunks, compression="gzip"
@@ -112,17 +188,16 @@ def filter_chunk(block_id):
        block = blocking.getBlock(block_id)
        volume_index = tuple(slice(beg, end) for beg, end in zip(block.begin, block.end))
        data = segmentation[volume_index]
-         data[np.isin(data, filter_ids)] = 0
+         data[~np.isin(data, keep_ids)] = 0  # Zero out everything that is not in the ids to keep.
        output_dataset[volume_index] = data

    # Limit the number of cores for parallelization.
    n_threads = min(16, mp.cpu_count())
-
    with futures.ThreadPoolExecutor(n_threads) as filter_pool:
        list(tqdm(filter_pool.map(filter_chunk, range(blocking.numberOfBlocks)), total=blocking.numberOfBlocks))

    seg_filtered, n_ids_filtered, _ = parallel.relabel_consecutive(
-         output_dataset, start_label=1, keep_zeros=True, block_shape=(128, 128, 128)
+         output_dataset, start_label=1, keep_zeros=True, block_shape=block_shape
    )

-     return seg_filtered, n_ids, n_ids_filtered
+     return n_ids, n_ids_filtered
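
# An illustrative end-to-end sketch for filter_segmentation; the toy volume, the output path and all
# parameter values below are assumptions for demonstration, not part of the module. Any of the three
# statistics above can be bound with functools.partial so that it only takes the table argument.
from functools import partial

import numpy as np

rng = np.random.default_rng(3)
toy_segmentation = rng.integers(0, 50, size=(256, 256, 256), dtype="uint32")  # stand-in label volume

n_ids, n_ids_filtered = filter_segmentation(
    toy_segmentation,
    output_path="postprocessed.n5",  # hypothetical output path
    spatial_statistics=partial(neighbors_in_radius, radius=15),
    threshold=5,  # keep objects with more than 5 neighbors within the radius
    min_size=1000,
    table=None,  # the table is then computed on the fly via _compute_table
)
print(f"Kept {n_ids_filtered} of {n_ids} objects.")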