
Commit c41743c

Refactoring; Added weighted Steiner tree
1 parent 7623a96 commit c41743c

3 files changed: +244 -52 lines changed


flamingo_tools/segmentation/cochlea_mapping.py

Lines changed: 127 additions & 51 deletions
@@ -8,6 +8,7 @@
 from networkx.algorithms.approximation import steiner_tree
 
 from flamingo_tools.segmentation.postprocessing import graph_connected_components
+from flamingo_tools.segmentation.distance_weighted_steiner import distance_weighted_steiner_path
 
 
 def find_most_distant_nodes(G: nx.classes.graph.Graph, weight: str = 'weight') -> Tuple[float, float]:
@@ -25,6 +26,94 @@ def find_most_distant_nodes(G: nx.classes.graph.Graph, weight: str = 'weight') -> Tuple[float, float]:
     return u, v
 
 
+def voxel_subsample(G, factor=0.25, voxel_size=None, seed=1234):
+    coords = np.asarray([G.nodes[n]["pos"] for n in G.nodes])
+    nodes = np.asarray(list(G.nodes))
+
+    # choose a voxel edge length if the caller has not fixed one
+    if voxel_size is None:
+        bbox = np.ptp(coords, axis=0)  # edge lengths
+        voxel_size = (bbox.prod() / (len(G)/factor)) ** (1/3)
+
+    # integer voxel indices
+    mins = coords.min(axis=0)
+    vox = np.floor((coords - mins) / voxel_size).astype(np.int32)
+
+    # bucket nodes per voxel
+    from collections import defaultdict
+    buckets = defaultdict(list)
+    for idx, v in enumerate(map(tuple, vox)):
+        buckets[v].append(idx)
+
+    rng = np.random.default_rng(seed)
+    keep = []
+    for bucket in buckets.values():
+        k = max(1, int(round(len(bucket)*factor)))  # local quota
+        keep.extend(rng.choice(bucket, k, replace=False))
+
+    sampled_nodes = nodes[keep]
+    return G.subgraph(sampled_nodes).copy()
+
+
+def measure_run_length_sgns(graph, centroids, label_ids, filter_factor, weight="weight"):
+    if filter_factor is not None:
+        if 0 <= filter_factor < 1:
+            graph = voxel_subsample(graph, factor=filter_factor)
+            centroid_labels = list(graph.nodes)
+            centroids = [graph.nodes[n]["pos"] for n in graph.nodes]
+            k_nn_thick = int(40 * filter_factor)
+            # centroids = [centroids[label_ids.index(i)] for i in centroid_labels]
+
+        else:
+            raise ValueError(f"Invalid filter factor {filter_factor}. Choose a filter factor between 0 and 1.")
+    else:
+        k_nn_thick = 40
+        centroid_labels = label_ids
+
+    path_coords, path = distance_weighted_steiner_path(
+        centroids,                        # (N,3) ndarray
+        centroid_labels=centroid_labels,  # (N,) ndarray
+        k_nn_thick=k_nn_thick,            # 20–30 is robust for SGN clouds
+        lam=0.5,                          # 0.3–1.0: larger → stronger centripetal bias
+        r_connect=50.0                    # connect neighbours within 50 µm
+    )
+
+    for num, p in enumerate(path[:-1]):
+        pos_i = centroids[centroid_labels.index(p)]
+        pos_j = centroids[centroid_labels.index(path[num+1])]
+        dist = math.dist(pos_i, pos_j)
+        graph.add_edge(p, path[num+1], weight=dist)
+
+    total_distance = nx.path_weight(graph, path, weight=weight)
+
+    return total_distance, path, graph
+
+
+def measure_run_length_ihcs(graph, weight="weight"):
+    u, v = find_most_distant_nodes(graph)
+    # approximate Steiner tree and find shortest path between the two most distant nodes
+    terminals = set(graph.nodes())  # All nodes are required
+    # Approximate Steiner Tree over all nodes
+    T = steiner_tree(graph, terminals, weight=weight)
+    path = nx.shortest_path(T, source=u, target=v, weight=weight)
+    total_distance = nx.path_weight(T, path, weight=weight)
+    return total_distance, path
+
+
+def map_frequency(table):
+    # map frequency using Greenwood function f(x) = A * (10 ** (a * x) - k); for humans a = 2.1, k = 0.88, A = 165.4 Hz
+    var_k = 0.88
+    # calculate values to fit (assumed) minimal (1 kHz) and maximal (80 kHz) hearing range of mice at x=0, x=1
+    fmin = 1
+    fmax = 80
+    var_A = fmin / (1 - var_k)
+    var_exp = ((fmax + var_A * var_k) / var_A)
+    table.loc[table['distance_to_path[µm]'] >= 0, 'tonotopic_value[kHz]'] = var_A * (var_exp ** table["length_fraction"] - var_k)
+    table.loc[table['distance_to_path[µm]'] < 0, 'tonotopic_value[kHz]'] = 0
+
+    return table
+
+
 def tonotopic_mapping(
     table: pd.DataFrame,
     component_label: List[int] = [1],
@@ -47,16 +136,14 @@ def tonotopic_mapping(
     Returns:
         Table with tonotopic label for cells.
     """
-    weight = "weight"
     # subset of centroids for given component label(s)
     new_subset = table[table["component_labels"].isin(component_label)]
-    comp_label_ids = list(new_subset["label_id"])
-    centroids_subset = list(zip(new_subset["anchor_x"], new_subset["anchor_y"], new_subset["anchor_z"]))
-    labels_subset = [int(i) for i in list(new_subset["label_id"])]
+    centroids = list(zip(new_subset["anchor_x"], new_subset["anchor_y"], new_subset["anchor_z"]))
+    label_ids = [int(i) for i in list(new_subset["label_id"])]
 
     # create graph with connected components
     coords = {}
-    for index, element in zip(labels_subset, centroids_subset):
+    for index, element in zip(label_ids, centroids):
         coords[index] = element
 
     components, graph = graph_connected_components(coords, max_edge_distance, min_component_length)
@@ -66,45 +153,33 @@ def tonotopic_mapping(
 
     unfiltered_graph = graph.copy()
 
-    if filter_factor is not None:
-        if 0 <= filter_factor < 1:
-            rng = np.random.default_rng(seed=1234)
-            original_array = np.array(comp_label_ids)
-            target_length = int(len(original_array) * filter_factor)
-            filtered_list = list(rng.choice(original_array, size=target_length, replace=False))
-            for filter_id in filtered_list:
-                graph.remove_node(filter_id)
-        else:
-            raise ValueError(f"Invalid filter factor {filter_factor}. Choose a filter factor between 0 and 1.")
-
-    u, v = find_most_distant_nodes(graph)
-
-    if not nx.has_path(graph, source=u, target=v) or cell_type == "ihc":
-        # approximate Steiner tree and find shortest path between the two most distant nodes
-        terminals = set(graph.nodes())  # All nodes are required
-        # Approximate Steiner Tree over all nodes
-        T = steiner_tree(graph, terminals, weight=weight)
-        path = nx.shortest_path(T, source=u, target=v, weight=weight)
-        total_distance = nx.path_weight(T, path, weight=weight)
+    if cell_type == "ihc":
+        total_distance, path = measure_run_length_ihcs(graph)
 
     else:
-        path = nx.shortest_path(graph, source=u, target=v, weight=weight)
-        total_distance = nx.path_weight(graph, path, weight=weight)
+        total_distance, path, graph = measure_run_length_sgns(graph, centroids, label_ids,
+                                                              filter_factor, weight="weight")
+
+    # measure_betweenness
+    centrality = nx.betweenness_centrality(graph, k=100, normalized=True, weight='weight', seed=1234)
+    score = sum(centrality[n] for n in path) / len(path)
+    print(f"path distance: {total_distance}")
+    print(f"centrality score: {score}")
 
     # assign relative distance to nodes on path
-    path_list = {}
-    path_list[path[0]] = {"label_id": path[0], "tonotopic": 0}
+    path_dict = {}
+    path_dict[path[0]] = {"label_id": path[0], "length_fraction": 0}
     accumulated = 0
     for num, p in enumerate(path[1:-1]):
         distance = graph.get_edge_data(path[num], p)["weight"]
         accumulated += distance
         rel_dist = accumulated / total_distance
-        path_list[p] = {"label_id": p, "tonotopic": rel_dist}
-    path_list[path[-1]] = {"label_id": path[-1], "tonotopic": 1}
+        path_dict[p] = {"label_id": p, "length_fraction": rel_dist}
+    path_dict[path[-1]] = {"label_id": path[-1], "length_fraction": 1}
 
-    # add missing nodes from component
+    # add missing nodes from component and compute distance to path
     pos = nx.get_node_attributes(unfiltered_graph, 'pos')
-    for c in comp_label_ids:
+    for c in label_ids:
         if c not in path:
             min_dist = float('inf')
             nearest_node = None
@@ -115,27 +190,28 @@ def tonotopic_mapping(
                     min_dist = dist
                     nearest_node = p
 
-            path_list[c] = {"label_id": c, "tonotopic": path_list[nearest_node]["tonotopic"]}
+            path_dict[c] = {
+                "label_id": c,
+                "length_fraction": path_dict[nearest_node]["length_fraction"],
+                "distance_to_path": min_dist,
+            }
+        else:
+            path_dict[c]["distance_to_path"] = 0
 
-    # label in micrometer
-    tonotopic = [0 for _ in range(len(table))]
-    # be aware of 'label_id' of dataframe starting at 1
-    for key in list(path_list.keys()):
-        tonotopic[int(path_list[key]["label_id"] - 1)] = path_list[key]["tonotopic"] * total_distance
+    distance_to_path = [-1 for _ in range(len(table))]
+    # 'label_id' of dataframe starting at 1
+    for key in list(path_dict.keys()):
+        distance_to_path[int(path_dict[key]["label_id"] - 1)] = path_dict[key]["distance_to_path"]
 
-    table.loc[:, "tonotopic_label"] = tonotopic
+    table.loc[:, "distance_to_path[µm]"] = distance_to_path
 
-    # map frequency using Greenwood function f(x) = A * (10 **(ax) - K), for humans: a=2.1, k=0.88, A = 165.4 [kHz]
-    tonotopic_map = [0 for _ in range(len(table))]
-    var_k = 0.88
-    # calculate values to fit (assumed) minimal (1kHz) and maximal (80kHz) hearing range of mice at x=0, x=1
-    fmin = 1
-    fmax = 80
-    var_A = fmin / (1 - var_k)
-    var_exp = ((fmax + var_A * var_k) / var_A)
-    for key in list(path_list.keys()):
-        tonotopic_map[int(path_list[key]["label_id"] - 1)] = var_A * (var_exp ** path_list[key]["tonotopic"] - var_k)
+    length_fraction = [0 for _ in range(len(table))]
+    for key in list(path_dict.keys()):
+        length_fraction[int(path_dict[key]["label_id"] - 1)] = path_dict[key]["length_fraction"]
+
+    table.loc[:, "length_fraction"] = length_fraction
+    table.loc[:, "run_length[µm]"] = table["length_fraction"] * total_distance
 
-    table.loc[:, "tonotopic_value[kHz]"] = tonotopic_map
+    table = map_frequency(table)
 
     return table
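
Not part of the commit: a minimal sketch of how the refactored mapping might be driven from Python. Only table and component_label are visible in the signature above; cell_type and filter_factor mirror variables used inside tonotopic_mapping and are assumed to be keyword parameters, and the input path is hypothetical. The table needs the anchor_x/anchor_y/anchor_z, component_labels and label_id columns referenced in the diff.

import pandas as pd
from flamingo_tools.segmentation.cochlea_mapping import tonotopic_mapping

table = pd.read_csv("sgn_table.tsv", sep="\t")  # hypothetical input table
table = tonotopic_mapping(
    table,
    component_label=[1],
    cell_type="sgn",     # any value other than "ihc" takes the weighted-Steiner branch
    filter_factor=0.25,  # keep roughly a quarter of the nodes via voxel subsampling
)
# columns written by the mapping, per the diff:
print(table[["length_fraction", "run_length[µm]", "distance_to_path[µm]", "tonotopic_value[kHz]"]].head())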

flamingo_tools/segmentation/distance_weighted_steiner.py

Lines changed: 115 additions & 0 deletions
@@ -0,0 +1,115 @@
+"""
+distance_weighted_steiner.py
+Variant-B: centre-seeking Steiner path for cochlear run-length extraction
+"""
+
+from __future__ import annotations
+import numpy as np
+import networkx as nx
+from scipy.spatial import cKDTree
+from typing import Tuple, Sequence, Optional
+
+
+def estimate_local_thickness(points: np.ndarray,
+                             k_nn: int = 20) -> np.ndarray:
+    """
+    Return a per-point scalar proportional to local canal thickness.
+    We use the *k*-th NN distance as a cheap proxy.
+    """
+    tree = cKDTree(points)
+    # distances shape → (N, k_nn)
+    dists, _ = tree.query(points, k=k_nn + 1)  # +1 because k=0 is the point itself
+    kth = dists[:, -1]  # farthest of the k neighbours
+    return kth  # units: same as points
+
+
+def make_graph(points: np.ndarray,
+               radii: np.ndarray,
+               r_connect: float = 60.0,
+               lam: float = 0.5,
+               k_edge: Optional[int] = None) -> nx.Graph:
+    """
+    Build a graph with distance-transform-weighted edges.
+
+    Parameters
+    ----------
+    points : (N,3) float array
+    radii : (N,) local thickness proxy
+    r_connect: connect all neighbours within this radius (µm)
+    lam : weight of |d_i - d_j| term
+    k_edge : alternative to r_connect - connect the k_edge
+             nearest neighbours; leave None to use radius
+    """
+    N = len(points)
+    tree = cKDTree(points)
+
+    G = nx.Graph()
+    # add nodes with attributes
+    for idx, (xyz, r) in enumerate(zip(points, radii)):
+        G.add_node(idx, pos=tuple(xyz), radius=float(r))
+
+    # choose connectivity strategy
+    if k_edge is not None:
+        for idx in range(N):
+            _, inds = tree.query(points[idx], k=k_edge + 1)
+            for j in inds[1:]:
+                _add_edge(G, idx, j, radii, lam)
+    else:
+        # radius search in batches (memory safe)
+        pairs = tree.query_pairs(r_connect)
+        for i, j in pairs:
+            _add_edge(G, i, j, radii, lam)
+
+    return G
+
+
+def _add_edge(G: nx.Graph, i: int, j: int,
+              radii: np.ndarray, lam: float):
+    """Helper to compute weighted edge once and add both directions."""
+    pi, pj = G.nodes[i]["pos"], G.nodes[j]["pos"]
+    dij = np.linalg.norm(np.subtract(pi, pj))
+    dr = abs(radii[i] - radii[j]) / (radii[i] + radii[j] + 1e-9)
+    w = dij * (1.0 + lam * dr)
+    G.add_edge(i, j, weight=w)
+
+
+def find_endpoints(points: np.ndarray) -> Tuple[int, int]:
+    """
+    Pick apical+basal terminals as the points with minimum/maximum
+    projection on the first PCA axis (fast & robust).
+    """
+    # simple PCA via SVD
+    pts = points - points.mean(0, keepdims=True)
+    u, s, vh = np.linalg.svd(pts, full_matrices=False)
+    axis = vh[0]
+    proj = pts @ axis
+    return int(proj.argmin()), int(proj.argmax())
+
+
+def distance_weighted_steiner_path(centroids: Sequence[Sequence[float]],
+                                   *,
+                                   centroid_labels: Optional[Sequence[int]] = None,
+                                   k_nn_thick: int = 20,
+                                   lam: float = 0.5,
+                                   r_connect: float = 60.0,
+                                   k_edge: Optional[int] = None) -> Tuple[np.ndarray, list[int]]:
+    """
+    Main public entry - returns (Mx3 point array, list of node indices)
+    representing the centre-biased cochlear path.
+    """
+    pts = np.asarray(centroids, dtype=float)
+    radii = estimate_local_thickness(pts, k_nn=k_nn_thick)
+
+    G = make_graph(pts, radii, r_connect=r_connect, lam=lam, k_edge=k_edge)
+
+    s, t = find_endpoints(pts)
+    steiner = nx.algorithms.approximation.steinertree.steiner_tree(G, {s, t}, weight="weight")
+    # unique s–t path inside the tree (no branches because only 2 terminals):
+    path_nodes = nx.shortest_path(steiner, source=s, target=t, weight="weight")
+    path_xyz = np.array([G.nodes[i]["pos"] for i in path_nodes])
+
+    # transfer path nodes into centroid_labels
+    if centroid_labels is not None:
+        path_nodes = [centroid_labels[i] for i in path_nodes]
+
+    return path_xyz, path_nodes
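
A minimal usage sketch for the new module (not part of the commit): synthetic centroids along a noisy arc stand in for an SGN point cloud, and the parameter values simply restate the defaults documented above.

import numpy as np
from flamingo_tools.segmentation.distance_weighted_steiner import distance_weighted_steiner_path

rng = np.random.default_rng(0)
t = np.linspace(0, np.pi, 400)
# noisy half-circle of radius ~200 µm as a stand-in point cloud
pts = np.stack([200 * np.cos(t), 200 * np.sin(t), np.zeros_like(t)], axis=1)
pts += rng.normal(scale=5.0, size=pts.shape)

path_xyz, path_nodes = distance_weighted_steiner_path(
    pts,
    k_nn_thick=20,   # k-th NN distance is the thickness proxy
    lam=0.5,         # strength of the centre-seeking bias
    r_connect=60.0,  # µm; neighbours within this radius get an edge
)
print(path_xyz.shape, len(path_nodes))  # (M, 3) coordinates and the node indices of the path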

scripts/prediction/tonotopic_mapping.py

Lines changed: 2 additions & 1 deletion
@@ -16,7 +16,8 @@ def main():
     parser.add_argument("-o", "--output", required=True, help="Output path for post-processed table.")
 
     parser.add_argument("-t", "--type", type=str, default="ihc", help="Cell type of segmentation.")
-    parser.add_argument("--filter", type=float, default=None, help="Fraction of nodes to remove before mapping.")
+    parser.add_argument("--filter", type=float, default=None,
+                        help="Fraction of nodes to keep before mapping. Default: 1.")
     parser.add_argument("--edge_distance", type=float, default=30, help="Maximal edge distance between nodes.")
    parser.add_argument("--component_length", type=int, default=50, help="Minimal number of nodes in component.")
 
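
For context on the reworded --filter help text, a small sketch (not part of the commit) of the keep semantics: voxel_subsample in cochlea_mapping.py draws a local quota of round(len(bucket) * factor) nodes, but at least one, from every occupied voxel, so the kept fraction approaches factor only when voxels are well populated. The graph and voxel size below are made up for illustration.

import numpy as np
import networkx as nx
from flamingo_tools.segmentation.cochlea_mapping import voxel_subsample

rng = np.random.default_rng(0)
G = nx.Graph()
for i, p in enumerate(rng.uniform(0, 100, size=(2000, 3))):
    G.add_node(i, pos=tuple(p))  # voxel_subsample reads the "pos" attribute

G_small = voxel_subsample(G, factor=0.25, voxel_size=25.0)  # roughly 30 nodes per voxel
print(len(G_small) / len(G))  # close to 0.25 when voxels are well populated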
