import os
from glob import glob
from pathlib import Path

import imageio.v3 as imageio
import numpy as np
import pandas as pd

from skimage.measure import regionprops
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

ROOT_AMD = "/mnt/vast-nhr/projects/nim00007/data/moser/cochlea-lightsheet/SGN_subtypes/Result_AMD"
ROOT_EK = "/mnt/vast-nhr/projects/nim00007/data/moser/cochlea-lightsheet/SGN_subtypes/Result_EK"

def load_annotations(pattern):
    paths = sorted(glob(pattern))
    # The pattern ends in "*", so len(pattern) is the length of the literal prefix plus one.
    # Slicing with it strips the prefix and the separator matched by the wildcard,
    # leaving strings that start with the channel name.
    annotations = [path[len(pattern):] for path in paths]
    channels = [annotation.split("_")[0] for annotation in annotations]
    return paths, channels

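# Hypothetical usage example (the file name below is an assumption for illustration,
# not taken from the data): for a file "positive-negative_crop1_ChannelA_ann.tif"
# under ROOT_AMD, load_annotations(os.path.join(ROOT_AMD, "positive-negative_crop1*"))
# would return ([".../positive-negative_crop1_ChannelA_ann.tif"], ["ChannelA"]).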

# Get the features and per-channel labels from this crop.
def extract_crop_data(crop_table, crop_root):
    table = pd.read_csv(crop_table, sep="\t")
    prefix = Path(crop_table).stem

    # Get the paths to all annotations (from both annotators).
    paths_amd, channels_amd = load_annotations(os.path.join(ROOT_AMD, f"positive-negative_{prefix}*"))
    paths_ek, channels_ek = load_annotations(os.path.join(ROOT_EK, f"positive-negative_{prefix}*"))
    # Sort the channel names so that the feature column order is deterministic.
    channel_names = sorted(set(channels_amd + channels_ek))
    assert len(channel_names) == 2, f"Expected two channels, got: {channel_names}"

    # Load the segmentation.
    seg_path = os.path.join(crop_root, f"{prefix}_PV_SGN_v2.tif")
    seg = imageio.imread(seg_path)

    # Load the features (= intensity and PV intensity ratios) for both channels.
    features = table[
        [f"marker_{channel_names[0]}", f"{channel_names[0]}_ratio_PV"] +
        [f"marker_{channel_names[1]}", f"{channel_names[1]}_ratio_PV"]
    ].values

    # Load the labels, derived from the annotations of both annotators.
    labels = {channel: None for channel in channel_names}
    for channel, path in zip(channels_amd + channels_ek, paths_amd + paths_ek):
        data = imageio.imread(path)
        props = regionprops(seg, data)
        labeling = np.array([prop.max_intensity for prop in props], dtype="int32")
        if labels[channel] is None:
            labels[channel] = labeling
        else:
            # Combine labels so that we only keep the labels that agree, set others to zero
            # (in order to filter them out later).
            prev_labeling = labels[channel]
            disagreement = prev_labeling != labeling
            labeling[disagreement] = 0
            labels[channel] = labeling

    return features, labels


def process_cochlea(cochlea):
    # The root folders for tables and crop data for this cochlea.
    table_root = f"/mnt/vast-nhr/projects/nim00007/data/moser/cochlea-lightsheet/SGN_subtypes/tables_{cochlea}"
    crop_root = f"/mnt/vast-nhr/projects/nim00007/data/moser/cochlea-lightsheet/SGN_subtypes/{cochlea}"

    # Get the tables for all crops in this cochlea.
    tables = sorted(glob(os.path.join(table_root, "*.tsv")))

    # Iterate over the crops, load the features and the labels per channel.
    features = []
    labels = {}
    for table in tables:
        crop_features, crop_labels = extract_crop_data(table, crop_root)
        features.append(crop_features)
        # Concatenate the labels per channel.
        for channel, labeling in crop_labels.items():
            if channel in labels:
                labels[channel] = np.concatenate([labels[channel], labeling], axis=0)
            else:
                labels[channel] = labeling
    features = np.concatenate(features, axis=0)

    # Train and evaluate a logistic regression per channel.
    start, stop = 0, 2
    for channel, labeling in labels.items():
        # Exclude labels with value zero.
        label_mask = labeling != 0
        # Get the features for this channel.
        this_features = features[:, start:stop][label_mask]
        this_labels = labeling[label_mask]

        # Create a train and test split.
        train_features, test_features, train_labels, test_labels = train_test_split(
            this_features, this_labels, test_size=0.3
        )

        # Train and evaluate the classifier.
        classifier = LogisticRegression(penalty="l2")
        classifier.fit(train_features, train_labels)

        prediction = classifier.predict(test_features)
        accuracy = accuracy_score(test_labels, prediction)
        print("Channel:", channel)
        print("Accuracy:", accuracy)

        start += 2
        stop += 2

    # Note: we could do some other things here:
    # - Train a single classifier for subtype prediction (= 4 classes) using all channels.
    # - Use a different classifier (e.g. RandomForest); however, the accuracy from logistic regression looks fine.
    # - To better understand the results we could also look at the confusion matrix (see the sketch below).
    # - A better evaluation would be to train and test on separate blocks.
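    # A minimal sketch of the confusion matrix idea (an illustration, not part of
    # the committed evaluation): inside the per-channel loop one could add
    #
    #   from sklearn.metrics import confusion_matrix
    #   print(confusion_matrix(test_labels, prediction))
    #
    # which shows how positive and negative annotations get confused per channel.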

    # The classifier can be saved and loaded with pickle, to apply it to all SGNs in the cochlea later.
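    # A minimal sketch of that, using the standard-library pickle module
    # (the output file name is a hypothetical choice):
    #
    #   import pickle
    #   with open(f"classifier_{channel}.pkl", "wb") as f:
    #       pickle.dump(classifier, f)
    #   with open(f"classifier_{channel}.pkl", "rb") as f:
    #       classifier = pickle.load(f)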


def main():
    # Process a cochlea by:
    # - Extracting the features (intensities and intensity ratios) and labels for each crop.
    # - Training a classifier based on the labels and evaluating it.
    process_cochlea("MLR99L")


if __name__ == "__main__":
    main()