
Commit df9b23d

Updates to validation, synapse detection training and low res export impl
1 parent aefeba1

File tree: 10 files changed, +447 −48 lines


scripts/export_lower_resolution.py

Lines changed: 63 additions & 0 deletions
@@ -0,0 +1,63 @@
+import argparse
+import os
+
+import numpy as np
+import pandas as pd
+import tifffile
+import zarr
+
+from flamingo_tools.s3_utils import get_s3_path, BUCKET_NAME, SERVICE_ENDPOINT
+from skimage.segmentation import relabel_sequential
+
+
+def filter_component(fs, segmentation, cochlea, seg_name):
+    # First, we download the MoBIE table for this segmentation.
+    internal_path = os.path.join(BUCKET_NAME, cochlea, "tables", seg_name, "default.tsv")
+    with fs.open(internal_path, "r") as f:
+        table = pd.read_csv(f, sep="\t")
+
+    # Then we get the ids for the components and use them to filter the segmentation.
+    component_mask = np.isin(table.component_labels.values, [1])
+    keep_label_ids = table.label_id.values[component_mask].astype("int64")
+    filter_mask = ~np.isin(segmentation, keep_label_ids)
+    segmentation[filter_mask] = 0
+
+    segmentation, _, _ = relabel_sequential(segmentation)
+    return segmentation
+
+
+def export_lower_resolution(args):
+    output_folder = os.path.join(args.output_folder, args.cochlea, f"scale{args.scale}")
+    os.makedirs(output_folder, exist_ok=True)
+
+    input_key = f"s{args.scale}"
+    for channel in args.channels:
+        out_path = os.path.join(output_folder, f"{channel}.tif")
+        if os.path.exists(out_path):
+            continue
+
+        print("Exporting channel", channel)
+        internal_path = os.path.join(args.cochlea, "images", "ome-zarr", f"{channel}.ome.zarr")
+        s3_store, fs = get_s3_path(internal_path, bucket_name=BUCKET_NAME, service_endpoint=SERVICE_ENDPOINT)
+        with zarr.open(s3_store, mode="r") as f:
+            data = f[input_key][:]
+        print(data.shape)
+        if args.filter_by_component:
+            data = filter_component(fs, data, args.cochlea, channel)
+        tifffile.imwrite(out_path, data, bigtiff=True, compression="zlib")
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--cochlea", "-c", required=True)
+    parser.add_argument("--scale", "-s", type=int, required=True)
+    parser.add_argument("--output_folder", "-o", required=True)
+    parser.add_argument("--channels", nargs="+", default=["PV", "VGlut3", "CTBP2"])
+    parser.add_argument("--filter_by_component", action="store_true")
+    args = parser.parse_args()
+
+    export_lower_resolution(args)
+
+
+if __name__ == "__main__":
+    main()
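Reviewer note: to make the filtering step in `filter_component` easier to follow, here is a minimal, self-contained sketch of the same mask-and-relabel pattern. The toy segmentation and the `keep_label_ids` values are made up for illustration, standing in for the ids read from the MoBIE table:

import numpy as np
from skimage.segmentation import relabel_sequential

# Toy 2D segmentation with background 0 and objects 1, 2, 3.
segmentation = np.array([
    [0, 1, 1],
    [2, 2, 3],
    [3, 3, 0],
])

# Stand-in for the label ids selected from the component table.
keep_label_ids = np.array([1, 3], dtype="int64")

# Zero out everything not in the keep list, then map the surviving
# ids {1, 3} onto the dense range {1, 2}, as filter_component does.
segmentation[~np.isin(segmentation, keep_label_ids)] = 0
segmentation, _, _ = relabel_sequential(segmentation)
print(segmentation)
# -> [[0 1 1]
#     [0 0 2]
#     [2 2 0]]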

scripts/synapse_marker_detection/detection_dataset.py

Lines changed: 68 additions & 29 deletions
Original fileDiff line change
@@ -7,22 +7,42 @@
 from torch_em.util import ensure_tensor_with_channels
 
 
-# Process labels stored in json napari style.
-# I don't actually think that we need the epsilon here, but will leave it for now.
-def process_labels(label_path, shape, sigma, eps, bb=None):
-    points = pd.read_csv(label_path)
+class MinPointSampler:
+    """A sampler to reject samples with a low number of annotated points in the labels.
+
+    Args:
+        min_points: The minimal number of annotated points for accepting a sample.
+        p_reject: The probability for rejecting a sample that does not
+            meet the criterion.
+    """
+    def __init__(self, min_points: int, p_reject: float = 1.0):
+        self.min_points = min_points
+        self.p_reject = p_reject
+
+    def __call__(self, x: np.ndarray, n_points: int) -> bool:
+        """Check the sample.
+
+        Args:
+            x: The raw data.
+            n_points: The number of annotated points in the sample.
+
+        Returns:
+            Whether to accept this sample.
+        """
+        if n_points > self.min_points:
+            return True
+        else:
+            return np.random.rand() > self.p_reject
 
-    if bb:
-        (z_min, z_max), (y_min, y_max), (x_min, x_max) = [(s.start, s.stop) for s in bb]
-        restricted_shape = (z_max - z_min, y_max - y_min, x_max - x_min)
-        labels = np.zeros(restricted_shape, dtype="float32")
-        shape = restricted_shape
-    else:
-        labels = np.zeros(shape, dtype="float32")
 
+def load_labels(label_path, shape, bb):
+    points = pd.read_csv(label_path)
     assert len(points.columns) == len(shape)
-    z_coords, y_coords, x_coords = points["axis-0"], points["axis-1"], points["axis-2"]
+    z_coords, y_coords, x_coords = points["axis-0"].values, points["axis-1"].values, points["axis-2"].values
+
     if bb is not None:
+        (z_min, z_max), (y_min, y_max), (x_min, x_max) = [(s.start, s.stop) for s in bb]
         z_coords -= z_min
         y_coords -= y_min
         x_coords -= x_min
@@ -32,13 +52,31 @@ def process_labels(label_path, shape, sigma, eps, bb=None):
             np.logical_and(x_coords >= 0, x_coords < (x_max - x_min)),
         ])
         z_coords, y_coords, x_coords = z_coords[mask], y_coords[mask], x_coords[mask]
+        restricted_shape = (z_max - z_min, y_max - y_min, x_max - x_min)
+        shape = restricted_shape
 
+    n_points = len(z_coords)
     coords = tuple(
         np.clip(np.round(coord).astype("int"), 0, coord_max - 1) for coord, coord_max in zip(
             (z_coords, y_coords, x_coords), shape
         )
     )
 
+    return coords, n_points
+
+
+# Process labels stored in json napari style.
+# I don't actually think that we need the epsilon here, but will leave it for now.
+def process_labels(coords, shape, sigma, eps, bb=None):
+
+    if bb:
+        (z_min, z_max), (y_min, y_max), (x_min, x_max) = [(s.start, s.stop) for s in bb]
+        restricted_shape = (z_max - z_min, y_max - y_min, x_max - x_min)
+        labels = np.zeros(restricted_shape, dtype="float32")
+        shape = restricted_shape
+    else:
+        labels = np.zeros(shape, dtype="float32")
+
     labels[coords] = 1
     labels = gaussian(labels, sigma)
     # TODO better normalization?
@@ -124,16 +162,10 @@ def _get_sample(self, index):
         raw, label_path = self.raw_path, self.label_path
 
         raw = zarr.open(raw)[self.raw_key]
+        have_raw_channels = raw.ndim == 4  # 3D with channels
         shape = raw.shape
 
         bb = self._sample_bounding_box(shape)
-        label = process_labels(label_path, shape, self.sigma, self.eps, bb=bb)
-
-        have_raw_channels = raw.ndim == 4  # 3D with channels
-        have_label_channels = label.ndim == 4
-        if have_label_channels:
-            raise NotImplementedError("Multi-channel labels are not supported.")
-
         prefix_box = tuple()
         if have_raw_channels:
             if shape[-1] < 16:
@@ -143,18 +175,25 @@
             prefix_box = (slice(None), )
 
         raw_patch = np.array(raw[prefix_box + bb])
-        label_patch = np.array(label)
 
+        coords, n_points = load_labels(label_path, shape, bb)
         if self.sampler is not None:
-            assert False, "Sampler not implemented"
-            # sample_id = 0
-            # while not self.sampler(raw_patch, label_patch):
-            #     bb = self._sample_bounding_box(shape)
-            #     raw_patch = np.array(raw[prefix_box + bb])
-            #     label_patch = np.array(label[bb])
-            #     sample_id += 1
-            #     if sample_id > self.max_sampling_attempts:
-            #         raise RuntimeError(f"Could not sample a valid batch in {self.max_sampling_attempts} attempts")
+            sample_id = 0
+            while not self.sampler(raw_patch, n_points):
+                bb = self._sample_bounding_box(shape)
+                raw_patch = np.array(raw[prefix_box + bb])
+                coords, n_points = load_labels(label_path, shape, bb)
+                sample_id += 1
+                if sample_id > self.max_sampling_attempts:
+                    raise RuntimeError(f"Could not sample a valid batch in {self.max_sampling_attempts} attempts")
+
+        label = process_labels(coords, shape, self.sigma, self.eps, bb=bb)
+
+        have_label_channels = label.ndim == 4
+        if have_label_channels:
+            raise NotImplementedError("Multi-channel labels are not supported.")
+
+        label_patch = np.array(label)
 
         if have_raw_channels and len(prefix_box) == 0:
            raw_patch = raw_patch.transpose((3, 0, 1, 2))  # Channels, Depth, Height, Width
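Reviewer note: the sampler protocol used by the rewritten `_get_sample` is simple enough to try in isolation. A minimal sketch (a condensed copy of the `MinPointSampler` above; the patch shape and point counts are invented for illustration):

import numpy as np

class MinPointSampler:
    # Condensed copy of the sampler above: accept a patch outright if it
    # contains more than min_points annotated points, otherwise reject it
    # with probability p_reject.
    def __init__(self, min_points, p_reject=1.0):
        self.min_points = min_points
        self.p_reject = p_reject

    def __call__(self, x, n_points):
        return n_points > self.min_points or np.random.rand() > self.p_reject

sampler = MinPointSampler(min_points=1, p_reject=0.6)
raw_patch = np.zeros((40, 112, 112), dtype="float32")  # dummy raw data
print(sampler(raw_patch, n_points=5))  # always True: enough points
print(sampler(raw_patch, n_points=0))  # True only ~40% of the time

Note that `raw_patch` is passed but unused by this sampler; it is kept to match the `(x, n_points)` call signature that the dataset's rejection loop expects.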

scripts/synapse_marker_detection/extract_training_data.py

Lines changed: 55 additions & 8 deletions
Original fileDiff line change
@@ -19,23 +19,30 @@ def get_voxel_size(imaris_file):
     return vsize
 
 
-def extract_training_data(imaris_file, output_folder):
+def extract_training_data(imaris_file, output_folder, crop=True, scale=True):
+    point_key = "/Scene/Content/Points0/CoordsXYZR"
     with h5py.File(imaris_file, "r") as f:
+        if point_key not in f:
+            print("Skipping", imaris_file, "due to missing annotations")
+            return
         data = f["/DataSet/ResolutionLevel 0/TimePoint 0/Channel 0/Data"][:]
-        points = f["/Scene/Content/Points0/CoordsXYZR"][:]
+        points = f[point_key][:]
     points = points[:, :-1]
     points = points[:, ::-1]
 
     # TODO crop the data to the original shape.
     # Can we just crop the zero-padding ?!
-    crop_box = np.where(data != 0)
-    crop_box = tuple(slice(0, int(cb.max() + 1)) for cb in crop_box)
-    data = data[crop_box]
-    print(data.shape)
+    if crop:
+        crop_box = np.where(data != 0)
+        crop_box = tuple(slice(0, int(cb.max() + 1)) for cb in crop_box)
+        data = data[crop_box]
 
     # Scale the points to match the image dimensions.
     voxel_size = get_voxel_size(imaris_file)
-    points /= voxel_size[None]
+    if scale:
+        points /= voxel_size[None]
+
+    print(data.shape, voxel_size)
 
     if output_folder is None:
         v = napari.Viewer()
@@ -69,11 +76,51 @@ def extract_training_data(imaris_file, output_folder):
 # - 4.2R_apex_IHCribboncount_Z.ims
 # - 6.2R_apex_IHCribboncount_Z.ims (very small crop)
 # - 6.2R_base_IHCribbons_Z.ims
-def main():
+def process_training_data_v1():
     files = sorted(glob("./data/synapse_stains/*.ims"))
     for ff in files:
         extract_training_data(ff, output_folder="./training_data")
 
 
+def process_training_data_v2(visualize=True):
+    input_root = "/mnt/vast-nhr/projects/nim00007/data/moser/cochlea-lightsheet/ImageCropsIHC_synapses"
+
+    train_output = "/mnt/vast-nhr/projects/nim00007/data/moser/cochlea-lightsheet/training_data/synapses/training_data/v2"  # noqa
+    test_output = "/mnt/vast-nhr/projects/nim00007/data/moser/cochlea-lightsheet/training_data/synapses/test/v2"  # noqa
+
+    train_folders = ["M78L_IHC-synapse_crops"]
+    test_folders = ["M226L_IHC-synapse_crops", "M226R_IHC-synapsecrops"]
+
+    valid_files = [
+        "m78l_apexp2718_cr-ctbp2.ims",
+        "m226r_apex_p1268_pv-ctbp2.ims",
+        "m226r_base_p800_vglut3-ctbp2.ims",
+    ]
+
+    for folder in train_folders + test_folders:
+
+        if visualize:
+            output_folder = None
+        elif folder in train_folders:
+            output_folder = train_output
+            os.makedirs(output_folder, exist_ok=True)
+        else:
+            output_folder = test_output
+            os.makedirs(output_folder, exist_ok=True)
+
+        imaris_files = sorted(glob(os.path.join(input_root, folder, "*.ims")))
+        for imaris_file in imaris_files:
+            fname = os.path.basename(imaris_file)
+            if fname not in valid_files:
+                continue
+            print(fname)
+            extract_training_data(imaris_file, output_folder, crop=True, scale=True)
+
+
+def main():
+    # process_training_data_v1()
+    process_training_data_v2(visualize=False)
+
+
 if __name__ == "__main__":
     main()
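Reviewer note: the coordinate handling in `extract_training_data` packs three conventions into two slicing operations. Imaris stores each point as XYZR in physical units; the script drops the radius column, flips the axis order to ZYX, and divides by the voxel size to get voxel indices. A small sketch with made-up numbers:

import numpy as np

points = np.array([[10.0, 20.0, 5.0, 0.5]])  # one Imaris point: (x, y, z, r)
points = points[:, :-1]                      # drop the radius -> (x, y, z)
points = points[:, ::-1]                     # flip axis order -> (z, y, x)

voxel_size = np.array([2.0, 1.0, 1.0])       # hypothetical (z, y, x) spacing
points /= voxel_size[None]                   # physical units -> voxel indices
print(points)                                # [[ 2.5 20.  10. ]]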

scripts/synapse_marker_detection/train_synapse_detection.py

Lines changed: 10 additions & 5 deletions
Original fileDiff line change
@@ -1,14 +1,14 @@
 import os
 import sys
 
-from detection_dataset import DetectionDataset
+from detection_dataset import DetectionDataset, MinPointSampler
 
 sys.path.append("/home/pape/Work/my_projects/czii-protein-challenge")
 sys.path.append("/user/pape41/u12086/Work/my_projects/czii-protein-challenge")
 
 from utils.training.training import supervised_training  # noqa
 
-ROOT = "/mnt/vast-nhr/projects/nim00007/data/moser/cochlea-lightsheet/training_data/synapses/training_data/v1"  # noqa
+ROOT = "/mnt/vast-nhr/projects/nim00007/data/moser/cochlea-lightsheet/training_data/synapses/training_data/v2"  # noqa
 TRAIN_ROOT = os.path.join(ROOT, "images")
 LABEL_ROOT = os.path.join(ROOT, "labels")
 
@@ -21,6 +21,7 @@ def get_paths(split):
         "4.2R_apex_IHCribboncount_Z",
         "4.2R_apex_IHCribboncount_Z",
         "6.2R_apex_IHCribboncount_Z",
+        "m78l_apexp2718_cr-ctbp2",
         "6.2R_base_IHCribbons_Z",
     ]
     image_paths = [os.path.join(TRAIN_ROOT, f"{fname}.zarr") for fname in file_names]
@@ -33,13 +34,16 @@ def get_paths(split):
         image_paths = image_paths[-1:]
         label_paths = label_paths[-1:]
 
+    for path in image_paths:
+        assert os.path.exists(path), path
+
     return image_paths, label_paths
 
 
 # TODO maybe add a sampler for the label data
 def train():
 
-    model_name = "synapse_detection_v1"
+    model_name = "synapse_detection_v2"
 
     train_paths, train_label_paths = get_paths("train")
     val_paths, val_label_paths = get_paths("val")
@@ -52,7 +56,7 @@ def train():
 
     patch_shape = [40, 112, 112]
     batch_size = 32
-    check = False
+    check = True
 
     supervised_training(
         name=model_name,
@@ -64,7 +68,7 @@
         patch_shape=patch_shape, batch_size=batch_size,
         check=check,
         lr=1e-4,
-        n_iterations=int(5e4),
+        n_iterations=int(1e5),
         out_channels=1,
         augmentations=None,
         eps=1e-5,
@@ -77,6 +81,7 @@
         dataset_class=DetectionDataset,
         n_samples_train=3200,
         n_samples_val=160,
+        sampler=MinPointSampler(min_points=1, p_reject=0.6),
     )
 
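Reviewer note: a back-of-the-envelope check on the `MinPointSampler(min_points=1, p_reject=0.6)` setting used here. Patches with at most one point still pass 40% of the time, so the rejection loop shifts, but does not eliminate, the share of (near-)empty patches. Assuming, hypothetically, that half of all randomly drawn patches are empty:

# Hypothetical fraction of randomly drawn patches with <= 1 point.
f_empty = 0.5

# Empty patches survive rejection with probability 1 - p_reject = 0.4;
# patches with enough points are always accepted.
accepted_empty = f_empty * 0.4
accepted_full = (1 - f_empty) * 1.0

# Share of (near-)empty patches among accepted samples: ~0.29 instead of 0.5.
print(accepted_empty / (accepted_empty + accepted_full))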
scripts/validation/IHCs/run_evaluation.py

Lines changed: 2 additions & 2 deletions
Original fileDiff line change
@@ -3,7 +3,7 @@
 
 import pandas as pd
 from flamingo_tools.validation import (
-    fetch_data_for_evaluation, parse_annotation_path, compute_scores_for_annotated_slice
+    fetch_data_for_evaluation, _parse_annotation_path, compute_scores_for_annotated_slice
 )
 
 ROOT = "/mnt/vast-nhr/projects/nim00007/data/moser/cochlea-lightsheet/AnnotatedImageCrops/F1ValidationIHCs"
@@ -29,7 +29,7 @@ def run_evaluation(root, annotation_folders, result_file, cache_folder):
         annotations = sorted(glob(os.path.join(root, folder, "*.csv")))
         for annotation_path in annotations:
             print(annotation_path)
-            cochlea, slice_id = parse_annotation_path(annotation_path)
+            cochlea, slice_id = _parse_annotation_path(annotation_path)
 
             # For the cochlea M_LR_000226_R the actual component is 2, not 1
             component = 2 if "226_R" in cochlea else 1
