
Commit 938d4ba

Training is working
1 parent c065849 commit 938d4ba

4 files changed: +331 −0 lines changed
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
data/
Lines changed: 166 additions & 0 deletions
@@ -0,0 +1,166 @@
import numpy as np
import pandas as pd
import torch
import zarr

from skimage.filters import gaussian
from torch_em.util import ensure_tensor_with_channels


# Process point labels stored as napari-style CSV files.
# I don't actually think that we need the epsilon here, but will leave it for now.
def process_labels(label_path, shape, sigma, eps):
    labels = np.zeros(shape, dtype="float32")
    points = pd.read_csv(label_path)
    assert len(points.columns) == len(shape)
    # Round the point coordinates and clip them to the valid image range.
    coords = tuple(
        np.clip(np.round(points[ax].values).astype("int"), 0, shape[i] - 1)
        for i, ax in enumerate(points.columns)
    )
    labels[coords] = 1
    # Smooth the point mask into a Gaussian heat map.
    labels = gaussian(labels, sigma)
    # TODO better normalization?
    labels /= labels.max()
    return labels

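For illustration (not part of the commit): a minimal sketch of how process_labels turns point annotations into a normalized Gaussian heat map, using a hypothetical toy CSV.

import pandas as pd

# One point in a small (8, 16, 16) volume, napari column convention.
points = pd.DataFrame({"axis-0": [4.0], "axis-1": [8.0], "axis-2": [8.0]})
points.to_csv("toy_points.csv", index=False)

heatmap = process_labels("toy_points.csv", shape=(8, 16, 16), sigma=1, eps=1e-8)
assert heatmap.max() == 1.0  # normalized so the peak is exactly 1
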
class DetectionDataset(torch.utils.data.Dataset):
    max_sampling_attempts = 500

    def __init__(
        self,
        raw_image_paths,
        label_paths,
        patch_shape,
        raw_transform=None,
        label_transform=None,
        transform=None,
        dtype=torch.float32,
        label_dtype=torch.float32,
        n_samples=None,
        sampler=None,
        eps=1e-8,
        sigma=None,
        **kwargs,
    ):
        self.raw_images = raw_image_paths
        # TODO make this a parameter
        self.raw_key = "raw"
        self.label_images = label_paths
        self._ndim = 3

        assert len(patch_shape) == self._ndim
        self.patch_shape = patch_shape

        self.raw_transform = raw_transform
        self.label_transform = label_transform
        self.transform = transform
        self.sampler = sampler

        self.dtype = dtype
        self.label_dtype = label_dtype

        self.eps = eps
        self.sigma = sigma

        if n_samples is None:
            self._len = len(self.raw_images)
            self.sample_random_index = False
        else:
            self._len = n_samples
            self.sample_random_index = True

    def __len__(self):
        return self._len

    @property
    def ndim(self):
        return self._ndim

    def _sample_bounding_box(self, shape):
        if any(sh < psh for sh, psh in zip(shape, self.patch_shape)):
            raise NotImplementedError(
                f"Image padding is not supported yet. Data shape {shape}, patch shape {self.patch_shape}"
            )
        bb_start = [
            np.random.randint(0, sh - psh) if sh - psh > 0 else 0
            for sh, psh in zip(shape, self.patch_shape)
        ]
        return tuple(slice(start, start + psh) for start, psh in zip(bb_start, self.patch_shape))

    def _get_sample(self, index):
        if self.sample_random_index:
            index = np.random.randint(0, len(self.raw_images))
        raw, label = self.raw_images[index], self.label_images[index]

        raw = zarr.open(raw)[self.raw_key]
        # Note: this is quite inefficient, because we process the full volume rather than
        # just the requested bounding box.
        label = process_labels(label, raw.shape, self.sigma, self.eps)

        have_raw_channels = raw.ndim == 4  # 3D with channels
        have_label_channels = label.ndim == 4
        if have_label_channels:
            raise NotImplementedError("Multi-channel labels are not supported.")

        shape = raw.shape
        prefix_box = tuple()
        if have_raw_channels:
            # Heuristic: a small last axis means channel-last layout, otherwise channel-first.
            if shape[-1] < 16:
                shape = shape[:-1]
            else:
                shape = shape[1:]
                prefix_box = (slice(None),)

        bb = self._sample_bounding_box(shape)
        raw_patch = np.array(raw[prefix_box + bb])
        label_patch = np.array(label[bb])

        # If a sampler is given, re-draw patches until it accepts one.
        if self.sampler is not None:
            sample_id = 0
            while not self.sampler(raw_patch, label_patch):
                bb = self._sample_bounding_box(shape)
                raw_patch = np.array(raw[prefix_box + bb])
                label_patch = np.array(label[bb])
                sample_id += 1
                if sample_id > self.max_sampling_attempts:
                    raise RuntimeError(f"Could not sample a valid batch in {self.max_sampling_attempts} attempts")

        if have_raw_channels and len(prefix_box) == 0:
            raw_patch = raw_patch.transpose((3, 0, 1, 2))  # Channels, Depth, Height, Width

        return raw_patch, label_patch

    def __getitem__(self, index):
        raw, labels = self._get_sample(index)

        if self.raw_transform is not None:
            raw = self.raw_transform(raw)

        if self.label_transform is not None:
            labels = self.label_transform(labels)

        if self.transform is not None:
            raw, labels = self.transform(raw, labels)

        raw = ensure_tensor_with_channels(raw, ndim=self._ndim, dtype=self.dtype)
        labels = ensure_tensor_with_channels(labels, ndim=self._ndim, dtype=self.label_dtype)
        return raw, labels

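For illustration (not part of the commit): a minimal sketch of feeding the dataset to a PyTorch DataLoader; the paths are hypothetical placeholders.

from torch.utils.data import DataLoader

dataset = DetectionDataset(
    raw_image_paths=["training_data/images/sample.zarr"],  # hypothetical path
    label_paths=["training_data/labels/sample.csv"],  # hypothetical path
    patch_shape=(32, 96, 96),
    sigma=1,
    n_samples=100,
)
loader = DataLoader(dataset, batch_size=4, shuffle=True)
raw_batch, label_batch = next(iter(loader))  # each of shape (4, 1, 32, 96, 96)
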
if __name__ == "__main__":
    import napari

    raw_path = "training_data/images/10.1L_mid_IHCribboncount_5_Z.zarr"
    label_path = "training_data/labels/10.1L_mid_IHCribboncount_5_Z.csv"

    f = zarr.open(raw_path, "r")
    raw = f["raw"][:]

    labels = process_labels(label_path, shape=raw.shape, sigma=1, eps=1e-7)

    v = napari.Viewer()
    v.add_image(raw)
    v.add_image(labels)
    napari.run()
Lines changed: 79 additions & 0 deletions
@@ -0,0 +1,79 @@
import os
from glob import glob
from pathlib import Path

import h5py
import napari
import numpy as np
import pandas as pd
import zarr


def get_voxel_size(imaris_file):
    with h5py.File(imaris_file, "r") as f:
        info = f["/DataSetInfo/Image"]
        # Imaris stores these attributes as arrays of single-byte strings,
        # so the characters have to be joined before they can be parsed.
        ext = [[float(b"".join(info.attrs[f"ExtMin{i}"]).decode()),
                float(b"".join(info.attrs[f"ExtMax{i}"]).decode())] for i in range(3)]
        size = [int(b"".join(info.attrs[dim]).decode()) for dim in ["X", "Y", "Z"]]
    # Voxel size = physical extent divided by the number of voxels per axis.
    vsize = np.array([(max_ - min_) / s for (min_, max_), s in zip(ext, size)])
    return vsize

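For illustration (not part of the commit): why the byte-joining above is needed. Imaris writes these HDF5 attributes as arrays of single-byte strings, so a value like "512" arrives as separate characters (hypothetical value shown).

import numpy as np

attr = np.array([b"5", b"1", b"2"], dtype="|S1")  # how Imaris stores "512"
assert int(b"".join(attr).decode()) == 512
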
def extract_training_data(imaris_file, output_folder):
    with h5py.File(imaris_file, "r") as f:
        data = f["/DataSet/ResolutionLevel 0/TimePoint 0/Channel 0/Data"][:]
        points = f["/Scene/Content/Points0/CoordsXYZR"][:]
    # Drop the radius column and flip the coordinates from XYZ to ZYX order.
    points = points[:, :-1]
    points = points[:, ::-1]

    # TODO crop the data to the original shape.
    # Can we just crop the zero-padding ?!
    crop_box = np.where(data != 0)
    crop_box = tuple(slice(0, int(cb.max() + 1)) for cb in crop_box)
    data = data[crop_box]
    print(data.shape)

    # Scale the points to match the image dimensions.
    voxel_size = get_voxel_size(imaris_file)
    points /= voxel_size[None]

    if output_folder is None:
        # Without an output folder, just visualize the data for inspection.
        v = napari.Viewer()
        v.add_image(data)
        v.add_points(points)
        v.title = os.path.basename(imaris_file)
        napari.run()
    else:
        image_folder = os.path.join(output_folder, "images")
        os.makedirs(image_folder, exist_ok=True)

        label_folder = os.path.join(output_folder, "labels")
        os.makedirs(label_folder, exist_ok=True)

        fname = Path(imaris_file).stem
        image_file = os.path.join(image_folder, f"{fname}.zarr")
        label_file = os.path.join(label_folder, f"{fname}.csv")

        # Save the points in the napari CSV convention (axis-0, axis-1, axis-2).
        coords = pd.DataFrame(points, columns=["axis-0", "axis-1", "axis-2"])
        coords.to_csv(label_file, index=False)

        f = zarr.open(image_file, "a")
        f.create_dataset("raw", data=data)

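For illustration (not part of the commit): reading one extracted image/label pair back, using one of the training files listed below.

import pandas as pd
import zarr

volume = zarr.open("./training_data/images/4.1L_apex_IHCribboncount_Z.zarr", "r")["raw"][:]
points = pd.read_csv("./training_data/labels/4.1L_apex_IHCribboncount_Z.csv")
print(volume.shape, len(points), "annotated points")
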
# Files that look good for training:
# - 4.1L_apex_IHCribboncount_Z.ims
# - 4.1L_base_IHCribbons_Z.ims
# - 4.1L_mid_IHCribboncount_Z.ims
# - 4.2R_apex_IHCribboncount_Z.ims
# - 4.2R_apex_IHCribboncount_Z.ims
# - 6.2R_apex_IHCribboncount_Z.ims (very small crop)
# - 6.2R_base_IHCribbons_Z.ims
def main():
    files = sorted(glob("./data/synapse_stains/*.ims"))
    for ff in files:
        extract_training_data(ff, output_folder="./training_data")


if __name__ == "__main__":
    main()
Lines changed: 85 additions & 0 deletions
@@ -0,0 +1,85 @@
import os
import sys

from detection_dataset import DetectionDataset

# Make the training utilities from the czii-protein-challenge project importable.
sys.path.append("/home/pape/Work/my_projects/czii-protein-challenge")

from utils.training import supervised_training  # noqa

TRAIN_ROOT = "./training_data/images"
LABEL_ROOT = "./training_data/labels"


def get_paths(split):
    file_names = [
        "4.1L_apex_IHCribboncount_Z",
        "4.1L_base_IHCribbons_Z",
        "4.1L_mid_IHCribboncount_Z",
        "4.2R_apex_IHCribboncount_Z",
        "4.2R_apex_IHCribboncount_Z",
        "6.2R_apex_IHCribboncount_Z",
        "6.2R_base_IHCribbons_Z",
    ]
    image_paths = [os.path.join(TRAIN_ROOT, f"{fname}.zarr") for fname in file_names]
    label_paths = [os.path.join(LABEL_ROOT, f"{fname}.csv") for fname in file_names]

    # Use the last file for validation and everything else for training.
    if split == "train":
        image_paths = image_paths[:-1]
        label_paths = label_paths[:-1]
    else:
        image_paths = image_paths[-1:]
        label_paths = label_paths[-1:]

    return image_paths, label_paths

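For illustration (not part of the commit): a quick sanity check (hypothetical helper) that loads one patch through DetectionDataset before launching a full training run.

def check_paths():
    image_paths, label_paths = get_paths("train")
    dataset = DetectionDataset(
        raw_image_paths=image_paths, label_paths=label_paths,
        patch_shape=(32, 96, 96), sigma=1,
    )
    raw, labels = dataset[0]
    print(raw.shape, labels.shape)  # expect (1, 32, 96, 96) tensors
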
# TODO maybe add a sampler for the label data
def train():
    model_name = "synapse_detection_v1"

    train_paths, train_label_paths = get_paths("train")
    val_paths, val_label_paths = get_paths("val")
    # We need to give the paths for the test loader, although it's never used.
    test_paths, test_label_paths = val_paths, val_label_paths

    print("Start training with:")
    print(len(train_paths), "volumes for training")
    print(len(val_paths), "volumes for validation")

    patch_shape = [32, 96, 96]
    batch_size = 8
    check = False

    supervised_training(
        name=model_name,
        train_paths=train_paths,
        train_label_paths=train_label_paths,
        val_paths=val_paths,
        val_label_paths=val_label_paths,
        patch_shape=patch_shape, batch_size=batch_size,
        check=check,
        lr=1e-4,
        n_iterations=int(2.5e4),
        out_channels=1,
        augmentations=None,
        eps=1e-5,
        sigma=1,
        lower_bound=None,
        upper_bound=None,
        test_paths=test_paths,
        test_label_paths=test_label_paths,
        # save_root="",
        dataset_class=DetectionDataset,
    )


def main():
    train()


if __name__ == "__main__":
    main()
