Skip to content

Commit 9aa89ee

Browse files
Merge branch 'master' into ome-challenge
2 parents 0cc2d1e + ecd278c commit 9aa89ee

File tree

8 files changed

+346
-22
lines changed

8 files changed

+346
-22
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,3 +2,5 @@ synthetic_data/
22
__pycache__/
33
converted/
44
*.egg-info/
5+
checkpoints/
6+
logs/

flamingo_tools/data_conversion.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -171,10 +171,16 @@ def flamingo_filename_parser(file_path, name_mapping):
171171
illumination_mapping = name_mapping.get("illumination", {})
172172
attributes["illumination"] = {"id": illumination, "name": illumination_mapping.get(illumination, str(illumination))}
173173

174+
# Extract D. TODO what is this?
175+
match = re.search(r'_D(\d+)_', filename)
176+
D = int(match.group(1)) if match else 0
177+
D_mapping = name_mapping.get("D", {})
178+
attributes["D"] = {"id": D, "name": D_mapping.get(D, str(D))}
179+
174180
# BDV also supports an angle attribute, but it does not seem to be stored in the filename
175181
# "angle": {"id": 0, "name": "0"}
176182

177-
attribute_id = f"c{channel}-t{tile}-i{illumination}"
183+
attribute_id = f"c{channel}-t{tile}-i{illumination}-d{D}"
178184
return timepoint, attributes, attribute_id
179185

180186

@@ -282,13 +288,13 @@ def convert_lightsheet_to_bdv(
282288
else: # We have metadata and read it.
283289
resolution, unit, tile_transformation = read_metadata_flamingo(metadata_file, offset)
284290

291+
print(f"Converting tp={timepoint}, channel={attributes['channel']}, tile={attributes['tile']}")
285292
try:
286293
data = tifffile.memmap(file_path, mode="r")
287294
except ValueError:
288295
print(f"Could not memmap the data from {file_path}. Fall back to load it into memory.")
289296
data = tifffile.imread(file_path)
290297

291-
print(f"Converting tp={timepoint}, channel={attributes['channel']}, tile={attributes['tile']}")
292298
if scale_factors is None:
293299
scale_factors = derive_scale_factors(data.shape)
294300

flamingo_tools/mobie.py

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
import os
2+
import tempfile
3+
from typing import Tuple
4+
5+
from mobie import add_bdv_image, add_segmentation
6+
from mobie.metadata.dataset_metadata import read_dataset_metadata
7+
8+
9+
# TODO refactor to mobie utils
10+
def _source_exists(mobie_project, mobie_dataset, source_name):
11+
dataset_folder = os.path.join(mobie_project, mobie_dataset)
12+
metadata = read_dataset_metadata(dataset_folder)
13+
sources = metadata.get("sources", {})
14+
return source_name in sources
15+
16+
17+
def add_raw_to_mobie(
18+
mobie_project: str,
19+
mobie_dataset: str,
20+
source_name: str,
21+
xml_path: str,
22+
skip_existing: bool = True,
23+
setup_id: int = 0,
24+
):
25+
"""
26+
"""
27+
# Check if we have converted this data already.
28+
have_source = _source_exists(mobie_project, mobie_dataset, source_name)
29+
if have_source and skip_existing:
30+
print(f"Source {source_name} already exists in {mobie_project}:{mobie_dataset}.")
31+
print("Conversion to mobie will be skipped.")
32+
return
33+
elif have_source:
34+
raise NotImplementedError
35+
36+
with tempfile.TemporaryDirectory() as tmpdir:
37+
add_bdv_image(
38+
xml_path=xml_path,
39+
root=mobie_project,
40+
dataset_name=mobie_dataset,
41+
image_name=source_name,
42+
tmp_folder=tmpdir,
43+
file_format="bdv.n5",
44+
setup_ids=[setup_id],
45+
)
46+
47+
48+
def add_segmentation_to_mobie(
49+
mobie_project: str,
50+
mobie_dataset: str,
51+
source_name: str,
52+
segmentation_path: str,
53+
segmentation_key: str,
54+
resolution: Tuple[int, int, int],
55+
unit: str,
56+
scale_factors: Tuple[Tuple[int, int, int]],
57+
chunks: Tuple[int, int, int],
58+
skip_existing: bool = True,
59+
):
60+
# Check if we have converted this data already.
61+
have_source = _source_exists(mobie_project, mobie_dataset, source_name)
62+
if have_source and skip_existing:
63+
print(f"Source {source_name} already exists in {mobie_project}:{mobie_dataset}.")
64+
print("Conversion to mobie will be skipped.")
65+
return
66+
elif have_source:
67+
raise NotImplementedError
68+
69+
with tempfile.TemporaryDirectory() as tmpdir:
70+
add_segmentation(
71+
input_path=segmentation_path, input_key=segmentation_key,
72+
root=mobie_project, dataset_name=mobie_dataset,
73+
segmentation_name=source_name,
74+
resolution=resolution, scale_factors=scale_factors,
75+
chunks=chunks, file_format="bdv.n5",
76+
tmp_folder=tmpdir
77+
)

scripts/README.md

Lines changed: 27 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,41 @@
11
# Segmentation for large lightsheet volumes
22

3+
4+
## Installation
5+
6+
Needs [torch-em](https://github.com/constantinpape/torch-em) in the python environment. See [here](https://github.com/constantinpape/torch-em?tab=readme-ov-file#installation) for installation instructions. (If possible use `mamba` instead of `conda`.)
7+
After setting up the environment you also have to add support for the MoBIE python library via
8+
```
9+
conda install -c conda-forge mobie_utils
10+
```
11+
12+
313
## Training
414

515
Contains the scripts for training a U-Net that predicts foreground probabilities and normalized object distances.
616

17+
718
## Prediction
819

9-
Contains the scripts for running segmentation for a large volume with a distance prediction U-Net. (Other scripts are work in progress.)
20+
Contains the scripts for running segmentation for a large volume with a distance prediction U-Net, postprocessing the segmentation
21+
and exporting the segmentation result to MoBIE.
1022

11-
You can run it like this for input that is stored in n5:
23+
To run the full segmentation workflow, including the export to MoBIE you can use the `segmentation_workflow.py` script as follows:
24+
```
25+
python segmentation_workflow.py -i /path/to/volume.xml -o /path/to/output_folder --scale 0 -m data_name --model /path/to/model.pt
26+
```
27+
28+
Here, `-i` must point to the xml file of the fused data exported from BigStitcher, `-o` indicates the output folder where the MoBIE project with the segmentation result will be saved, `--scale` indicates the scale to use for the segmentation, `-m` the name of the data in MoBIE and `--model` the path to the segmentation model.
29+
30+
### Individual Workflow Steps
31+
32+
You can also run individual steps of the workflow, like prediction and segmentation:
33+
34+
You can run it like this for an input volume that is stored in n5, e.g. the fused export from bigstitcher:
1235
```
1336
python run_prediction_distance_unet.py -i /path/to/volume.n5 -k setup0/timepoint0/s0 -m /path/to/model -o /path/to/output_folder
1437
```
15-
Here, `-i` specifies the input filepath, `-o` the folder where the results are saved and `-k` the internal path for a zarr or n5 file.
38+
Here, `-i` specifies the input filepath, `-o` the folder where the results are saved and `-k` the internal path in the n5 file.
1639
The `-m` argument specifies the model to use for prediction. You need to give the path to the folder that contains the checkpoint (the `best.pt` file).
1740

1841
You can also run the script for a tif file. In this case you don't need the `-k` parameter:
@@ -31,8 +54,4 @@ to downsample the input by a factor of 2. Note that the segmentation result will
3154

3255
In addition, the script `postprocess_seg.py` can be used to filter out false positive nucleus segmentations from regions in the segmentation with a low density of segmented nuclei.
3356

34-
You can use the script `to_tif.py` to convert the zarr object to a tif volume for easier viewing (won't work for very large volumes!).
35-
36-
## Installation
37-
38-
Needs [torch-em](https://github.com/constantinpape/torch-em) in the python environment. See [here](https://github.com/constantinpape/torch-em?tab=readme-ov-file#installation) for installation instructions. (If possible use `mamba` instead of `conda`.)
57+
You can use the script `to_tif.py` to convert the zarr object to a tif volume for easier viewing (won't work for large volumes!).

scripts/data_transfer/README.md

Lines changed: 5 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -5,37 +5,30 @@
55
Current approach to the data transfer:
66
- Log in to SCC login node:
77
$
8-
- Go to `/scratch1/projects/cca/data/moser`
8+
- Go to "/scratch1/projects/cca/data/moser"
99
- Create subfolder <NAME> for cochlea to be copied
10-
- Log in via
11-
```
12-
$ smbclient \\\\wfs-medizin.top.gwdg.de\\ukon-all\$\\ukon100 -U GWDG\\pape41"
13-
```
10+
- Log in via $ smbclient \\\\wfs-medizin.top.gwdg.de\\ukon-all\$\\ukon100 -U GWDG\\pape41"
1411
- Go to the folder with the cochlea to copy (cd works)
1512
- Copy the folder via:
1613
- recurse ON
1714
- prompt OFF
1815
- mget *
1916
- Copy this to HLRN by logging into it and running
20-
```
17+
$ rsync pape41:/scratch1/projects/cca/data/moser/<NAME>
2118
$ rsync -e "ssh -i ~/.ssh/id_rsa_hlrn" -avz [email protected]:/scratch1/projects/cca/data/mose
22-
r/<NAME> /mnt/lustre-emmy-hdd/projects/nim00007/data/moser/lightsheet/volumes/<NAME>
23-
```
19+
r/<NAME> /mnt/lustre-grete/usr/u12086/moser/lightsheet/<NAME>
2420
- Remove on SCC
2521

2622
## Next files
2723

2824
- UKON100\archiv\imaging\Lightsheet\Huiskengroup_CTLSM\2024\M171_2R_converted_n5
29-
- unclear what the converted data is
30-
- UKON100\archiv\imaging\Lightsheet\Huiskengroup_CTLSM\2024\155_1L_converted_n5\BDVexport.n5
31-
- Copied to SCC, need to rsync.
25+
- UKON100\archiv\imaging\Lightsheet\Huiskengroup_CTLSM\2024\155_1L_converted_n5
3226
- UKON100\archiv\imaging\Lightsheet\Huiskengroup_CTLSM\2024\MLR151_2R_converted_n5
3327
- UKON100\archiv\imaging\Lightsheet\Huiskengroup_CTLSM\2024\G11_1L_converted_n5
3428

3529
## Improvements
3630

3731
Try to automate via https://github.com/jborean93/smbprotocol; see `sync_smb.py` for ChatGPT's initial version.
38-
Connection not possible from HLRN.
3932

4033
## Transfer Back
4134

scripts/prediction/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
credentials*
Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
import argparse
2+
import os
3+
from shutil import rmtree
4+
5+
import pybdv.metadata as bdv_metadata
6+
import torch
7+
import z5py
8+
9+
from flamingo_tools.segmentation import run_unet_prediction, filter_isolated_objects
10+
from flamingo_tools.mobie import add_raw_to_mobie, add_segmentation_to_mobie
11+
12+
MOBIE_ROOT = "/mnt/lustre-emmy-hdd/projects/nim00007/data/moser/lightsheet/mobie"
13+
14+
15+
def postprocess_seg(output_folder):
16+
print("Run segmentation postprocessing ...")
17+
seg_path = os.path.join(output_folder, "segmentation.zarr")
18+
seg_key = "segmentation"
19+
20+
with z5py.File(seg_path, "r") as f:
21+
segmentation = f[seg_key][:]
22+
23+
seg_filtered, n_pre, n_post = filter_isolated_objects(segmentation)
24+
25+
with z5py.File(seg_path, "a") as f:
26+
chunks = f[seg_key].chunks
27+
f.create_dataset(
28+
"segmentation_postprocessed", data=seg_filtered, compression="gzip",
29+
chunks=chunks, dtype=seg_filtered.dtype
30+
)
31+
32+
33+
def export_to_mobie(xml_path, segmentation_folder, scale, mobie_dataset, chunks):
34+
# Add to mobie:
35+
36+
# - raw data (if not yet present)
37+
add_raw_to_mobie(
38+
mobie_project=MOBIE_ROOT,
39+
mobie_dataset=mobie_dataset,
40+
source_name="pv-channel",
41+
xml_path=xml_path,
42+
setup_id=0,
43+
)
44+
45+
# TODO enable passing extra channel names
46+
# - additional channels
47+
setup_ids = bdv_metadata.get_setup_ids(xml_path)
48+
if len(setup_ids) > 1:
49+
extra_channel_names = ["gfp_channel", "myo_channel"]
50+
for i, setup_id in enumerate(setup_ids[1:]):
51+
add_raw_to_mobie(
52+
mobie_project=MOBIE_ROOT,
53+
mobie_dataset=mobie_dataset,
54+
source_name=extra_channel_names[i],
55+
xml_path=xml_path,
56+
setup_id=setup_id
57+
)
58+
59+
# - segmentation and post-processed segmentation
60+
seg_path = os.path.join(segmentation_folder, "segmentation.zarr")
61+
seg_resolution = bdv_metadata.get_resolution(xml_path, setup_id=0)
62+
if scale == 1:
63+
seg_resolution = [2 * res for res in seg_resolution]
64+
unit = bdv_metadata.get_unit(xml_path, setup_id=0)
65+
66+
seg_key = "segmentation"
67+
seg_name = "nuclei_fullscale" if scale == 0 else "nuclei_downscaled"
68+
add_segmentation_to_mobie(
69+
mobie_project=MOBIE_ROOT,
70+
mobie_dataset=mobie_dataset,
71+
source_name=seg_name,
72+
segmentation_path=seg_path,
73+
segmentation_key=seg_key,
74+
resolution=seg_resolution,
75+
unit=unit,
76+
scale_factors=4*[[2, 2, 2]],
77+
chunks=chunks,
78+
)
79+
80+
seg_key = "segmentation_postprocessed"
81+
seg_name += "_postprocessed"
82+
add_segmentation_to_mobie(
83+
mobie_project=MOBIE_ROOT,
84+
mobie_dataset=mobie_dataset,
85+
source_name=seg_name,
86+
segmentation_path=seg_path,
87+
segmentation_key=seg_key,
88+
resolution=seg_resolution,
89+
unit=unit,
90+
scale_factors=4*[[2, 2, 2]],
91+
chunks=chunks,
92+
)
93+
94+
95+
def main():
96+
parser = argparse.ArgumentParser()
97+
parser.add_argument("-i", "--input", required=True)
98+
parser.add_argument("-o", "--output_folder", required=True)
99+
parser.add_argument("-s", "--scale", required=True, type=int)
100+
parser.add_argument("-m", "--mobie_dataset", required=True)
101+
parser.add_argument("--model")
102+
103+
args = parser.parse_args()
104+
105+
scale = args.scale
106+
if scale == 0:
107+
min_size = 1000
108+
elif scale == 1:
109+
min_size = 250
110+
else:
111+
raise ValueError
112+
113+
xml_path = args.input
114+
assert os.path.splitext(xml_path)[1] == ".xml"
115+
input_path = bdv_metadata.get_data_path(xml_path, return_absolute_path=True)
116+
117+
# TODO need to make sure that PV is always setup 0
118+
input_key = f"setup0/timepoint0/s{scale}"
119+
120+
have_cuda = torch.cuda.is_available()
121+
chunks = z5py.File(input_path, "r")[input_key].chunks
122+
block_shape = tuple([2 * ch for ch in chunks]) if have_cuda else tuple(chunks)
123+
halo = (16, 64, 64) if have_cuda else (8, 32, 32)
124+
125+
if args.model is not None:
126+
model = args.model
127+
else:
128+
if scale == 0:
129+
model = "../training/checkpoints/cochlea_distance_unet"
130+
else:
131+
model = "../training/checkpoints/cochlea_distance_unet-train-downsampled"
132+
133+
run_unet_prediction(
134+
input_path, input_key, args.output_folder, model,
135+
scale=None, min_size=min_size,
136+
block_shape=block_shape, halo=halo,
137+
)
138+
139+
postprocess_seg(args.output_folder)
140+
141+
export_to_mobie(xml_path, args.output_folder, scale, args.mobie_dataset, chunks)
142+
143+
# clean up: remove segmentation folders
144+
print("Cleaning up intermediate segmentation results")
145+
print("This may take a while, but everything else is done.")
146+
print("You can check the results in the MoBIE project already at:")
147+
print(f"{MOBIE_ROOT}:{args.mobie_dataset}")
148+
rmtree(args.output_folder)
149+
150+
151+
if __name__ == "__main__":
152+
main()

0 commit comments

Comments
 (0)