Skip to content

Commit 5a397cd

Browse files
Update data conversion CLI and the segmentation workflow script
1 parent 36f6694 commit 5a397cd

File tree

2 files changed

+94
-63
lines changed

2 files changed

+94
-63
lines changed

flamingo_tools/data_conversion.py

Lines changed: 16 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -245,7 +245,7 @@ def convert_lightsheet_to_bdv(
245245
file_ext: str = ".tif",
246246
attribute_parser: callable = flamingo_filename_parser,
247247
attribute_names: Optional[Dict[str, Dict[int, str]]] = None,
248-
metadata_file_name_pattern: Optional[str] = None,
248+
metadata_file_name_pattern: Optional[str] = "*_Settings.txt",
249249
metadata_root: Optional[str] = None,
250250
metadata_type: str = "flamingo",
251251
center_tiles: bool = False,
@@ -383,36 +383,35 @@ def convert_lightsheet_to_bdv(
383383
_write_missing_views(out_path)
384384

385385

386-
# TODO expose more arguments via CLI.
387386
def convert_lightsheet_to_bdv_cli():
388387
import argparse
389388

390389
parser = argparse.ArgumentParser(
391-
description="Convert lightsheet data to format compatible with BigDataViewer / BigStitcher. "
392-
"Example usage: To convert the synthetic data created via create_synthetic_data.py run: \n"
393-
"python convert_flamingo_data.py -i synthetic_data -c channel0 channel1 -f *.tif -o synthetic.n5"
390+
description="Convert lightsheet data from a flamingo microscope to a format compatible with BigDataViewer / BigStitcher. " # noqa
391+
"For most flamingo data it should be sufficient to run the script like this: \n"
392+
"python convert_flamingo_data.py -i /path/to/flamingo_data -o /path/to/output.n5 \n"
393+
"Here, -i specifies the path to the input folder and -o specifies the path to the output data. \n"
394+
"In order to process flamingo data stored in raw format you also need to pass the argument '-f .raw'." # noqa
394395
)
395396
parser.add_argument(
396-
"--input_root", "-i", required=True,
397-
help="Folder that contains the folders with tifs for each channel."
397+
"--input_root", "-i", required=True, help="Folder that contains the data from the flamingo microscope."
398398
)
399399
parser.add_argument(
400-
"--channel_folders", "-c", nargs="+", required=True,
401-
help="Name of folders with the data for each channel."
400+
"--out_path", "-o", required=True, help="Output path where the converted data will be saved."
402401
)
403402
parser.add_argument(
404-
"--image_file_name_pattern", "-f", required=True,
405-
help="The pattern for the names of the tifs that contain the data. "
406-
"This expects a glob pattern (name with '*') to select the corresponding tif files."
407-
"The simplest pattern that should work in most cases is '*.tif'."
403+
"--file_ext", "-f", default=".tif",
404+
help="The file extension of the image data. By default '.tif' is used, pass '.raw' if your data is stored as raw files." # noqa
408405
)
409406
parser.add_argument(
410-
"--out_path", "-o", required=True,
411-
help="Output path where the converted data is saved."
407+
"--metadata_pattern", default="*_Settings.txt",
408+
help="The file pattern for finding metadata information. The default value works for flamingo data."
412409
)
413410

414411
args = parser.parse_args()
415-
channel_folders = {name: name for name in args.channel_folders}
416412
convert_lightsheet_to_bdv(
417-
args.input_root, channel_folders, args.image_file_name_pattern, args.out_path,
413+
root=args.input_root,
414+
out_path=args.out_path,
415+
file_ext=args.file_ext,
416+
metadata_file_name_pattern=args.metadata_pattern
418417
)

scripts/prediction/segmentation_workflow.py

Lines changed: 78 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import argparse
22
import os
3+
from pathlib import Path
34
from shutil import rmtree
45

56
import pybdv.metadata as bdv_metadata
@@ -9,8 +10,6 @@
910
from flamingo_tools.segmentation import run_unet_prediction, filter_isolated_objects
1011
from flamingo_tools.mobie import add_raw_to_mobie, add_segmentation_to_mobie
1112

12-
MOBIE_ROOT = "/mnt/lustre-emmy-hdd/projects/nim00007/data/moser/lightsheet/mobie"
13-
1413

1514
def postprocess_seg(output_folder):
1615
print("Run segmentation postprocessing ...")
@@ -30,33 +29,23 @@ def postprocess_seg(output_folder):
3029
)
3130

3231

33-
def export_to_mobie(xml_path, segmentation_folder, scale, mobie_dataset, chunks):
34-
# Add to mobie:
35-
36-
# - raw data (if not yet present)
37-
add_raw_to_mobie(
38-
mobie_project=MOBIE_ROOT,
39-
mobie_dataset=mobie_dataset,
40-
source_name="pv-channel",
41-
xml_path=xml_path,
42-
setup_id=0,
43-
)
44-
45-
# TODO enable passing extra channel names
46-
# - additional channels
32+
def export_to_mobie(xml_path, segmentation_folder, output_folder, scale, mobie_dataset, chunks, channel_names):
33+
# Add to mobie: All the channels.
4734
setup_ids = bdv_metadata.get_setup_ids(xml_path)
48-
if len(setup_ids) > 1:
49-
extra_channel_names = ["gfp_channel", "myo_channel"]
50-
for i, setup_id in enumerate(setup_ids[1:]):
51-
add_raw_to_mobie(
52-
mobie_project=MOBIE_ROOT,
53-
mobie_dataset=mobie_dataset,
54-
source_name=extra_channel_names[i],
55-
xml_path=xml_path,
56-
setup_id=setup_id
57-
)
58-
59-
# - segmentation and post-processed segmentation
35+
if channel_names is None:
36+
channel_names = [f"channel-{i}" for i in range(len(setup_ids))]
37+
else:
38+
assert len(channel_names) == len(setup_ids)
39+
for i, setup_id in enumerate(setup_ids):
40+
add_raw_to_mobie(
41+
mobie_project=output_folder,
42+
mobie_dataset=mobie_dataset,
43+
source_name=channel_names[i],
44+
xml_path=xml_path,
45+
setup_id=setup_id
46+
)
47+
48+
# The segmentation and post-processed segmentation results.
6049
seg_path = os.path.join(segmentation_folder, "segmentation.zarr")
6150
seg_resolution = bdv_metadata.get_resolution(xml_path, setup_id=0)
6251
if scale == 1:
@@ -66,7 +55,7 @@ def export_to_mobie(xml_path, segmentation_folder, scale, mobie_dataset, chunks)
6655
seg_key = "segmentation"
6756
seg_name = "nuclei_fullscale" if scale == 0 else "nuclei_downscaled"
6857
add_segmentation_to_mobie(
69-
mobie_project=MOBIE_ROOT,
58+
mobie_project=output_folder,
7059
mobie_dataset=mobie_dataset,
7160
source_name=seg_name,
7261
segmentation_path=seg_path,
@@ -80,7 +69,7 @@ def export_to_mobie(xml_path, segmentation_folder, scale, mobie_dataset, chunks)
8069
seg_key = "segmentation_postprocessed"
8170
seg_name += "_postprocessed"
8271
add_segmentation_to_mobie(
83-
mobie_project=MOBIE_ROOT,
72+
mobie_project=output_folder,
8473
mobie_dataset=mobie_dataset,
8574
source_name=seg_name,
8675
segmentation_path=seg_path,
@@ -93,15 +82,42 @@ def export_to_mobie(xml_path, segmentation_folder, scale, mobie_dataset, chunks)
9382

9483

9584
def main():
96-
parser = argparse.ArgumentParser()
97-
parser.add_argument("-i", "--input", required=True)
98-
parser.add_argument("-o", "--output_folder", required=True)
99-
parser.add_argument("-s", "--scale", required=True, type=int)
100-
parser.add_argument("-m", "--mobie_dataset", required=True)
85+
# Argument parser so that this script can be used from the command line.
86+
parser = argparse.ArgumentParser(
87+
description="Run segmentation and export the segmentation result for a lightsheet volume."
88+
)
89+
parser.add_argument(
90+
"-i", "--input", required=True,
91+
help="Path to the input volume. This should be the path to the xml file obtained after stitching."
92+
)
93+
parser.add_argument(
94+
"-o", "--output", required=True,
95+
help="Path to the output folder. This is where the MoBIE project, with image data and segmentation result, will be stored." # noqa
96+
)
97+
parser.add_argument(
98+
"-s", "--segmentation_folder", required=True,
99+
help="Path to a folder where intermediate results for the segmentation will be stored. "
100+
"The results will be removed after the export to MoBIE."
101+
)
102+
parser.add_argument(
103+
"--mobie_dataset",
104+
help="Internal name of the dataset in MoBIE. If not given this will be derived from the name of the input volume.", # noqa
105+
)
106+
parser.add_argument(
107+
"--setup_id", default=0, type=int,
108+
help="The setup id to use for the segmentation. Choose the setup-id for the channel that contains the data to be used for segmentation." # noqa
109+
" This should be the PV channel for SGN segmentation."
110+
)
111+
parser.add_argument(
112+
"--scale", default=0, type=int,
113+
help="The scale to use for segmentation. By default this will run at the lowest scale (= full resolution)."
114+
)
101115
parser.add_argument("--model")
102-
116+
parser.add_argument("--channel_names", nargs="+", default=None, help="The names of channels in the dataset, in the same order as the setup-ids.") # noqa
103117
args = parser.parse_args()
104118

119+
# This is just some preparation logic to get a good size for filtering
120+
# the nuclei depending on which scale we use for running the segmentation.
105121
scale = args.scale
106122
if scale == 0:
107123
min_size = 1000
@@ -110,18 +126,25 @@ def main():
110126
else:
111127
raise ValueError
112128

129+
# Here we read the path to the data from the xml file and we construct the
130+
# input key (= internal file path in the n5 file with the data),
131+
# that points to the correct setup-id and scale.
113132
xml_path = args.input
114133
assert os.path.splitext(xml_path)[1] == ".xml"
115134
input_path = bdv_metadata.get_data_path(xml_path, return_absolute_path=True)
135+
input_key = f"setup{args.setup_id}/timepoint0/s{scale}"
116136

117-
# TODO need to make sure that PV is always setup 0
118-
input_key = f"setup0/timepoint0/s{scale}"
119-
137+
# This is just some preparation to choose the correct block sizes for running prediction
138+
# depending on having a GPU or not available.
139+
# (You will need a GPU to run this for any larger volume, CPU support is just for testing purposes.)
120140
have_cuda = torch.cuda.is_available()
121141
chunks = z5py.File(input_path, "r")[input_key].chunks
122142
block_shape = tuple([2 * ch for ch in chunks]) if have_cuda else tuple(chunks)
123143
halo = (16, 64, 64) if have_cuda else (8, 32, 32)
124144

145+
# Here we find the path to the model for segmentation.
146+
# If the path is given it should point to the ".pt" file.
147+
# Otherwise, we try to load the model from where the checkpoint was stored on my system.
125148
if args.model is not None:
126149
model = args.model
127150
else:
@@ -130,22 +153,31 @@ def main():
130153
else:
131154
model = "../training/checkpoints/cochlea_distance_unet-train-downsampled"
132155

156+
# These functions run the actual segmentation and the segmentation postprocessing.
133157
run_unet_prediction(
134-
input_path, input_key, args.output_folder, model,
158+
input_path, input_key, args.segmentation_folder, model,
135159
scale=None, min_size=min_size,
136160
block_shape=block_shape, halo=halo,
137161
)
162+
postprocess_seg(args.segmentation_folder)
138163

139-
postprocess_seg(args.output_folder)
140-
141-
export_to_mobie(xml_path, args.output_folder, scale, args.mobie_dataset, chunks)
164+
# This function exports the segmentation and the corresponding channel to MoBIE.
165+
if args.mobie_dataset is None:
166+
mobie_dataset = Path(xml_path).stem
167+
else:
168+
mobie_dataset = args.mobie_dataset
169+
export_to_mobie(
170+
xml_path, args.segmentation_folder, args.output_folder, scale, mobie_dataset, chunks,
171+
channel_names=args.channel_names
172+
)
142173

143-
# clean up: remove segmentation folders
174+
# Finally, we clean up the intermediate segmentation results, that are not needed anymore
175+
# because everything was exported to MoBIE.
144176
print("Cleaning up intermediate segmentation results")
145177
print("This may take a while, but everything else is done.")
146178
print("You can check the results in the MoBIE project already at:")
147-
print(f"{MOBIE_ROOT}:{args.mobie_dataset}")
148-
rmtree(args.output_folder)
179+
print(f"{args.output_folder}:{mobie_dataset}")
180+
rmtree(args.segmentation_folder)
149181

150182

151183
if __name__ == "__main__":

0 commit comments

Comments
 (0)