computational-cell-analytics · constantinpape · Oct 3, 2024 · Sep 4, 2024 · Sep 4, 2024 · Sep 8, 2024
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,4 @@
 synthetic_data/
 __pycache__/
 converted/
+*.egg-info/
diff --git a/flamingo_tools/data_conversion.py b/flamingo_tools/data_conversion.py
@@ -1,5 +1,6 @@
 import multiprocessing as mp
 import os
+import re
 
 from glob import glob
 from pathlib import Path
@@ -54,19 +55,15 @@ def _read_start_position_flamingo(path):
     return start_position
 
 
-def read_metadata_flamingo(metadata_paths, center_tiles):
-    start_positions = []
+def read_metadata_flamingo(metadata_path, offset=None):
     resolution, unit = None, None
-    for path in metadata_paths:
-        resolution, unit = _read_resolution_and_unit_flamingo(path)
-        start_position = _read_start_position_flamingo(path)
-        start_positions.append(start_position)
 
-    start_positions = np.array(start_positions)
-    offset = np.min(start_positions, axis=0) if center_tiles else np.array([0.0, 0.0, 0.0])
+    resolution, unit = _read_resolution_and_unit_flamingo(metadata_path)
+    start_position = _read_start_position_flamingo(metadata_path)
 
     def _pos_to_trafo(pos):
-        pos -= offset
+        if offset is not None:
+            pos -= offset
 
         # FIXME: dirty hack
         # scale = 4
@@ -93,11 +90,9 @@ def _pos_to_trafo(pos):
         }
         return trafo
 
-    transformations = [
-        _pos_to_trafo(pos) for pos in start_positions
-    ]
+    transformation = _pos_to_trafo(start_position)
     # We have to reverse the resolution because pybdv expects ZYX.
-    return resolution[::-1], unit, transformations
+    return resolution[::-1], unit, transformation
 
 
 # TODO derive the scale factors from the shape rather than hard-coding it to 5 levels
@@ -106,15 +101,61 @@ def derive_scale_factors(shape):
     return scale_factors
 
 
+def flamingo_filename_parser(file_path, name_mapping):
+    """Parse the timepoint and the view attributes from the name of a flamingo tif file.
+
+    The values are read from name parts of the form '_<KEY><NUMBER>', where KEY is
+    't' (timepoint), 'C' (channel), 'R' (tile), 'I' (illumination) or 'D'.
+    Such a part must be followed by '_', by '.' or by the end of the name, so that
+    also a value directly in front of the file extension (e.g. '..._I0.tif') is found.
+    A value that is not part of the file name is set to 0.
+
+    Args:
+        file_path: The path to the image file. Only its basename is parsed.
+        name_mapping: Optional dictionary that maps the attribute names
+            ('channel', 'tile', 'illumination', 'D') to dictionaries which map ids to names.
+            Ids without an entry are named by their string representation.
+
+    Returns:
+        The timepoint.
+        The attribute dictionary, with an entry {"id": ..., "name": ...} per attribute.
+        The attribute id, a string that identifies the combination of attribute values.
+    """
+    filename = os.path.basename(file_path)
+    if name_mapping is None:
+        name_mapping = {}
+
+    def _parse_number(key):
+        # The lookahead (instead of a trailing '_') also accepts the last part of the file name.
+        match = re.search(rf"_{key}(\d+)(?=[_.]|$)", filename)
+        return int(match.group(1)) if match else 0
+
+    timepoint = _parse_number("t")
+
+    # BDV also supports an angle attribute, but it does not seem to be stored in the filename
+    # "angle": {"id": 0, "name": "0"}
+    # TODO find out what the attribute 'D' stands for.
+    attributes = {}
+    for attribute_name, key in (("channel", "C"), ("tile", "R"), ("illumination", "I"), ("D", "D")):
+        value = _parse_number(key)
+        names = name_mapping.get(attribute_name, {})
+        attributes[attribute_name] = {"id": value, "name": names.get(value, str(value))}
+
+    # Files with the same attribute values get the same id.
+    ids = {name: attribute["id"] for name, attribute in attributes.items()}
+    attribute_id = f"c{ids['channel']}-t{ids['tile']}-i{ids['illumination']}-d{ids['D']}"
+    return timepoint, attributes, attribute_id
+
+
 def convert_lightsheet_to_bdv(
     root: str,
-    channel_folders: Dict[str, str],
-    image_file_name_pattern: str,
     out_path: str,
+    attribute_parser: callable = flamingo_filename_parser,
+    attribute_names: Optional[Dict[str, Dict[int, str]]] = None,
     metadata_file_name_pattern: Optional[str] = None,
     metadata_root: Optional[str] = None,
     metadata_type: str = "flamingo",
-    center_tiles: bool = True,
+    center_tiles: bool = False,
     resolution: Optional[List[float]] = None,
     unit: Optional[str] = None,
     scale_factors: Optional[List[List[int]]] = None,
@@ -125,24 +166,14 @@ def convert_lightsheet_to_bdv(
     The data is converted to the bdv-n5 file format and can be opened with BigDataViewer
     or BigStitcher. This function is written with data layout and metadata of flamingo
     microscopes in mind, but could potentially be adapted to other data formats.
-    We currently don't support multiple timepoints, but support can be added if needed.
 
-    This function assumes the following input data format:
-    <ROOT>/<CHANNEL1>/<TILE1>.tif
-                     /<TILE2>.tif
-                     /...
-          /<CHANNEL2>/<TILE1>.tif
-                     /<TILE2>.tif
-                     /...
+    The timepoint and attributes (e.g. channel, tile) of each tif are parsed from its path by 'attribute_parser'.
 
     Args:
-        root: Folder that contains the folders with tifs for each channel.
-        channel_folders: Dictionary that maps the name of each channel to the corresponding folder name
-            underneath the root folder.
-        image_file_name_pattern: The pattern for the names of the tifs that contain the data.
-            This expects a glob pattern (name with '*') to select the corresponding tif files .
-            The simplest pattern that should work in most cases is '*.tif'.
+        root: Folder that contains the image data stored as tifs.
+            This function will take into account all tif files in folders beneath this root directory.
         out_path: Output path where the converted data is saved.
+        attribute_parser: Function that maps a file path and 'attribute_names' to timepoint, attributes and attribute id.
         metadata_file_name_pattern: The pattern for the names of files that contain the metadata.
             For flamingo metadata the following pattern should work: '*_Settings.txt'.
         metadata_root: Different root folder for the metadata. By default 'root' is used here as well.
@@ -170,60 +201,73 @@ def convert_lightsheet_to_bdv(
     if ext == "":
         out_path = str(Path(out_path).with_suffix(".n5"))
 
-    # Iterate over the channels
-    for channel_id, (channel_name, channel_folder) in enumerate(channel_folders.items()):
-
-        # Get all the image file paths for this channel.
-        tile_pattern = os.path.join(root, channel_folder, image_file_name_pattern)
-        file_paths = sorted(glob(tile_pattern))
-        assert len(file_paths) > 0, tile_pattern
+    files = sorted(glob(os.path.join(root, "**/*.tif"), recursive=True))
+    if metadata_file_name_pattern is None:
+        metadata_files = [None] * len(files)
+        offset = None
+    else:
+        metadata_files = sorted(
+            glob(
+                os.path.join(root if metadata_root is None else metadata_root, f"**/{metadata_file_name_pattern}"),
+                recursive=True
+            )
+        )
+        assert len(metadata_files) == len(files)
+
+        if center_tiles:
+            start_positions = []
+            for mpath in metadata_files:
+                start_positions.append(_read_start_position_flamingo(mpath))
+            offset = np.min(start_positions, axis=0)
+        else:
+            offset = None
+
+    next_setup_id = 0
+    attrs_to_setups = {}
+
+    for file_path, metadata_file in zip(files, metadata_files):
+        timepoint, attributes, aid = attribute_parser(file_path, attribute_names)
+
+        if aid in attrs_to_setups:
+            setup_id = attrs_to_setups[aid]
+        else:
+            attrs_to_setups[aid] = next_setup_id
+            setup_id = next_setup_id
+            next_setup_id += 1
 
         # Read the metadata if it was given.
-        if metadata_file_name_pattern is None:  # No metadata given.
+        if metadata_file is None:  # No metadata given.
             # We don't use any tile transformation.
-            tile_transformations = [None] * len(file_paths)
+            tile_transformation = None
             # Set resolution and unit to their default values if they were not passed.
             if resolution is None:
                 resolution = [1.0, 1.0, 1.0]
             if unit is None:
                 unit = "pixel"
 
         else:  # We have metadata and read it.
-            metadata_pattern = os.path.join(
-                root if metadata_root is None else metadata_root,
-                channel_folder, metadata_file_name_pattern
-            )
-            metadata_paths = sorted(glob(metadata_pattern))
-            assert len(metadata_paths) == len(file_paths)
-            resolution, unit, tile_transformations = read_metadata_flamingo(metadata_paths, center_tiles)
-
-        if channel_name is None or channel_name.strip() == "": #channel name is empty, assign channel id as name
-            channel_name = str(channel_id)
-
-        for tile_id, (file_path, tile_transformation) in enumerate(zip(file_paths, tile_transformations)):
-
-            # Try to memmap the data. If that doesn't work fall back to loading it into memory.
-            try:
-                data = tifffile.memmap(file_path, mode="r")
-            except ValueError:
-                print(f"Could not memmap the data from {file_path}. Fall back to load it into memory.")
-                data = tifffile.imread(file_path)
-
-            print("Converting channel", channel_id, "tile", tile_id, "from", file_path, "with shape", data.shape)
-            if scale_factors is None:
-                scale_factors = derive_scale_factors(data.shape)
-
-            pybdv.make_bdv(
-                data, out_path,
-                downscale_factors=scale_factors, downscale_mode="mean",
-                n_threads=n_threads,
-                resolution=resolution, unit=unit,
-                attributes={
-                    "channel": {"id": channel_id, "name": channel_name}, "tile": {"id": tile_id, "name": str(tile_id)},
-                    "angle": {"id": 0, "name": "0"}, "illumination": {"id": 0, "name": "0"}
-                },
-                affine=tile_transformation,
-            )
+            resolution, unit, tile_transformation = read_metadata_flamingo(metadata_file, offset)
+
+        print(f"Converting tp={timepoint}, channel={attributes['channel']}, tile={attributes['tile']}")
+        try:
+            data = tifffile.memmap(file_path, mode="r")
+        except ValueError:
+            print(f"Could not memmap the data from {file_path}. Fall back to load it into memory.")
+            data = tifffile.imread(file_path)
+
+        if scale_factors is None:
+            scale_factors = derive_scale_factors(data.shape)
+
+        pybdv.make_bdv(
+            data, out_path,
+            downscale_factors=scale_factors, downscale_mode="mean",
+            n_threads=n_threads,
+            resolution=resolution, unit=unit,
+            attributes=attributes,
+            affine=tile_transformation,
+            timepoint=timepoint,
+            setup_id=setup_id,
+        )
 
 
 # TODO expose more arguments via CLI.

diff --git a/flamingo_tools/mobie.py b/flamingo_tools/mobie.py
@@ -0,0 +1,77 @@
+import os
+import tempfile
+from typing import Tuple
+
+from mobie import add_bdv_image, add_segmentation
+from mobie.metadata.dataset_metadata import read_dataset_metadata
+
+
+# TODO refactor to mobie utils
+def _source_exists(mobie_project, mobie_dataset, source_name):
+    """Check whether 'source_name' is among the sources in the metadata of the given dataset."""
+    dataset_metadata = read_dataset_metadata(os.path.join(mobie_project, mobie_dataset))
+    known_sources = dataset_metadata.get("sources", {})
+    return source_name in known_sources
+
+
+def add_raw_to_mobie(
+    mobie_project: str,
+    mobie_dataset: str,
+    source_name: str,
+    xml_path: str,
+    skip_existing: bool = True,
+    setup_id: int = 0,
+):
+    """Add one setup of a BigDataViewer n5 file as image source to a MoBIE dataset.
+
+    Args:
+        mobie_project: The root folder of the MoBIE project.
+        mobie_dataset: The name of the dataset in the MoBIE project.
+        source_name: The name of the new image source.
+        xml_path: The path to the xml file of the BigDataViewer data.
+        skip_existing: Whether to skip a source that exists already. Replacing it is not implemented.
+        setup_id: The id of the setup in the BigDataViewer file that is added.
+    """
+    if _source_exists(mobie_project, mobie_dataset, source_name):
+        if not skip_existing:
+            raise NotImplementedError(f"Replacing the existing source {source_name} is not implemented.")
+        print(f"Source {source_name} already exists in {mobie_project}:{mobie_dataset}.")
+        print("Conversion to mobie will be skipped.")
+        return
+    with tempfile.TemporaryDirectory() as tmpdir:
+        add_bdv_image(
+            xml_path=xml_path, root=mobie_project, dataset_name=mobie_dataset, image_name=source_name,
+            tmp_folder=tmpdir, file_format="bdv.n5", setup_ids=[setup_id],
+        )
+
+
+def add_segmentation_to_mobie(
+    mobie_project: str,
+    mobie_dataset: str,
+    source_name: str,
+    segmentation_path: str,
+    segmentation_key: str,
+    resolution: Tuple[float, float, float],
+    unit: str,
+    scale_factors: Tuple[Tuple[int, int, int], ...],
+    chunks: Tuple[int, int, int],
+    skip_existing: bool = True,
+):
+    """Add a segmentation as source to a MoBIE dataset, stored in the 'bdv.n5' format.
+
+    The data is read from 'segmentation_key' in the file 'segmentation_path'; 'unit' is the unit of
+    'resolution'. An existing source is skipped (skip_existing=True) or raises NotImplementedError.
+    """
+    if _source_exists(mobie_project, mobie_dataset, source_name):
+        if not skip_existing:
+            raise NotImplementedError(f"Replacing the existing source {source_name} is not implemented.")
+        print(f"Source {source_name} already exists in {mobie_project}:{mobie_dataset}.")
+        print("Conversion to mobie will be skipped.")
+        return
+    with tempfile.TemporaryDirectory() as tmpdir:
+        add_segmentation(
+            input_path=segmentation_path, input_key=segmentation_key,
+            root=mobie_project, dataset_name=mobie_dataset, segmentation_name=source_name,
+            resolution=resolution, unit=unit, scale_factors=scale_factors, chunks=chunks,
+            file_format="bdv.n5", tmp_folder=tmpdir,
+        )
diff --git a/flamingo_tools/test_data.py b/flamingo_tools/test_data.py
@@ -7,7 +7,7 @@
 # TODO add metadata
 def create_test_data(root, size=256, n_channels=2, n_tiles=4):
     channel_folders = [f"channel{chan_id}" for chan_id in range(n_channels)]
-    file_name_pattern = "volume_R%i_C%i.tif"
+    file_name_pattern = "volume_R%i_C%i_I0.tif"
     for chan_id, channel_folder in enumerate(channel_folders):
         out_folder = os.path.join(root, channel_folder)
         os.makedirs(out_folder, exist_ok=True)

diff --git a/flamingo_tools/version.py b/flamingo_tools/version.py
@@ -0,0 +1 @@
+__version__ = "0.0.1"