
Commit 7869ee9

Add initial data conversion scripts
1 parent 4703502 commit 7869ee9

4 files changed: +154 -11 lines changed

flamingo_tools/data_conversion.py

Lines changed: 58 additions & 11 deletions
@@ -9,6 +9,10 @@
 import pybdv
 import tifffile
 
+from cluster_tools.utils.volume_utils import write_format_metadata
+from elf.io import open_file
+from skimage.transform import rescale
+
 
 def _read_resolution_and_unit_flamingo(mdata_path):
     resolution = None
@@ -106,6 +110,51 @@ def derive_scale_factors(shape):
     return scale_factors
 
 
+def _to_bdv(
+    data, out_path, scale_factors, n_threads, resolution, unit, channel_id, channel_name, tile_id, tile_transformation
+):
+    pybdv.make_bdv(
+        data, out_path,
+        downscale_factors=scale_factors, downscale_mode="mean",
+        n_threads=n_threads,
+        resolution=resolution, unit=unit,
+        attributes={
+            "channel": {"id": channel_id, "name": channel_name}, "tile": {"id": tile_id, "name": str(tile_id)},
+            "angle": {"id": 0, "name": "0"}, "illumination": {"id": 0, "name": "0"}
+        },
+        affine=tile_transformation,
+    )
+
+
+def _to_ome_zarr(
+    data, out_path, scale_factors, n_threads, resolution, unit, channel_id, channel_name, tile_id, tile_transformation
+):
+    # Write the base dataset.
+    base_key = f"c{channel_id}-t{tile_id}"
+    chunks = (128, 128, 128)
+    with open_file(out_path, "a") as f:
+        ds = f.create_dataset(f"{base_key}/s0", shape=data.shape, compression='gzip',
+                              chunks=chunks, dtype=data.dtype)
+        ds.n_threads = n_threads
+        ds[:] = data
+
+        # TODO parallelized implementation.
+        # Do downscaling.
+        for level, scale_factor in enumerate(scale_factors, 1):
+            inv_scale = [1.0 / sc for sc in scale_factor]
+            data = rescale(data, inv_scale, preserve_range=True).astype(data.dtype)
+            ds = f.create_dataset(f"{base_key}/s{level}", shape=data.shape, compression='gzip',
+                                  chunks=chunks, dtype=data.dtype)
+            ds.n_threads = n_threads
+            ds[:] = data
+
+    # Write the ome zarr metadata.
+    metadata_dict = {"unit": unit, "resolution": resolution}
+    write_format_metadata(
+        "ome.zarr", out_path, metadata_dict, scale_factors=scale_factors, prefix=base_key
+    )
+
+
 def convert_lightsheet_to_bdv(
     root: str,
     channel_folders: Dict[str, str],
@@ -169,6 +218,11 @@ def convert_lightsheet_to_bdv(
     ext = os.path.splitext(out_path)[1]
     if ext == "":
         out_path = str(Path(out_path).with_suffix(".n5"))
+        conversion_function = _to_bdv
+    elif ext == ".zarr":
+        conversion_function = _to_ome_zarr
+    else:
+        conversion_function = _to_bdv
 
     # Iterate over the channels
     for channel_id, (channel_name, channel_folder) in enumerate(channel_folders.items()):
@@ -197,7 +251,7 @@ def convert_lightsheet_to_bdv(
         assert len(metadata_paths) == len(file_paths)
         resolution, unit, tile_transformations = read_metadata_flamingo(metadata_paths, center_tiles)
 
-        if channel_name is None or channel_name.strip() == "": #channel name is empty, assign channel id as name
+        if channel_name is None or channel_name.strip() == "": # channel name is empty, assign channel id as name
             channel_name = str(channel_id)
 
         for tile_id, (file_path, tile_transformation) in enumerate(zip(file_paths, tile_transformations)):
@@ -213,16 +267,9 @@ def convert_lightsheet_to_bdv(
             if scale_factors is None:
                 scale_factors = derive_scale_factors(data.shape)
 
-            pybdv.make_bdv(
-                data, out_path,
-                downscale_factors=scale_factors, downscale_mode="mean",
-                n_threads=n_threads,
-                resolution=resolution, unit=unit,
-                attributes={
-                    "channel": {"id": channel_id, "name": channel_name}, "tile": {"id": tile_id, "name": str(tile_id)},
-                    "angle": {"id": 0, "name": "0"}, "illumination": {"id": 0, "name": "0"}
-                },
-                affine=tile_transformation,
+            conversion_function(
+                data, out_path, scale_factors, n_threads, resolution, unit,
+                channel_id, channel_name, tile_id, tile_transformation
             )
 
 
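With this change the writer backend is chosen from the extension of out_path: a ".zarr" suffix routes through _to_ome_zarr, anything else (including a missing extension, which is normalized to ".n5") routes through _to_bdv. Below is a minimal usage sketch of that dispatch, assuming a hypothetical Flamingo dataset layout; the input path, output paths, channel name, and file pattern are placeholders, and only the keyword arguments mirror the call made in the new conversion script.

from flamingo_tools.data_conversion import convert_lightsheet_to_bdv

# Hypothetical single-channel layout: key = channel name, value = channel sub-folder
# (the new script passes "" to read images directly from the input root).
input_root = "/data/flamingo/Platynereis-H2B-TL"
channel_folders = {"membrane": ""}

# No extension on out_path -> ".n5" is appended and _to_bdv writes BigDataViewer/n5.
convert_lightsheet_to_bdv(
    input_root, channel_folders,
    image_file_name_pattern="*_t000000_*_C01_I0_*.tif",
    out_path="/data/converted/platy-bdv",
)

# A ".zarr" extension selects _to_ome_zarr and writes the multiscale zarr layout instead.
convert_lightsheet_to_bdv(
    input_root, channel_folders,
    image_file_name_pattern="*_t000000_*_C01_I0_*.tif",
    out_path="/data/converted/platy.ome.zarr",
)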
scripts/ome_challenge/.gitignore

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+credentials.json
Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
+import os
+import sys
+
+ROOT = "/mnt/lustre-emmy-hdd/usr/u12086/data/flamingo"
+
+
+def convert_to_ome_zarr_v2(name):
+    sys.path.append("../..")
+    from flamingo_tools.data_conversion import convert_lightsheet_to_bdv
+
+    input_root = os.path.join(ROOT, name)
+    assert os.path.exists(input_root)
+
+    output_root = os.path.join(ROOT, "ngff-v2")
+    os.makedirs(output_root, exist_ok=True)
+
+    output_path = os.path.join(output_root, f"{name}.ome.zarr")
+
+    # Number of timepoints:
+    # ntp = 10
+    ntp = 1  # for testing
+
+    channel_folders = {f"t{tp:02}": "" for tp in range(ntp)}
+    convert_lightsheet_to_bdv(
+        input_root, channel_folders, image_file_name_pattern="*_t000000_*_C01_I0_*.tif",
+        out_path=output_path,
+    )
+
+
+def convert_to_ome_zarr_v3(name):
+    pass
+
+
+def main():
+    name = "Platynereis-H2B-TL"
+    convert_to_ome_zarr_v2(name)
+
+
+if __name__ == "__main__":
+    main()
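_to_ome_zarr stores each channel/tile pair under a group named c{channel_id}-t{tile_id}, with one array per scale level (s0, s1, ...). A short inspection sketch for the output of convert_to_ome_zarr_v2 follows, assuming the zarr package is installed and the conversion ran with the paths above; "c0-t0" is the group expected for channel 0, tile 0.

import os
import zarr

ROOT = "/mnt/lustre-emmy-hdd/usr/u12086/data/flamingo"
path = os.path.join(ROOT, "ngff-v2", "Platynereis-H2B-TL.ome.zarr")

f = zarr.open(path, mode="r")
# One group per channel/tile combination, e.g. "c0-t0".
print("groups:", list(f.keys()))
# Each group holds the scale pyramid written by _to_ome_zarr.
for level, arr in f["c0-t0"].arrays():
    print(level, arr.shape, arr.dtype, arr.chunks)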
Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,55 @@
+import os
+
+from pydrive2.auth import GoogleAuth
+from pydrive2.drive import GoogleDrive
+
+
+IDS = {
+    "Platynereis-H2B-TL": "1jGwaJ62w80GYo5I_Jcb3O_g7y4RKEhjI",
+    "Zebrafish-XSPIM-multiview": "https://drive.google.com/drive/folders/175hZRrUNWM2UzY0wzXPFjuFZ5QKUN-tm?usp=drive_link"  # noqa
+}
+
+# ROOT = "/mnt/lustre-grete/usr/u12086/data/flamingo"
+ROOT = "/mnt/lustre-emmy-hdd/usr/u12086/data/flamingo"
+
+
+def download_folder(drive, name, folder_id=None, destination_folder=None):
+    # Default to the top-level drive folder and local target for this dataset name.
+    destination_folder = destination_folder or os.path.join(ROOT, name)
+    folder_id = folder_id or IDS[name]
+    os.makedirs(destination_folder, exist_ok=True)
+
+    folder_query = f"'{folder_id}' in parents and trashed=false"
+    file_list = drive.ListFile({'q': folder_query}).GetList()
+
+    for file in file_list:
+        if file['mimeType'] == 'application/vnd.google-apps.folder':
+            folder_name = os.path.join(destination_folder, file['title'])
+            # Recurse into the sub-folder, reusing the same drive handle.
+            download_folder(drive, name, folder_id=file['id'], destination_folder=folder_name)
+        else:
+            print(f"Downloading {file['title']} to {destination_folder}")
+            # breakpoint()
+            file.GetContentFile(os.path.join(destination_folder, file['title']))
+
+
+def get_drive():
+    gauth = GoogleAuth()
+    gauth.LoadCredentialsFile("credentials.json")  # Use the saved credentials
+    if gauth.access_token_expired:
+        gauth.Refresh()
+    else:
+        gauth.Authorize()
+    drive = GoogleDrive(gauth)
+    return drive
+
+
+def main():
+    drive = get_drive()
+
+    # download_from_gdrive(name="Zebrafish-XSPIM-multiview")
+    download_folder(drive, name="Platynereis-H2B-TL")
+
+
+if __name__ == "__main__":
+    main()
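get_drive relies on a previously saved credentials.json, which the new .gitignore keeps out of the repository. Below is a one-time bootstrap sketch for creating that file with PyDrive2, assuming a client_secrets.json from the Google API console sits in the working directory; this helper is not part of the commit.

from pydrive2.auth import GoogleAuth


def create_credentials():
    # Runs the browser-based OAuth flow; needs client_secrets.json in the
    # current directory (downloaded from the Google API console).
    gauth = GoogleAuth()
    gauth.LocalWebserverAuth()
    # Save the token so get_drive() can reuse it without re-authenticating.
    gauth.SaveCredentialsFile("credentials.json")


if __name__ == "__main__":
    create_credentials()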

0 commit comments