computational-cell-analytics · constantinpape · Oct 8, 2024 · Aug 31, 2024 · Sep 4, 2024 · Sep 8, 2024
diff --git a/environment.yaml b/environment.yaml
@@ -6,6 +6,7 @@ channels:
 
 dependencies:
   - cpuonly
+  - cluster_tools
   - scikit-image
   - pybdv
   - pytorch

diff --git a/flamingo_tools/data_conversion.py b/flamingo_tools/data_conversion.py
@@ -10,6 +10,10 @@
 import pybdv
 import tifffile
 
+from cluster_tools.utils.volume_utils import write_format_metadata
+from elf.io import open_file
+from skimage.transform import rescale
+
 
 def _read_resolution_and_unit_flamingo(mdata_path):
     resolution = None
@@ -101,6 +105,39 @@ def derive_scale_factors(shape):
     return scale_factors
 
 
+def _to_ome_zarr(data, out_path, scale_factors, timepoint, setup_id, attributes, unit, resolution):
+    """Write a volume and its downscaled versions to an ome.zarr container.
+
+    The levels are stored under "setup{setup_id}/timepoint{timepoint}/s{level}".
+    Level "s0" holds the input data, every further level is computed by
+    rescaling the level before it.
+
+    Args:
+        data: The array to write; its shape and dtype define level "s0".
+        out_path: Path of the container, which is opened in append mode.
+        scale_factors: One sequence of per-axis downscaling factors per level,
+            applied relative to the previous level.
+        timepoint: Timepoint index used in the dataset keys.
+        setup_id: Setup index used in the dataset keys.
+        attributes: Mapping that is merged into the attributes of the setup group.
+        unit: Unit that is passed on to the ome.zarr metadata.
+        resolution: Resolution that is passed on to the ome.zarr metadata.
+    """
+    base_key = f"setup{setup_id}/timepoint{timepoint}"
+    thread_count = mp.cpu_count()
+    block_shape = (128, 128, 128)
+
+    with open_file(out_path, "a") as container:
+
+        def store_level(level, volume):
+            # Create the gzip-compressed dataset for this level and fill it.
+            target = container.create_dataset(
+                f"{base_key}/s{level}", shape=volume.shape, compression="gzip",
+                chunks=block_shape, dtype=volume.dtype,
+            )
+            target.n_threads = thread_count
+            target[:] = volume
+
+        # Write the base dataset.
+        store_level(0, data)
+
+        # TODO parallelized implementation.
+        # Do downscaling. Each level is derived from the previous one,
+        # so `data` is deliberately overwritten in every iteration.
+        for level, scale_factor in enumerate(scale_factors, 1):
+            shrink = [1.0 / factor for factor in scale_factor]
+            data = rescale(data, shrink, preserve_range=True).astype(data.dtype)
+            store_level(level, data)
+
+        container[f"setup{setup_id}"].attrs.update(attributes)
+
+    # Write the ome zarr metadata.
+    write_format_metadata(
+        "ome.zarr", out_path, {"unit": unit, "resolution": resolution},
+        scale_factors=scale_factors, prefix=base_key,
+    )
+
+
 def flamingo_filename_parser(file_path, name_mapping):
     filename = os.path.basename(file_path)
 
@@ -198,8 +235,11 @@ def convert_lightsheet_to_bdv(
 
     # Make sure we convert to n5, in case no extension is passed.
     ext = os.path.splitext(out_path)[1]
+    convert_to_ome_zarr = False
     if ext == "":
         out_path = str(Path(out_path).with_suffix(".n5"))
+    elif ext == ".zarr":
+        convert_to_ome_zarr = True
 
     files = sorted(glob(os.path.join(root, "**/*.tif"), recursive=True))
     if metadata_file_name_pattern is None:
@@ -258,16 +298,19 @@ def convert_lightsheet_to_bdv(
         if scale_factors is None:
             scale_factors = derive_scale_factors(data.shape)
 
-        pybdv.make_bdv(
-            data, out_path,
-            downscale_factors=scale_factors, downscale_mode="mean",
-            n_threads=n_threads,
-            resolution=resolution, unit=unit,
-            attributes=attributes,
-            affine=tile_transformation,
-            timepoint=timepoint,
-            setup_id=setup_id,
-        )
+        if convert_to_ome_zarr:
+            _to_ome_zarr(data, out_path, scale_factors, timepoint, setup_id, attributes, unit, resolution)
+        else:
+            pybdv.make_bdv(
+                data, out_path,
+                downscale_factors=scale_factors, downscale_mode="mean",
+                n_threads=n_threads,
+                resolution=resolution, unit=unit,
+                attributes=attributes,
+                affine=tile_transformation,
+                timepoint=timepoint,
+                setup_id=setup_id,
+            )
 
 
 # TODO expose more arguments via CLI.

diff --git a/scripts/ome_challenge/.gitignore b/scripts/ome_challenge/.gitignore
@@ -0,0 +1,2 @@
+credentials.json
+export_minio.sh
diff --git a/scripts/ome_challenge/README.md b/scripts/ome_challenge/README.md
@@ -0,0 +1,6 @@
+# OME-Challenge
+
+Scripts for converting flamingo data for the [OME-NGFF-Challenge](https://forum.image.sc/t/ome2024-ngff-challenge/97363):
+- `convert_data.py`: to convert the data from the flamingo tif format to ome-zarr-v3 (via ome-zarr-v2 and the challenge converter tool)
+- `create_metadata.py`: to add additional top-level metadata to keep track of the different tiles, timepoints etc.
+- `upload_data.py`: to upload the data to s3 (needs credentials not stored in this repository)
diff --git a/scripts/ome_challenge/convert_data.py b/scripts/ome_challenge/convert_data.py
@@ -0,0 +1,62 @@
+import os
+from glob import glob
+from subprocess import run
+
+import zarr
+from flamingo_tools.data_conversion import convert_lightsheet_to_bdv
+
+ROOT = "/mnt/lustre-emmy-hdd/usr/u12086/data/flamingo"
+
+
+def convert_to_ome_zarr_v2(name):
+    """Convert the flamingo dataset `name` below ROOT to an ome.zarr container.
+
+    The input is read from "ROOT/<name>" and the result is written to
+    "ROOT/ngff-v2/<name>.ome.zarr".
+    """
+    source_dir = os.path.join(ROOT, name)
+    assert os.path.exists(source_dir)
+
+    target_dir = os.path.join(ROOT, "ngff-v2")
+    os.makedirs(target_dir, exist_ok=True)
+
+    convert_lightsheet_to_bdv(
+        source_dir, out_path=os.path.join(target_dir, f"{name}.ome.zarr")
+    )
+
+
+def convert_to_ome_zarr_v3(name):
+    """Resave the ome.zarr container of dataset `name` with the challenge converter.
+
+    The input is read from "ROOT/ngff-v2/<name>.ome.zarr" and the output is
+    written to "ROOT/ngff-v3/<name>.ome.zarr". For every "setup*" folder a group
+    with the same name and attributes is created in the output, and every
+    "timepoint*" folder inside it is resaved with `ome2024-ngff-challenge`.
+
+    Raises:
+        subprocess.CalledProcessError: If the converter exits with a non-zero status.
+    """
+    input_path = os.path.join(ROOT, "ngff-v2", f"{name}.ome.zarr")
+
+    output_root = os.path.join(ROOT, "ngff-v3")
+    os.makedirs(output_root, exist_ok=True)
+    output_path = os.path.join(output_root, f"{name}.ome.zarr")
+
+    f_in = zarr.v2.open(store=input_path, mode="r")
+    f_out = zarr.open_group(store=output_path, mode="a")
+
+    for sfolder in sorted(glob(os.path.join(input_path, "setup*"))):
+        setup = os.path.basename(sfolder)
+
+        # Copy over the attributes for this set-up.
+        f_out.create_group(name=setup)
+        f_out[setup].attrs.update(dict(f_in[setup].attrs.items()))
+
+        for tfolder in sorted(glob(os.path.join(sfolder, "timepoint*"))):
+            timepoint = os.path.basename(tfolder)
+            print("Converting", setup, timepoint)
+            out = os.path.join(output_path, setup, timepoint)
+            cmd = [
+                "ome2024-ngff-challenge", "resave", "--cc-by", tfolder, out,
+                "--output-overwrite", "--output-shards=512,512,512"
+            ]
+            # Stop on a failed conversion instead of silently leaving
+            # a missing or incomplete timepoint in the output.
+            run(cmd, check=True)
+
+
+def main():
+    """Run both conversion steps for the currently selected dataset."""
+    # Other datasets that can be selected here:
+    # "Platynereis-H2B-TL", "Zebrafish-XSPIM-multiview"
+    dataset = "Zebrafish-H2B-short-timelapse"
+    # The second step reads the output of the first one, so the order matters.
+    for convert in (convert_to_ome_zarr_v2, convert_to_ome_zarr_v3):
+        convert(dataset)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/ome_challenge/create_csv.py b/scripts/ome_challenge/create_csv.py
@@ -0,0 +1,47 @@
+import os
+import numpy as np
+import pandas as pd
+
+ROOT = "/mnt/lustre-emmy-hdd/usr/u12086/data/flamingo/ngff-v3"
+URL_ROOT = "https://radosgw.public.os.wwu.de/n4bi-goe"
+
+
+def get_directory_size(directory):
+    """Sum up the sizes of all files found below `directory`.
+
+    Returns:
+        A tuple with the total size in bytes and a string that gives the size
+        in units of 1e9 bytes, rounded to two decimals and followed by " GB".
+    """
+    candidates = (
+        os.path.join(parent, filename)
+        for parent, _, filenames in os.walk(directory)
+        for filename in filenames
+    )
+    # Entries that are not files (e.g. broken symlinks) do not count.
+    total_size = sum(os.path.getsize(fp) for fp in candidates if os.path.isfile(fp))
+
+    gigabytes = np.round(total_size / 1e9, 2)
+    return total_size, f"{gigabytes} GB"
+
+
+# The datasets to list: folder names below ROOT, also used as the last part of the URLs.
+names = [
+    "Platynereis-H2B-TL.ome.zarr",
+    "Zebrafish-H2B-short-timelapse.ome.zarr",
+    "Zebrafish-XSPIM-multiview.ome.zarr",
+]
+
+urls = [f"{URL_ROOT}/{name}" for name in names]
+
+# Measure every dataset once, then split the results into byte counts and readable text.
+sizes = [get_directory_size(os.path.join(ROOT, name)) for name in names]
+written = [size for size, _ in sizes]
+written_human_readable = [size_rd for _, size_rd in sizes]
+
+# One row per dataset.
+df = pd.DataFrame({
+    "url": urls, "written": written, "written_human_readable": written_human_readable,
+})
+df.to_csv("flamingo.csv", index=False)
diff --git a/scripts/ome_challenge/create_metadata.py b/scripts/ome_challenge/create_metadata.py
@@ -0,0 +1,60 @@
+# Create additional top-level metadata.
+
+import argparse
+import os
+import json
+from glob import glob
+
+
+def get_series(path):
+    """Return the sorted "timepoint*" entries one folder level below `path`.
+
+    The entries are given relative to `path`, e.g. "setup0/timepoint0".
+    """
+    # glob is called without recursive=True, so "**" matches exactly one folder level.
+    return [
+        os.path.relpath(match, path)
+        for match in sorted(glob(os.path.join(path, "**/timepoint*")))
+    ]
+
+
+def create_metadata(path):
+    """Write the additional group metadata for the container at `path`.
+
+    Two files are written: "<path>/zarr.json" with the "bioformats2raw.layout"
+    entry and "<path>/OME/zarr.json" with the list of series returned by
+    `get_series`. Existing files are overwritten.
+    """
+
+    def write_group_metadata(folder, ome_attributes):
+        # Wrap the ome attributes into a zarr v3 group description
+        # and store it as "zarr.json" in the given folder.
+        group_metadata = {
+            "attributes": {"ome": ome_attributes},
+            "zarr_format": 3,
+            "node_type": "group",
+        }
+        with open(os.path.join(folder, "zarr.json"), "w") as f:
+            json.dump(group_metadata, f)
+
+    # TOP LEVEL METADATA
+    # This can be safely over-written.
+    write_group_metadata(path, {"version": "0.5", "bioformats2raw.layout": 3})
+
+    # OME METADATA
+    series = get_series(path)
+    ome_folder = os.path.join(path, "OME")
+    os.makedirs(ome_folder, exist_ok=True)
+    write_group_metadata(ome_folder, {"version": "0.5", "series": series})
+
+
+def main():
+    """Read the container path from the command line and write its metadata."""
+    cli = argparse.ArgumentParser()
+    cli.add_argument("path")
+    create_metadata(cli.parse_args().path)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/ome_challenge/download_data.py b/scripts/ome_challenge/download_data.py
@@ -0,0 +1,61 @@
+import os
+
+from pydrive2.auth import GoogleAuth
+from pydrive2.drive import GoogleDrive
+
+
+IDS = {
+    "Platynereis-H2B-TL": "1jGwaJ62w80GYo5I_Jcb3O_g7y4RKEhjI",
+    "Zebrafish-XSPIM-multiview": "175hZRrUNWM2UzY0wzXPFjuFZ5QKUN-tm",  # noqa
+    "Zebrafish-H2B-short-timelapse": "18fGJwQ0i5pBHQO8FHUuFcxqWeD7uwiBM",  # noqa
+    # This doesn't work.
+    # "Zebrafish-H2B-short-4views": "1iyMMCZO1rmamVGNVThJWElJKSKXROscF"
+}
+
+# ROOT = "/mnt/lustre-grete/usr/u12086/data/flamingo"
+ROOT = "/mnt/lustre-emmy-hdd/usr/u12086/data/flamingo"
+
+
+def download_folder(drive, name):
+    """Download the google drive folder registered for `name` to "ROOT/<name>".
+
+    Args:
+        drive: The drive client; its `ListFile` method is used to list folders.
+        name: Name of the dataset; must be a key of IDS.
+
+    Raises:
+        KeyError: If `name` is not registered in IDS.
+    """
+    try:
+        folder_id = IDS[name]
+    except KeyError:
+        raise KeyError(f"Unknown dataset {name!r}, available datasets: {sorted(IDS)}") from None
+
+    _download_folder_contents(drive, folder_id, os.path.join(ROOT, name))
+
+
+def _download_folder_contents(drive, folder_id, destination_folder):
+    """Recursively download the content of the drive folder `folder_id`."""
+    # Also creates missing parent folders, e.g. ROOT on the first call.
+    os.makedirs(destination_folder, exist_ok=True)
+
+    folder_query = f"'{folder_id}' in parents and trashed=false"
+    for entry in drive.ListFile({'q': folder_query}).GetList():
+        target = os.path.join(destination_folder, entry['title'])
+        if entry['mimeType'] == 'application/vnd.google-apps.folder':
+            # Sub-folders are addressed by their drive id, not by a dataset name,
+            # so the recursion must not go through the IDS lookup again.
+            _download_folder_contents(drive, entry['id'], target)
+        else:
+            print(f"Downloading {entry['title']} to {destination_folder}")
+            entry.GetContentFile(target)
+
+
+def get_drive():
+    """Create a GoogleDrive client from the credentials saved in "credentials.json"."""
+    auth = GoogleAuth()
+    auth.LoadCredentialsFile("credentials.json")  # Use the saved credentials
+    # Refresh an expired access token, otherwise authorize with the loaded credentials.
+    renew = auth.Refresh if auth.access_token_expired else auth.Authorize
+    renew()
+    return GoogleDrive(auth)
+
+
+def main():
+    """Download all datasets that are registered in IDS."""
+    drive = get_drive()
+
+    # Only names registered in IDS can be downloaded. "Zebrafish-H2B-short-4views"
+    # is disabled there, so requesting it here would fail with a KeyError.
+    for name in IDS:
+        download_folder(drive, name=name)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/ome_challenge/upload_data.py b/scripts/ome_challenge/upload_data.py
@@ -0,0 +1,36 @@
+import os
+from subprocess import run
+
+ROOT = "/mnt/lustre-emmy-hdd/usr/u12086/data/flamingo"
+
+
+def upload_data(name):
+    """Upload the converted dataset "ROOT/ngff-v3/<name>" with `mc-client`.
+
+    Args:
+        name: Folder name of the dataset below "ROOT/ngff-v3".
+
+    Raises:
+        AssertionError: If the dataset folder does not exist.
+        subprocess.CalledProcessError: If the recursive copy exits with a non-zero status.
+    """
+    data_root = os.path.join(ROOT, "ngff-v3", name)
+    assert os.path.exists(data_root), data_root
+
+    bucket_name = "n4bi-goe"
+    target = f"challenge/{bucket_name}/{name}/"
+
+    # Create the bucket.
+    # NOTE(review): the exit status stays unchecked here, presumably this step
+    # fails when the target exists already - confirm before making it strict.
+    run(["mc-client", "mb", target])
+
+    # Run the copy. A failed upload must not pass silently.
+    run(["mc-client", "cp", "--recursive", f"{data_root}/", target], check=True)
+
+
+def main():
+    """Upload the currently selected dataset."""
+    # Other datasets that can be selected here:
+    # "Platynereis-H2B-TL.ome.zarr", "Zebrafish-H2B-short-timelapse.ome.zarr"
+    dataset = "Zebrafish-XSPIM-multiview.ome.zarr"
+    upload_data(dataset)
+
+
+if __name__ == "__main__":
+    main()
-Original file line number
+Diff line change
@@ Expand Up / @@ -6,6 +6,7 @@ channels: @@
     dependencies:
       - cpuonly
+      - cluster_tools
       - scikit-image
       - pybdv
       - pytorch
@@ Expand Down @@