def _to_ome_zarr(data, out_path, scale_factors, timepoint, setup_id, attributes, unit, resolution):
    """Write a volume as a multi-scale pyramid and attach ome.zarr metadata.

    The full-resolution data is stored under ``setup{setup_id}/timepoint{timepoint}/s0``
    and one further dataset ``s1``, ``s2``, ... is written per entry of
    ``scale_factors``, each obtained by rescaling the previous level.

    Args:
        data: The volume to write. Assumed to be 3D, since the chunk shape
            used below has three entries -- TODO confirm with callers.
        out_path: Path of the container, opened in append mode.
        scale_factors: One sequence of per-axis downscaling factors per level;
            each is applied relative to the previous level.
        timepoint: Timepoint index used in the dataset key.
        setup_id: Setup index used in the dataset key.
        attributes: Attributes added to the ``setup{setup_id}`` group.
        unit: Unit passed on to the metadata writer.
        resolution: Resolution passed on to the metadata writer.
    """
    n_threads = mp.cpu_count()
    chunks = (128, 128, 128)

    setup_key = f"setup{setup_id}"
    base_key = f"{setup_key}/timepoint{timepoint}"

    def _write_level(handle, level, volume):
        # Every pyramid level uses the same compression and chunking.
        dataset = handle.create_dataset(
            f"{base_key}/s{level}", shape=volume.shape, compression="gzip",
            chunks=chunks, dtype=volume.dtype,
        )
        dataset.n_threads = n_threads
        dataset[:] = volume

    with open_file(out_path, "a") as f:
        # Level 0 is the data at its original resolution.
        _write_level(f, 0, data)

        # TODO parallelized implementation.
        # Each further level is derived from the level written before it.
        volume = data
        level = 0
        for scale_factor in scale_factors:
            level += 1
            inverse = [1.0 / factor for factor in scale_factor]
            volume = rescale(volume, inverse, preserve_range=True).astype(volume.dtype)
            _write_level(f, level, volume)

        f[setup_key].attrs.update(attributes)

    # The metadata writer works on the container path, so call it once the file is closed.
    write_format_metadata(
        "ome.zarr", out_path, {"unit": unit, "resolution": resolution},
        scale_factors=scale_factors, prefix=base_key,
    )
def convert_to_ome_zarr_v3(name):
    """Convert the ome-zarr v2 container of a dataset to ome-zarr v3.

    Reads ``ROOT/ngff-v2/{name}.ome.zarr`` and writes
    ``ROOT/ngff-v3/{name}.ome.zarr``. For every ``setup*`` folder a group is
    created in the output and the input group's attributes are copied to it.
    Every ``timepoint*`` folder inside a setup is then converted with the
    ``ome2024-ngff-challenge resave`` command line tool.

    Args:
        name: Name of the dataset (without the ``.ome.zarr`` extension).

    Raises:
        subprocess.CalledProcessError: If the converter exits with a non-zero
            status. Previously the exit status was ignored, so a failed
            conversion silently left an incomplete output container.
    """
    input_path = os.path.join(ROOT, "ngff-v2", f"{name}.ome.zarr")

    output_root = os.path.join(ROOT, "ngff-v3")
    os.makedirs(output_root, exist_ok=True)
    output_path = os.path.join(output_root, f"{name}.ome.zarr")

    f_in = zarr.v2.open(store=input_path, mode="r")
    f_out = zarr.open_group(store=output_path, mode="a")

    setup_folders = sorted(glob(os.path.join(input_path, "setup*")))
    for sfolder in setup_folders:
        setup = os.path.basename(sfolder)
        f_out.create_group(name=setup)

        # Copy over the attributes for this set-up.
        attrs = dict(f_in[setup].attrs.items())
        f_out[setup].attrs.update(attrs)

        # Convert the timepoints of this set-up one by one.
        timepoint_folders = sorted(glob(os.path.join(sfolder, "timepoint*")))
        for tfolder in timepoint_folders:
            timepoint = os.path.basename(tfolder)
            print("Converting", setup, timepoint)
            out = os.path.join(output_path, setup, timepoint)
            cmd = [
                "ome2024-ngff-challenge", "resave", "--cc-by", tfolder, out,
                "--output-overwrite", "--output-shards=512,512,512"
            ]
            # Abort on converter failure instead of continuing with a partial result.
            run(cmd, check=True)
def _natural_key(text):
    """Return a sort key that orders numbers embedded in ``text`` numerically."""
    import re

    # re.split with a capture group alternates non-digit and digit parts,
    # always starting with a (possibly empty) non-digit part, so keys of
    # different strings compare str with str and int with int.
    return [int(part) if part.isdigit() else part for part in re.split(r"(\d+)", text)]


def get_series(path):
    """List the ``timepoint*`` entries one level below ``path``.

    Args:
        path: Root folder of the container.

    Returns:
        Paths relative to ``path`` (e.g. ``setup0/timepoint3``), ordered
        numerically so that ``timepoint2`` precedes ``timepoint10`` and
        ``setup2`` precedes ``setup10``. A plain lexicographic sort would
        mis-order the series as soon as there are ten or more entries.
    """
    # Without ``recursive=True`` the ``**`` matches exactly one path component.
    matches = glob(os.path.join(path, "**/timepoint*"))
    series = [os.path.relpath(match, path) for match in matches]
    return sorted(series, key=_natural_key)


def create_metadata(path):
    """Write the additional top-level metadata files of a container.

    Two files are written (existing ones are overwritten):
    ``{path}/zarr.json`` with the ``bioformats2raw.layout`` attribute, and
    ``{path}/OME/zarr.json`` with the list of series found by ``get_series``.

    Args:
        path: Root folder of the container.
    """
    # TOP LEVEL METADATA
    bf_to_raw = {
        "attributes": {
            "ome": {
                "version": "0.5",
                "bioformats2raw.layout": 3
            }
        },
        "zarr_format": 3,
        "node_type": "group",
    }
    meta_path = os.path.join(path, "zarr.json")

    # This can be safely over-written.
    with open(meta_path, "w", encoding="utf-8") as f:
        json.dump(bf_to_raw, f)

    # OME METADATA
    series = get_series(path)
    ome_metadata = {
        "attributes": {
            "ome": {
                "version": "0.5",
                "series": series
            }
        },
        "zarr_format": 3,
        "node_type": "group",
    }
    meta_folder = os.path.join(path, "OME")
    os.makedirs(meta_folder, exist_ok=True)
    meta_path = os.path.join(meta_folder, "zarr.json")
    with open(meta_path, "w", encoding="utf-8") as f:
        json.dump(ome_metadata, f)
def _download_folder_contents(drive, folder_id, destination_folder):
    """Recursively download the content of one Google Drive folder.

    Args:
        drive: The drive client used to list the folder.
        folder_id: Id of the Google Drive folder to download.
        destination_folder: Local folder that receives the files; created if missing.
    """
    os.makedirs(destination_folder, exist_ok=True)

    folder_query = f"'{folder_id}' in parents and trashed=false"
    file_list = drive.ListFile({'q': folder_query}).GetList()

    for entry in file_list:
        target = os.path.join(destination_folder, entry['title'])
        if entry['mimeType'] == 'application/vnd.google-apps.folder':
            # Sub-folders are addressed by their own id, not by a name in IDS.
            _download_folder_contents(drive, entry['id'], target)
        else:
            print(f"Downloading {entry['title']} to {destination_folder}")
            entry.GetContentFile(target)


def download_folder(drive, name):
    """Download a named dataset folder from Google Drive into ``ROOT/name``.

    The previous implementation recursed with ``download_folder(file['id'],
    folder_name)``, i.e. it passed a file id as the drive and a local path as
    the dataset name, so any dataset containing a sub-folder failed. The
    recursion now goes through ``_download_folder_contents``.

    Args:
        drive: The drive client used to list and fetch files.
        name: Name of the dataset; must be a key of ``IDS``.

    Raises:
        KeyError: If ``name`` is not a key of ``IDS``. The lookup happens
            before any directory is created.
    """
    folder_id = IDS[name]
    os.makedirs(ROOT, exist_ok=True)
    _download_folder_contents(drive, folder_id, os.path.join(ROOT, name))
+ cmd = [ + "mc-client", "mb", f"challenge/{bucket_name}/{name}/" + ] + run(cmd) + + # Run the copy. + cmd = [ + "mc-client", "cp", "--recursive", + f"{data_root}/", f"challenge/{bucket_name}/{name}/" + ] + run(cmd) + + +def main(): + # name = "Platynereis-H2B-TL.ome.zarr" + # name = "Zebrafish-H2B-short-timelapse.ome.zarr" + name = "Zebrafish-XSPIM-multiview.ome.zarr" + + upload_data(name) + + +if __name__ == "__main__": + main()