Add ZebraHub data loading with tracking annotations

anwai98 · anwai98 · commit 24278a7eed5e · 2025-07-18T15:47:27.000+02:00
diff --git a/.gitignore b/.gitignore
@@ -4,3 +4,4 @@ converted/
 *.egg-info/
 checkpoints/
 logs/
+*.csv
diff --git a/development/prepare_czi_zebrafish_data.py b/development/prepare_czi_zebrafish_data.py
@@ -1,9 +1,62 @@
+import os
+import subprocess
+from typing import Tuple
+
+import pandas as pd
+
 from ome_zarr.io import parse_url
 from ome_zarr.reader import Reader
 
 import dask.array as da
 
 
+def _get_dasky_data(url):
+    """Open the OME-Zarr store at `url` and return the `data` of its first node."""
+    zarr_reader = Reader(parse_url(url))
+
+    # The reader can yield several nodes; the first one is expected to hold the image pixels.
+    first_node = list(zarr_reader())[0]
+
+    return first_node.data  # The dask-backed data of that node.
+
+
+def get_zebrahub_data(timepoint: int = 99, view: bool = False) -> Tuple[da.Array, pd.DataFrame]:
+    """Gets the ZebraHub data from https://doi.org/10.1016/j.cell.2024.09.047.
+
+    Loads one timepoint of the ZSNS001 single-objective sample at the lowest resolution,
+    together with the tracking annotations that belong to the same timepoint.
+    The tracking file is downloaded with `wget` into the current working directory
+    on first use and is reused from there afterwards.
+
+    Args:
+        timepoint: The index of the time frame to load.
+            It is also matched against the "t" column of the tracking table.
+        view: Whether to display the selected image data in napari.
+
+    Returns:
+        The image data of the chosen timepoint.
+        The rows of the tracking table whose "t" value equals `timepoint`.
+
+    Raises:
+        subprocess.CalledProcessError: If downloading the tracking annotations with `wget` fails.
+    """
+    # NOTE: There's more single objective samples for zebrafish available with tracking annotations
+    # https://public.czbiohub.org/royerlab/zebrahub/imaging/single-objective/
+    url = "https://public.czbiohub.org/royerlab/zebrahub/imaging/single-objective/ZSNS001.ome.zarr"
+
+    # Let's get the image data.
+    dask_data = _get_dasky_data(url)
+
+    # Get the lowest resolution (expected to be the last entry of the multiscale data).
+    curr_data = dask_data[-1]
+
+    # Select the requested timepoint and index 0 of the second axis.
+    # NOTE: This assumes the leading axes are ordered as (time, channel) - to be confirmed.
+    curr_data = curr_data[timepoint, 0]
+
+    # We have tracking annotations here. Let's check them out.
+    tracks_fpath = "ZSNS001_tracks.csv"
+    if not os.path.exists(tracks_fpath):
+        # Download under a temporary name and rename only on success,
+        # so that an interrupted download is never mistaken for the complete file.
+        tmp_fpath = tracks_fpath + ".part"
+        subprocess.run(
+            [
+                "wget", "-O", tmp_fpath,
+                "https://public.czbiohub.org/royerlab/zebrahub/imaging/single-objective/ZSNS001_tracks.csv",
+            ],
+            check=True,  # Fail right here rather than with an obscure parsing error below.
+        )
+        os.replace(tmp_fpath, tracks_fpath)
+
+    # Load the tracking annotation file.
+    tracks = pd.read_csv(tracks_fpath)  # I think this is on original resolution (?)
+
+    # HACK: Filtering ids based on one time-frame (the most plausible setup we might be opting for)
+    curr_tracks = tracks.loc[tracks["t"] == timepoint]
+
+    if view:
+        import napari
+        napari.view_image(curr_data)
+        napari.run()
+
+    return curr_data, curr_tracks
+
+
 def get_czi_zebrafish_data(
     neuromast: bool = True, view: bool = False
 ) -> da.Array:
@@ -26,12 +79,8 @@ def get_czi_zebrafish_data(
         # NOTE: This data does not have tracking annotations!
         url = "https://public.czbiohub.org/royerlab/ultrack/zebrafish_embryo.ome.zarr"
 
-    # First, let's get the image data
-    reader = Reader(parse_url(url))  # Prepare a reader.
-    nodes = list(reader())  # Might include multiple stuff
-    image_node = nodes[0]  # First node is expecte to be image pixel data.
-
-    dask_data = image_node.data  # Get the daskified data.
+    # Let's get the image data
+    dask_data = _get_dasky_data(url)
 
     # HACK: Try it for one dask array with lowest resolution (there exists four resolutions in this data).
     # TODO: Control res below, the highest res starts at the first index, lowest at the last index.
@@ -53,7 +102,8 @@ def get_czi_zebrafish_data(
 
 
 def main():
-    image = get_czi_zebrafish_data(neuromast=True, view=False)
+    # image = get_czi_zebrafish_data(neuromast=True, view=False)
+    image, tracks = get_zebrahub_data(view=False)
     print(image.shape)