Skip to content

Commit caba84f

Browse files
Extend support for data download from CryoET portal
1 parent 2d7cf7f commit caba84f

File tree

2 files changed

+121
-11
lines changed

2 files changed

+121
-11
lines changed
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
import json
2+
import os
3+
4+
from synapse_net.file_utils import read_data_from_cryo_et_portal_run
5+
from tqdm import tqdm
6+
7+
8+
def download_tomogram_list(run_ids, output_root):
    """Download one mrc tomogram per id in run_ids into output_root.

    Each tomogram is fetched from the CryoET data portal and stored as
    ``<output_root>/<run_id>.mrc``. Ids for which no tomogram is found
    are reported but do not abort the download of the remaining ids.
    """
    print("Downloading", len(run_ids), "tomograms")
    os.makedirs(output_root, exist_ok=True)
    for this_id in tqdm(run_ids):
        out_path = os.path.join(output_root, f"{this_id}.mrc")
        data, _ = read_data_from_cryo_et_portal_run(
            this_id, use_zarr_format=False, output_path=out_path, id_field="id",
        )
        if data is None:
            print("Did not find a tomogram for", this_id)
18+
19+
20+
def download_tomograms_for_da():
    """Download the tomograms used for domain adaptation.

    The run ids are read from ./list_for_da.json.
    """
    output_root = "/scratch-grete/projects/nim00007/cryo-et/from_portal/for_domain_adaptation"
    with open("./list_for_da.json") as f:
        ids = json.load(f)
    download_tomogram_list(ids, output_root)
25+
26+
27+
def download_tomograms_for_eval():
    """Download the tomograms used for evaluation.

    The run ids are read from ./list_for_eval.json.
    """
    with open("./list_for_eval.json") as f:
        run_ids = json.load(f)
    # FIX: download_tomogram_list requires an output root; the original call
    # omitted it and would raise a TypeError. The path below mirrors the
    # domain-adaptation target directory.
    # NOTE(review): confirm this is the intended evaluation output location.
    output_root = "/scratch-grete/projects/nim00007/cryo-et/from_portal/for_evaluation"
    download_tomogram_list(run_ids, output_root)
31+
32+
33+
def main():
    """Script entry point: download the domain-adaptation tomograms."""
    # download_tomograms_for_eval()
    download_tomograms_for_da()


if __name__ == "__main__":
    main()

synapse_net/file_utils.py

Lines changed: 82 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,11 @@
1515
except ImportError:
1616
zarr = None
1717

18+
try:
19+
import s3fs
20+
except ImportError:
21+
s3fs = None
22+
1823

1924
def get_cache_dir() -> str:
2025
"""Get the cache directory of synapse net.
@@ -100,12 +105,13 @@ def read_mrc(path: str) -> Tuple[np.ndarray, Dict[str, float]]:
100105
return data, voxel_size
101106

102107

103-
def read_ome_zarr(uri: str, scale_level: int = 0, fs=None) -> Tuple[np.ndarray, Dict[str, float]]:
    """Read data and voxel size from an ome.zarr file.

    Args:
        uri: Path or url to the ome.zarr file.
        scale_level: The level of the multi-scale image pyramid to load.
        fs: S3 filesystem to use for initializing the store.

    Returns:
        The data read from the file.
        The voxel size read from the file.
    """
    if zarr is None:
        raise RuntimeError("The zarr library is required to read ome.zarr files.")

    def parse_s3_uri(uri):
        # FIX: str.lstrip strips *characters* from the given set, not a prefix,
        # so lstrip("s3://") would also eat leading 's', '3', ':' or '/' from
        # the bucket name. Remove the scheme prefix explicitly instead.
        return uri[len("s3://"):] if uri.startswith("s3://") else uri

    # FIX: match the full scheme prefix so local paths that merely start with
    # "s3" (e.g. "s3_data/") are not mistaken for remote URIs.
    if uri.startswith("s3://"):
        if s3fs is None:
            raise RuntimeError("The s3fs library is required to read ome.zarr files from S3.")
        if fs is None:
            fs = s3fs.S3FileSystem(anon=True)
        store = s3fs.S3Map(root=parse_s3_uri(uri), s3=fs, check=False)
    elif fs is not None:
        # An explicit filesystem was passed: treat the uri as a path on it.
        store = s3fs.S3Map(root=parse_s3_uri(uri), s3=fs, check=False)
    else:
        if not os.path.exists(uri):
            raise ValueError(f"Cannot find the filepath at {uri}.")
        store = uri

    with zarr.open(store, "r") as f:
        multiscales = f.attrs["multiscales"][0]

        # Read the axis and transformation metadata for this dataset, to determine the voxel size.
        axes = [axis["name"] for axis in multiscales["axes"]]
        # FIX: validate with an explicit exception instead of assert, which is
        # stripped when python runs with -O.
        if set(axes) != set("xyz"):
            raise ValueError(f"Expected the axes x, y, z, got: {axes}")
        transformations = multiscales["datasets"][scale_level]["coordinateTransformations"]
        scale_transformation = [trafo["scale"] for trafo in transformations if trafo["type"] == "scale"][0]

        # The voxel size is given in angstrom, we divide it by 10 to convert it to nanometer.
        voxel_size = {axis: scale / 10.0 for axis, scale in zip(axes, scale_transformation)}

        # Get the internal path for the given scale and load the data.
        internal_path = multiscales["datasets"][scale_level]["path"]
        data = f[internal_path][:]

    return data, voxel_size
128156

129157

130158
def read_data_from_cryo_et_portal_run(
    run_id: int,
    output_path: Optional[str] = None,
    use_zarr_format: bool = True,
    processing_type: str = "denoised",
    id_field: str = "run_id",
    scale_level: Optional[int] = None,
) -> Tuple[np.ndarray, Dict[str, float]]:
    """Read data and voxel size from a CryoET Data Portal run.

    Args:
        run_id: The ID of the experiment run.
        output_path: The path for saving the data. The data will be streamed if the path is not given.
        use_zarr_format: Whether to use the data in zarr format instead of mrc.
        processing_type: The processing type of the tomogram to download.
        id_field: The name of the id field.
        scale_level: The scale level to read from the data. Only valid for zarr data.

    Returns:
        The data read from the run.
        The voxel size read from the run.
    """
    # Re-use a previous download if it is already present at the output path.
    if output_path is not None and os.path.exists(output_path):
        if use_zarr_format:
            return read_ome_zarr(output_path, scale_level=0 if scale_level is None else scale_level)
        return read_mrc(output_path)

    if cdp is None:
        raise RuntimeError("The CryoET data portal library is required to download data from the portal.")
    if s3fs is None:
        # FIX: the original message ("requires s3fs download") was garbled.
        raise RuntimeError("The CryoET data portal download requires the s3fs library.")

    client = cdp.Client()

    # The portal data is public, so anonymous S3 access is sufficient.
    fs = s3fs.S3FileSystem(anon=True)
    tomograms = cdp.Tomogram.find(
        client, [getattr(cdp.Tomogram, id_field) == run_id, cdp.Tomogram.processing == processing_type]
    )
    if len(tomograms) == 0:
        return None, None
    if len(tomograms) > 1:
        raise NotImplementedError(
            f"Expected a single tomogram for {id_field}={run_id}, got {len(tomograms)}."
        )
    tomo = tomograms[0]

    if use_zarr_format:
        if output_path is None:
            # FIX: the requested scale level was previously computed but not
            # forwarded to read_ome_zarr, so scale level 0 was always read.
            scale_level = 0 if scale_level is None else scale_level
            data, voxel_size = read_ome_zarr(tomo.s3_omezarr_dir, scale_level=scale_level, fs=fs)
        else:
            # TODO: write the output to ome zarr, for all scale levels.
            raise NotImplementedError("Downloading the zarr data to a local path is not yet supported.")
    else:
        if scale_level is not None:
            raise ValueError("The scale_level argument is only supported for zarr data.")
        if output_path is None:
            raise RuntimeError("You have to pass an output_path to download the data as mrc file.")
        fs.get(tomo.s3_mrc_file, output_path)
        data, voxel_size = read_mrc(output_path)

    return data, voxel_size

0 commit comments

Comments
 (0)