Commit e707375

maxrjones, sharkinsspatial, and pre-commit-ci[bot] authored
Add a function to produce a ManifestStore from HDF5 files (#516)
* Add a function to produce a ManifestStore from HDF5 files
* Improve chunk index determination
* Allow specifying chunk key encoding
* Move obstore import
* Add docstring entry
* Fix typo
* Return empty ChunkManifests for empty HDF Datasets.
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  for more information, see https://pre-commit.ci
* Fix type checking block.

---------

Co-authored-by: sharkinsspatial <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent e5fff19 commit e707375

File tree

4 files changed: +208 −41 lines changed

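To orient the diff below: the new `HDFVirtualBackend._create_manifest_store` helper builds a ManifestStore directly from an HDF5/NetCDF4 file, which can then be opened through xarray's zarr engine. A minimal sketch of that flow, mirroring the round-trip test added in this commit; the file path is hypothetical, and `_create_manifest_store` is a private helper whose signature may change:

import xarray as xr
from obstore.store import LocalStore

from virtualizarr.readers.hdf import HDFVirtualBackend

# Hypothetical local NetCDF4/HDF5 file; any path readable by the store works.
filepath = "/tmp/example.nc"

# Build a ManifestStore that records chunk locations instead of loading data.
store = HDFVirtualBackend._create_manifest_store(
    filepath=filepath, store=LocalStore(), prefix="file://"
)

# The ManifestStore is readable through the zarr v3 engine.
ds = xr.open_dataset(store, engine="zarr", consolidated=False, zarr_format=3)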

virtualizarr/readers/hdf/hdf.py

Lines changed: 140 additions & 31 deletions

@@ -4,6 +4,7 @@
 from pathlib import Path
 from typing import (
     TYPE_CHECKING,
+    Any,
     Dict,
     Hashable,
     Iterable,
@@ -27,6 +28,8 @@
     ChunkEntry,
     ChunkManifest,
     ManifestArray,
+    ManifestGroup,
+    ManifestStore,
 )
 from virtualizarr.manifests.manifest import validate_and_normalize_path_to_uri
 from virtualizarr.manifests.utils import create_v3_array_metadata
@@ -41,6 +44,7 @@
 if TYPE_CHECKING:
     from h5py import Dataset as H5Dataset
     from h5py import Group as H5Group
+    from obstore.store import ObjectStore

 FillValueType = Union[
     int,
@@ -58,6 +62,111 @@


 class HDFVirtualBackend(VirtualBackend):
+    @staticmethod
+    def _construct_manifest_array(
+        path: str,
+        dataset: H5Dataset,
+        group: str,
+    ) -> ManifestArray:
+        """
+        Construct a ManifestArray from an h5py dataset
+        Parameters
+        ----------
+        path: str
+            The path of the hdf5 file.
+        dataset : h5py.Dataset
+            An h5py dataset.
+        group : str
+            Name of the group containing this h5py.Dataset.
+        Returns
+        -------
+        ManifestArray
+        """
+        chunks = dataset.chunks if dataset.chunks else dataset.shape
+        codecs = codecs_from_dataset(dataset)
+        attrs = HDFVirtualBackend._extract_attrs(dataset)
+        dtype = dataset.dtype
+
+        codec_configs = [
+            numcodec_config_to_configurable(codec.get_config()) for codec in codecs
+        ]
+
+        fill_value = dataset.fillvalue.item()
+        dims = tuple(HDFVirtualBackend._dataset_dims(dataset, group=group))
+        metadata = create_v3_array_metadata(
+            shape=dataset.shape,
+            data_type=dtype,
+            chunk_shape=chunks,
+            fill_value=fill_value,
+            codecs=codec_configs,
+            dimension_names=dims,
+            attributes=attrs,
+        )
+
+        manifest = HDFVirtualBackend._dataset_chunk_manifest(path, dataset)
+        return ManifestArray(metadata=metadata, chunkmanifest=manifest)
+
+    @staticmethod
+    def _construct_manifest_group(
+        store: ObjectStore,
+        filepath: str,
+        *,
+        group: str | None = None,
+        drop_variables: Optional[List[str]] = None,
+    ) -> ManifestGroup:
+        """
+        Construct a virtual Group from a HDF dataset.
+        """
+        from virtualizarr.utils import ObstoreReader
+
+        if drop_variables is None:
+            drop_variables = []
+
+        reader = ObstoreReader(store=store, path=filepath)
+        f = h5py.File(reader, mode="r")
+
+        if group is not None and group != "":
+            g = f[group]
+            group_name = group
+            if not isinstance(g, h5py.Group):
+                raise ValueError("The provided group is not an HDF group")
+        else:
+            g = f["/"]
+            group_name = "/"
+
+        manifest_dict = {}
+        non_coordinate_dimesion_vars = HDFVirtualBackend._find_non_coord_dimension_vars(
+            group=g
+        )
+        drop_variables = list(set(drop_variables + non_coordinate_dimesion_vars))
+        attrs: dict[str, Any] = {}
+        for key in g.keys():
+            if key not in drop_variables:
+                if isinstance(g[key], h5py.Dataset):
+                    variable = HDFVirtualBackend._construct_manifest_array(
+                        path=filepath,
+                        dataset=g[key],
+                        group=group_name,
+                    )
+                    if variable is not None:
+                        manifest_dict[key] = variable
+        return ManifestGroup(arrays=manifest_dict, attributes=attrs)
+
+    @staticmethod
+    def _create_manifest_store(
+        filepath: str,
+        *,
+        prefix: str,
+        store: ObjectStore,
+        group: str | None = None,
+    ) -> ManifestStore:
+        # Create a group containing dataset level metadata and all the manifest arrays
+        manifest_group = HDFVirtualBackend._construct_manifest_group(
+            store=store, filepath=filepath, group=group
+        )
+        # Convert to a manifest store
+        return ManifestStore(stores={prefix: store}, group=manifest_group)
+
     @staticmethod
     def open_virtual_dataset(
         filepath: str,
@@ -119,7 +228,7 @@ def open_virtual_dataset(
     def _dataset_chunk_manifest(
         path: str,
         dataset: H5Dataset,
-    ) -> Optional[ChunkManifest]:
+    ) -> ChunkManifest:
         """
         Generate ChunkManifest for HDF5 dataset.

@@ -138,7 +247,7 @@ def _dataset_chunk_manifest(
         dsid = dataset.id
         if dataset.chunks is None:
             if dsid.get_offset() is None:
-                return None
+                chunk_manifest = ChunkManifest(entries={}, shape=dataset.shape)
             else:
                 key_list = [0] * (len(dataset.shape) or 1)
                 key = ".".join(map(str, key_list))
@@ -149,42 +258,42 @@ def _dataset_chunk_manifest(
                 chunk_key = ChunkKey(key)
                 chunk_entries = {chunk_key: chunk_entry}
                 chunk_manifest = ChunkManifest(entries=chunk_entries)
-                return chunk_manifest
         else:
             num_chunks = dsid.get_num_chunks()
             if num_chunks == 0:
-                raise ValueError("The dataset is chunked but contains no chunks")
-            shape = tuple(
-                math.ceil(a / b) for a, b in zip(dataset.shape, dataset.chunks)
-            )
-            paths = np.empty(shape, dtype=np.dtypes.StringDType)  # type: ignore
-            offsets = np.empty(shape, dtype=np.uint64)
-            lengths = np.empty(shape, dtype=np.uint64)
-
-            def get_key(blob):
-                return tuple(
-                    [a // b for a, b in zip(blob.chunk_offset, dataset.chunks)]
+                chunk_manifest = ChunkManifest(entries={}, shape=dataset.shape)
+            else:
+                shape = tuple(
+                    math.ceil(a / b) for a, b in zip(dataset.shape, dataset.chunks)
                 )
+                paths = np.empty(shape, dtype=np.dtypes.StringDType)  # type: ignore
+                offsets = np.empty(shape, dtype=np.uint64)
+                lengths = np.empty(shape, dtype=np.uint64)
+
+                def get_key(blob):
+                    return tuple(
+                        [a // b for a, b in zip(blob.chunk_offset, dataset.chunks)]
+                    )

-            def add_chunk_info(blob):
-                key = get_key(blob)
-                paths[key] = path
-                offsets[key] = blob.byte_offset
-                lengths[key] = blob.size
+                def add_chunk_info(blob):
+                    key = get_key(blob)
+                    paths[key] = path
+                    offsets[key] = blob.byte_offset
+                    lengths[key] = blob.size

-            has_chunk_iter = callable(getattr(dsid, "chunk_iter", None))
-            if has_chunk_iter:
-                dsid.chunk_iter(add_chunk_info)
-            else:
-                for index in range(num_chunks):
-                    add_chunk_info(dsid.get_chunk_info(index))
+                has_chunk_iter = callable(getattr(dsid, "chunk_iter", None))
+                if has_chunk_iter:
+                    dsid.chunk_iter(add_chunk_info)
+                else:
+                    for index in range(num_chunks):
+                        add_chunk_info(dsid.get_chunk_info(index))

-            chunk_manifest = ChunkManifest.from_arrays(
-                paths=paths,  # type: ignore
-                offsets=offsets,
-                lengths=lengths,
-            )
-            return chunk_manifest
+                chunk_manifest = ChunkManifest.from_arrays(
+                    paths=paths,  # type: ignore
+                    offsets=offsets,
+                    lengths=lengths,
+                )
+        return chunk_manifest

     @staticmethod
     def _dataset_dims(dataset: H5Dataset, group: str = "") -> List[str]:
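A note on the chunk-index determination reworked above: h5py reports each stored chunk's element offset within the dataset, the grid index is recovered by integer-dividing that offset by the chunk shape, and the grid's own shape is the ceiling of dataset shape over chunk shape. A standalone sketch with made-up numbers, not tied to any particular file:

import math

# Hypothetical dataset and chunk shapes, purely for illustration.
dataset_shape = (100, 100)
chunk_shape = (40, 25)

# Shape of the chunk grid, as computed in _dataset_chunk_manifest above.
grid_shape = tuple(math.ceil(a / b) for a, b in zip(dataset_shape, chunk_shape))
assert grid_shape == (3, 4)

# A chunk whose first element sits at offset (80, 50) lands at grid index (2, 2),
# mirroring get_key(): offset // chunk_shape along each axis.
chunk_offset = (80, 50)
grid_index = tuple(a // b for a, b in zip(chunk_offset, chunk_shape))
assert grid_index == (2, 2)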

virtualizarr/tests/test_readers/test_hdf/test_hdf.py

Lines changed: 8 additions & 9 deletions

@@ -16,19 +16,18 @@ class TestDatasetChunkManifest:
     def test_empty_chunks(self, empty_chunks_hdf5_file):
         f = h5py.File(empty_chunks_hdf5_file)
         ds = f["data"]
-        with pytest.raises(ValueError, match="chunked but contains no chunks"):
-            HDFVirtualBackend._dataset_chunk_manifest(
-                path=empty_chunks_hdf5_file, dataset=ds
-            )
+        manifest = HDFVirtualBackend._dataset_chunk_manifest(
+            path=empty_chunks_hdf5_file, dataset=ds
+        )
+        assert manifest.shape_chunk_grid == (0,)

-    @pytest.mark.skip("Need to differentiate non coordinate dimensions from empty")
     def test_empty_dataset(self, empty_dataset_hdf5_file):
         f = h5py.File(empty_dataset_hdf5_file)
         ds = f["data"]
-        with pytest.raises(ValueError, match="no space allocated in the file"):
-            HDFVirtualBackend._dataset_chunk_manifest(
-                path=empty_dataset_hdf5_file, dataset=ds
-            )
+        manifest = HDFVirtualBackend._dataset_chunk_manifest(
+            path=empty_dataset_hdf5_file, dataset=ds
+        )
+        assert manifest.shape_chunk_grid == (0,)

     def test_no_chunking(self, no_chunks_hdf5_file):
         f = h5py.File(no_chunks_hdf5_file)

Lines changed: 40 additions & 0 deletions

@@ -0,0 +1,40 @@
+import numpy as np
+import pytest
+import xarray as xr
+
+from virtualizarr.readers.hdf import HDFVirtualBackend
+from virtualizarr.tests import (
+    requires_hdf5plugin,
+    requires_obstore,
+)
+
+
+@pytest.fixture(name="basic_ds")
+def basic_ds():
+    x = np.arange(100)
+    y = np.arange(100)
+    temperature = 0.1 * x[:, None] + 0.1 * y[None, :]
+    ds = xr.Dataset(
+        {"temperature": (["x", "y"], temperature)},
+        coords={"x": np.arange(100), "y": np.arange(100)},
+    )
+    return ds
+
+
+@requires_hdf5plugin
+@requires_obstore
+class TestHDFManifestStore:
+    def test_rountrip_simple_virtualdataset(self, tmpdir, basic_ds):
+        from obstore.store import LocalStore
+
+        "Roundtrip a dataset to/from NetCDF with the HDF reader and ManifestStore"
+
+        filepath = f"{tmpdir}/basic_ds_roundtrip.nc"
+        basic_ds.to_netcdf(filepath, engine="h5netcdf")
+        store = HDFVirtualBackend._create_manifest_store(
+            filepath=filepath, store=LocalStore(), prefix="file://"
+        )
+        rountripped_ds = xr.open_dataset(
+            store, engine="zarr", consolidated=False, zarr_format=3
+        )
+        xr.testing.assert_allclose(basic_ds, rountripped_ds)

virtualizarr/utils.py

Lines changed: 20 additions & 1 deletion

@@ -2,6 +2,7 @@

 import importlib
 import io
+from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Any, Iterable, Optional, Union

 from zarr.abc.codec import ArrayArrayCodec, BytesBytesCodec
@@ -12,14 +13,32 @@
 if TYPE_CHECKING:
     import fsspec.core
     import fsspec.spec
+    from obstore import ReadableFile
+    from obstore.store import ObjectStore

 # See pangeo_forge_recipes.storage
 OpenFileType = Union[
     fsspec.core.OpenFile, fsspec.spec.AbstractBufferedFile, io.IOBase
 ]


-from dataclasses import dataclass, field
+class ObstoreReader:
+    _reader: ReadableFile
+
+    def __init__(self, store: ObjectStore, path: str) -> None:
+        import obstore as obs
+
+        self._reader = obs.open_reader(store, path)
+
+    def read(self, size: int, /) -> bytes:
+        return self._reader.read(size).to_bytes()
+
+    def seek(self, offset: int, whence: int = 0, /):
+        # TODO: Check on default for whence
+        return self._reader.seek(offset, whence)
+
+    def tell(self) -> int:
+        return self._reader.tell()


 @dataclass
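The `ObstoreReader` above only needs to expose `read`, `seek`, and `tell` for h5py to treat it as a file-like object, which is how `_construct_manifest_group` parses HDF5 metadata straight from an ObjectStore. A rough sketch of that wiring, assuming a hypothetical local file; other ObjectStore implementations should behave the same way:

import h5py
from obstore.store import LocalStore

from virtualizarr.utils import ObstoreReader

# Hypothetical local file; with LocalStore() the path is an absolute filesystem
# path, mirroring how the round-trip test above calls the reader.
store = LocalStore()
reader = ObstoreReader(store=store, path="/tmp/example.nc")

# h5py accepts any object with read/seek/tell, so the file's structure can be
# inspected without copying it out of the object store.
f = h5py.File(reader, mode="r")
print(list(f.keys()))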
