From 439accd6e0aa0098ef3af7122792b61d3e30d08e Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Tue, 11 Nov 2025 12:44:01 -0500 Subject: [PATCH 1/7] Return None for Zarr V2/consolidated metadata requests --- virtualizarr/manifests/store.py | 6 ++++++ .../test_hdf/test_hdf_manifest_store.py | 16 ++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/virtualizarr/manifests/store.py b/virtualizarr/manifests/store.py index 962c122e..2c301447 100644 --- a/virtualizarr/manifests/store.py +++ b/virtualizarr/manifests/store.py @@ -171,6 +171,12 @@ async def get( return self._group.arrays[var].metadata.to_buffer_dict( prototype=default_buffer_prototype() )["zarr.json"] + elif key.endswith((".zattrs", ".zgroup", ".zarray", ".zmetadata")): + # Zarr-Python expects store classes to return None when metadata JSONs are not found. + # Zarr-Python uses this behavior to distinguish between V2/V3 and consolidated/unconsolidated stores. + # This upstream behavior will hopefully change in the future to be more Zarr-hierarchy aware, in + # which case this may need refactoring. + return None var = key.split("/")[0] marr = self._group.arrays[var] manifest = marr.manifest diff --git a/virtualizarr/tests/test_parsers/test_hdf/test_hdf_manifest_store.py b/virtualizarr/tests/test_parsers/test_hdf/test_hdf_manifest_store.py index 3194e8db..32046f0e 100644 --- a/virtualizarr/tests/test_parsers/test_hdf/test_hdf_manifest_store.py +++ b/virtualizarr/tests/test_parsers/test_hdf/test_hdf_manifest_store.py @@ -42,6 +42,22 @@ def test_roundtrip_simple_virtualdataset(self, tmpdir, basic_ds): ) as rountripped_ds: xr.testing.assert_allclose(basic_ds, rountripped_ds) + def test_roundtrip_simple_virtualdataset_guess_zarr_format(self, tmpdir, basic_ds): + """ + Roundtrip a dataset to/from NetCDF with the HDF reader and ManifestStore, relying + on xarray/zarr to guess the zarr format and unconsolidated store metadata. 
+ """ + + filepath = f"{tmpdir}/basic_ds_roundtrip.nc" + url = f"file://{filepath}" + basic_ds.to_netcdf(filepath, engine="h5netcdf") + manifest_store = manifest_store_from_hdf_url(url) + with xr.open_dataset( + manifest_store, + engine="zarr", + ) as rountripped_ds: + xr.testing.assert_allclose(basic_ds, rountripped_ds) + def test_rountrip_partial_chunk_virtualdataset(self, tmpdir, basic_ds): "Roundtrip a dataset to/from NetCDF with the HDF reader and ManifestStore with a single partial chunk" From 24e29f38d518bd5059de1102e55f47e97ead9cf9 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Tue, 11 Nov 2025 14:11:46 -0500 Subject: [PATCH 2/7] Squashed commit of the following: commit 113c71285534f8c39c56ca40e117bd946b9e9c51 Author: Ilan Gold Date: Tue Nov 11 17:25:49 2025 +0100 Update virtualizarr/tests/test_parsers/test_hdf/test_hdf.py Co-authored-by: Chuck Daniels commit 56936491db174faeae97e97635c8522218e7f172 Author: Ilan Gold Date: Tue Nov 11 17:25:10 2025 +0100 refactor: `get_deepest_group_or_array` type commit 849d5677462261f9e446cec870619f72aa5f7a15 Merge: 916c3e1 7a13261 Author: Ilan Gold Date: Fri Nov 7 14:18:51 2025 +0100 Merge branch 'main' into ig/nested_h5_group commit 916c3e17f05fd0d90520784ba79260ac61279264 Merge: d8dc369 a2b65c1 Author: Ilan Gold Date: Wed Sep 10 13:11:01 2025 +0200 Merge branch 'main' into ig/nested_h5_group commit d8dc36973e66c1c75eb713d03a0028119b0dda84 Author: Ilan Gold Date: Sun Sep 7 19:53:08 2025 +0200 fix: update release note commit a3a26153da2644acf7c2e6396f05f6d2e7b98944 Author: ilan-gold Date: Sun Sep 7 16:45:22 2025 +0200 chore: clean up error handling for non-metadata group requests + tests commit fc0e99952ca3f2aadb399d39b127fdf0ff3967a4 Merge: 3036de8 1facb10 Author: ilan-gold Date: Sun Sep 7 16:29:59 2025 +0200 Merge branch 'ig/nested_h5_group' of https://github.com/ilan-gold/VirtualiZarr into ig/nested_h5_group commit 3036de8911bbdf445fb4aa927a2034dec6aa28a4 Author: ilan-gold 
Date: Sun Sep 7 16:29:44 2025 +0200 fix: remove subgroup check commit 1facb107127874ca5aac8e274e027ab7abc5b783 Author: Ilan Gold Date: Sun Sep 7 16:17:43 2025 +0200 fix: wrong place! commit 33e517e84c2dce700c2f4f6e147d4f060f9447ac Merge: afcaaed c1db925 Author: Ilan Gold Date: Sun Sep 7 16:17:21 2025 +0200 Merge branch 'main' into ig/nested_h5_group commit afcaaedb7d0bb2400f3836dda029d10a2853dfbd Author: ilan-gold Date: Sun Sep 7 16:15:26 2025 +0200 chore: consolidate tests commit 253d184ddb0c8c15f1376c392f18d339981b3b13 Author: ilan-gold Date: Sun Sep 7 15:57:18 2025 +0200 fix: disallow nested xarray handling commit 57551b893eb08b342507d172891221c2bcbcc26b Author: ilan-gold Date: Fri Aug 29 08:52:28 2025 +0200 chore: relnote commit 6fe1dbd0773aa860f2c3cc551197c0abed915c76 Author: ilan-gold Date: Fri Aug 29 08:45:57 2025 +0200 fix: typing commit 85f2930d11f02d8eabd8453b0e440571ac8a6cef Author: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu Aug 28 16:27:24 2025 +0000 [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci commit e92ba8cf39be7faed44a29a542891fba595bb829 Author: ilan-gold Date: Thu Aug 28 18:19:46 2025 +0200 fix: simplify node logic commit 323be331d98d8aae53ddbab33c4113c60f9dcd83 Author: ilan-gold Date: Thu Aug 28 18:02:04 2025 +0200 feat: nested groups --- docs/releases.md | 4 ++ virtualizarr/manifests/group.py | 8 +--- virtualizarr/manifests/store.py | 40 +++++++++++-------- virtualizarr/parsers/hdf/hdf.py | 17 ++++++-- .../tests/test_manifests/test_store.py | 24 +++++++++-- virtualizarr/tests/test_parsers/conftest.py | 3 +- .../tests/test_parsers/test_hdf/test_hdf.py | 20 +++++++++- virtualizarr/xarray.py | 1 + 8 files changed, 85 insertions(+), 32 deletions(-) diff --git a/docs/releases.md b/docs/releases.md index 092622da..55f98b06 100644 --- a/docs/releases.md +++ b/docs/releases.md @@ -4,6 +4,10 @@ ### New Features +- Allow nested-groups inside `ManifestStore` and 
`ManifestGroup` objects and update `HDFParser` to be able to create nested `zarr.Group` objects. + ([#790](https://github.com/zarr-developers/VirtualiZarr/pull/790)). + By [Ilan Gold](https://github.com/ilan-gold) + ### Breaking changes ### Bug fixes diff --git a/virtualizarr/manifests/group.py b/virtualizarr/manifests/group.py index c3caf685..7f637fcf 100644 --- a/virtualizarr/manifests/group.py +++ b/virtualizarr/manifests/group.py @@ -40,13 +40,7 @@ def __init__( self._metadata = GroupMetadata(attributes=attributes) _arrays: Mapping[str, ManifestArray] = {} if arrays is None else arrays - - if groups: - # TODO add support for nested groups - raise NotImplementedError - else: - _groups: Mapping[str, ManifestGroup] = {} if groups is None else groups - + _groups: Mapping[str, ManifestGroup] = {} if groups is None else groups for name, arr in _arrays.items(): if not isinstance(arr, ManifestArray): raise TypeError( diff --git a/virtualizarr/manifests/store.py b/virtualizarr/manifests/store.py index 2c301447..de1c93fb 100644 --- a/virtualizarr/manifests/store.py +++ b/virtualizarr/manifests/store.py @@ -16,6 +16,7 @@ from zarr.core.buffer import Buffer, BufferPrototype, default_buffer_prototype from zarr.core.common import BytesLike +from virtualizarr.manifests.array import ManifestArray from virtualizarr.manifests.group import ManifestGroup from virtualizarr.manifests.utils import construct_chunk_pattern from virtualizarr.registry import ObjectStoreRegistry @@ -97,6 +98,16 @@ def parse_manifest_index( return tuple(int(ind) for ind in chunk_component.split(chunk_key_encoding)) +def get_deepest_group_or_array( + node: ManifestGroup, key: Iterable[str] +) -> ManifestGroup | ManifestArray: + for var in key: + if var in node.arrays: + return node.arrays[var] + node = node.groups[var] + return node + + class ManifestStore(Store): """ A read-only Zarr store that uses obstore to read data from inside arbitrary files on AWS, GCP, Azure, or a local filesystem. 
@@ -158,31 +169,26 @@ async def get( byte_range: ByteRequest | None = None, ) -> Buffer | None: # docstring inherited - - if key == "zarr.json": - # Return group metadata - return self._group.metadata.to_buffer_dict( - prototype=default_buffer_prototype() - )["zarr.json"] - elif key.endswith("zarr.json"): - # Return array metadata - # TODO: Handle nested groups - var, _ = key.split("/") - return self._group.arrays[var].metadata.to_buffer_dict( - prototype=default_buffer_prototype() - )["zarr.json"] + node = get_deepest_group_or_array(self._group, key.split("/")[:-1]) + if key.endswith("zarr.json"): + # Return metadata + return node.metadata.to_buffer_dict(prototype=default_buffer_prototype())[ + "zarr.json" + ] elif key.endswith((".zattrs", ".zgroup", ".zarray", ".zmetadata")): # Zarr-Python expects store classes to return None when metadata JSONs are not found. # Zarr-Python uses this behavior to distinguish between V2/V3 and consolidated/unconsolidated stores. # This upstream behavior will hopefully change in the future to be more Zarr-hierarchy aware, in # which case this may need refactoring. return None - var = key.split("/")[0] - marr = self._group.arrays[var] - manifest = marr.manifest + if isinstance(node, ManifestGroup): + raise ValueError( + "Key requested is a group but the key does not end in `zarr.json`" + ) + manifest = node.manifest separator: Literal[".", "/"] = getattr( - marr.metadata.chunk_key_encoding, "separator", "." + node.metadata.chunk_key_encoding, "separator", "." ) chunk_indexes = parse_manifest_index(key, separator) diff --git a/virtualizarr/parsers/hdf/hdf.py b/virtualizarr/parsers/hdf/hdf.py index c3968de6..1943bf58 100644 --- a/virtualizarr/parsers/hdf/hdf.py +++ b/virtualizarr/parsers/hdf/hdf.py @@ -1,6 +1,7 @@ from __future__ import annotations import math +from pathlib import Path from typing import ( TYPE_CHECKING, Iterable, @@ -98,7 +99,6 @@ def _construct_manifest_group( """ Construct a virtual Group from a HDF dataset. 
""" - import h5py with h5py.File(reader, mode="r") as f: @@ -116,11 +116,22 @@ def _construct_manifest_group( arrays = { key: _construct_manifest_array(filepath, dataset, group_name) for key in g.keys() - if key not in drop_variables and isinstance(dataset := g[key], h5py.Dataset) + if key not in drop_variables + if isinstance(dataset := g[key], h5py.Dataset) + } + groups = { + key: _construct_manifest_group( + filepath, + reader, + group=str(Path(group) / key) if group is not None else key, + ) + for key in g.keys() + if key not in drop_variables + if isinstance(g[key], h5py.Group) } attributes = _extract_attrs(g) - return ManifestGroup(arrays=arrays, attributes=attributes) + return ManifestGroup(arrays=arrays, groups=groups, attributes=attributes) class HDFParser: diff --git a/virtualizarr/tests/test_manifests/test_store.py b/virtualizarr/tests/test_manifests/test_store.py index 033dc528..2e571725 100644 --- a/virtualizarr/tests/test_manifests/test_store.py +++ b/virtualizarr/tests/test_manifests/test_store.py @@ -145,6 +145,7 @@ def _generate_manifest_store( "bar": manifest_array, "scalar": scalar_manifest_array, }, + groups={"subgroup": ManifestGroup(arrays={"foo": manifest_array})}, attributes={"Zarr": "Hooray!"}, ) registry = ObjectStoreRegistry({prefix: store}) @@ -205,7 +206,10 @@ def empty_memory_store(): fill_value=0, ) manifest_array = ManifestArray(metadata=array_metadata, chunkmanifest=manifest) - manifest_group = ManifestGroup(arrays={"foo": manifest_array}) + sub_group = ManifestGroup(arrays={"foo": manifest_array}) + manifest_group = ManifestGroup( + arrays={"foo": manifest_array}, groups={"subgroup": sub_group} + ) registry = ObjectStoreRegistry({"memory://": store}) return ManifestStore(registry=registry, group=manifest_group) @@ -229,6 +233,16 @@ async def test_get_empty_chunk(self, manifest_store, request): observed = await store.get("foo/c.0.0", prototype=default_buffer_prototype()) assert observed is None + @pytest.mark.asyncio + 
@pytest.mark.parametrize( + "manifest_store", + ["empty_memory_store"], + ) + async def test_get_group_key_fails(self, manifest_store, request): + store = request.getfixturevalue(manifest_store) + with pytest.raises(ValueError, match=r"Key requested is a group"): + await store.get("subgroup", prototype=default_buffer_prototype()) + @pytest.mark.asyncio @pytest.mark.parametrize( "manifest_store", @@ -269,10 +283,14 @@ async def test_get_data(self, manifest_store, request): "manifest_store", ["local_store", pytest.param("s3_store", marks=requires_minio)], ) - async def test_get_metadata(self, manifest_store, request): + @pytest.mark.parametrize( + "subgroup", + ["", "subgroup/"], + ) + async def test_get_metadata(self, manifest_store, request, subgroup): store = request.getfixturevalue(manifest_store) observed = await store.get( - "foo/zarr.json", prototype=default_buffer_prototype() + f"{subgroup}foo/zarr.json", prototype=default_buffer_prototype() ) metadata = json.loads(observed.to_bytes()) assert metadata["chunk_grid"]["configuration"]["chunk_shape"] == [2, 2] diff --git a/virtualizarr/tests/test_parsers/conftest.py b/virtualizarr/tests/test_parsers/conftest.py index 02db235a..83e44e8e 100644 --- a/virtualizarr/tests/test_parsers/conftest.py +++ b/virtualizarr/tests/test_parsers/conftest.py @@ -151,7 +151,8 @@ def nested_group_hdf5_url(tmp_path: Path) -> str: g = f.create_group("group") data = np.random.random((10, 10)) g.create_dataset("data", data=data) - g.create_group("nested_group") + g_nested = g.create_group("nested_group") + g_nested.create_dataset("data", data=data) return f"file://{filepath}" diff --git a/virtualizarr/tests/test_parsers/test_hdf/test_hdf.py b/virtualizarr/tests/test_parsers/test_hdf/test_hdf.py index 114ed718..f0e04594 100644 --- a/virtualizarr/tests/test_parsers/test_hdf/test_hdf.py +++ b/virtualizarr/tests/test_parsers/test_hdf/test_hdf.py @@ -2,6 +2,7 @@ import numpy as np import pytest import xarray as xr +import zarr from 
obstore.store import from_url from virtualizarr import open_virtual_dataset @@ -129,12 +130,29 @@ def test_variable_with_dimensions(self, chunked_dimensions_netcdf4_url): manifest_store = manifest_store_from_hdf_url(chunked_dimensions_netcdf4_url) assert len(manifest_store._group.arrays) == 3 - def test_nested_groups_are_ignored(self, nested_group_hdf5_url): + def test_nested_groups_are_ignored_when_group_is_specificed( + self, nested_group_hdf5_url + ): manifest_store = manifest_store_from_hdf_url( nested_group_hdf5_url, group="group" ) assert len(manifest_store._group.arrays) == 1 + def test_nested_groups_are_detected(self, nested_group_hdf5_url): + manifest_store = manifest_store_from_hdf_url(nested_group_hdf5_url) + assert len(manifest_store._group["group"]["nested_group"].arrays) == 1 + + def test_nested_data(self, nested_group_hdf5_url): + manifest_store = manifest_store_from_hdf_url(nested_group_hdf5_url) + z = zarr.open_group(manifest_store, mode="r", zarr_format=3) + + with h5py.File(nested_group_hdf5_url.removeprefix("file://"), mode="r") as f: + np.testing.assert_array_equal(f["group"]["data"], z["group"]["data"][...]) + np.testing.assert_array_equal( + f["group"]["nested_group"]["data"][...], + z["group"]["nested_group"]["data"][...], + ) + def test_drop_variables(self, multiple_datasets_hdf5_url, local_registry): parser = HDFParser(drop_variables=["data2"]) manifest_store = parser(url=multiple_datasets_hdf5_url, registry=local_registry) diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index 3634c524..bcd63b20 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -333,6 +333,7 @@ def construct_virtual_dataset( """ + # TODO: Remove private API `._group` if group: raise NotImplementedError("ManifestStore does not yet support nested groups") else: From 1e1db55fbca2dee6d9262e47c7cc8ca628fc07cc Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Thu, 13 Nov 2025 18:49:11 -0500 Subject: [PATCH 
3/7] Squashed commit of the following: commit 128138b45212af069f4e0c4bd1d7766c863ed20b Author: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Thu Nov 13 18:39:24 2025 -0500 Apply suggestion from code review commit 9c0e8032cc55affcd57b764270db98fbbde25de0 Author: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Thu Nov 13 18:25:39 2025 -0500 Apply suggestions from code review Co-authored-by: Chuck Daniels commit d1595b4d05dd5a671b69a247da35d7806f00ade3 Author: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Thu Nov 13 18:25:10 2025 -0500 Update virtualizarr/manifests/store.py Co-authored-by: Chuck Daniels commit 15739922e5f95235030cd2536d5a8027e8358a3b Author: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Thu Nov 13 18:17:32 2025 -0500 Add test for listing array in subgroup commit dcf80bbcb0135ca7ab955554d6173f4405fbbcbd Author: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Thu Nov 13 15:25:10 2025 -0500 Fix typing commit 2448c5df9bfe7d62290d1bc796965f314a29d2cc Author: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Thu Nov 13 15:08:33 2025 -0500 Improve ManifestStore.list_dir for arrays and nested groups --- virtualizarr/manifests/store.py | 79 +++++++++++++++---- .../tests/test_manifests/test_store.py | 10 ++- 2 files changed, 72 insertions(+), 17 deletions(-) diff --git a/virtualizarr/manifests/store.py b/virtualizarr/manifests/store.py index 895af001..874a253e 100644 --- a/virtualizarr/manifests/store.py +++ b/virtualizarr/manifests/store.py @@ -80,7 +80,7 @@ def parse_manifest_index( """ # Keys ending in `/c` are scalar arrays. The paths, offsets, and lengths in a chunk manifest # of a scalar array should also be scalar arrays that can be indexed with an empty tuple. 
- if key.endswith("/c"): + if key.endswith("/c") or key == "c": return () pattern = construct_chunk_pattern(chunk_key_encoding) @@ -98,14 +98,33 @@ def parse_manifest_index( return tuple(int(ind) for ind in chunk_component.split(chunk_key_encoding)) -def get_deepest_group_or_array( - node: ManifestGroup, key: Iterable[str] -) -> ManifestGroup | ManifestArray: - for var in key: - if var in node.arrays: - return node.arrays[var] - node = node.groups[var] - return node +def _get_deepest_group_or_array( + node: ManifestGroup, key: str +) -> tuple[ManifestGroup | ManifestArray, str]: + """ + Traverse the manifest hierarchy as deeply as possible following the given key path. + + Traversal stops when: + - A key part doesn't match any array or group in the current node + - A ManifestArray is reached (arrays cannot be traversed further) + - All key parts have been successfully matched + + Args: + node: The starting ManifestGroup to begin traversal from + key: The key to use to traverse through groups and arrays + + Returns: + A tuple containing: + - The deepest node reached (ManifestGroup or ManifestArray) + - String with remaining unmatched key portion + """ + var, suffix = key.split("/", 1) if "/" in key else (key, "") + if var in node.arrays: + return node.arrays[var], suffix + if var in node.groups: + return _get_deepest_group_or_array(node.groups[var], suffix) + # Can't traverse deeper - return last node and remainder + return node, suffix or var class ManifestStore(Store): @@ -169,13 +188,13 @@ async def get( byte_range: ByteRequest | None = None, ) -> Buffer | None: # docstring inherited - node = get_deepest_group_or_array(self._group, key.split("/")[:-1]) - if key.endswith("zarr.json"): + node, suffix = _get_deepest_group_or_array(self._group, key) + if suffix.endswith("zarr.json"): # Return metadata return node.metadata.to_buffer_dict(prototype=default_buffer_prototype())[ "zarr.json" ] - elif key.endswith((".zattrs", ".zgroup", ".zarray", ".zmetadata")): + elif 
suffix.endswith((".zattrs", ".zgroup", ".zarray", ".zmetadata")): # Zarr-Python expects store classes to return None when metadata JSONs are not found. # Zarr-Python uses this behavior to distinguish between V2/V3 and consolidated/unconsolidated stores. # This upstream behavior will hopefully change in the future to be more Zarr-hierarchy aware, in @@ -190,7 +209,7 @@ async def get( separator: Literal[".", "/"] = getattr( node.metadata.chunk_key_encoding, "separator", "." ) - chunk_indexes = parse_manifest_index(key, separator) + chunk_indexes = parse_manifest_index(suffix, separator) path = manifest._paths[chunk_indexes] if path == "": @@ -286,9 +305,37 @@ def list_prefix(self, prefix: str) -> AsyncGenerator[str, None]: async def list_dir(self, prefix: str) -> AsyncGenerator[str, None]: # docstring inherited - yield "zarr.json" - for k in self._group.arrays.keys(): - yield k + # Navigate to the target node + node, suffix = _get_deepest_group_or_array(self._group, prefix) + # Zarr-Python lists using a per-path basis, so we don't have anything to list + # as long as there is a suffix remaining and we require a '.' chunk separator in the ManifestArrays + if suffix: + return + # List contents based on node type + if isinstance(node, ManifestGroup): + # Groups contain a metadata document and the name of sub-groups/arrays + yield "zarr.json" + for member_name in node._members.keys(): + yield member_name + # TODO: Support listing when using other chunk_key_encodings + elif ( + separator := getattr(node.metadata.chunk_key_encoding, "separator", None) + != "." + ): + raise NotImplementedError( + f"Array listing only supports '.' 
as chunk key separator, " + f"got {separator!r}" + ) + else: + # Arrays contain a metadata document and chunks + yield "zarr.json" + if node.shape == (): + # Scalar arrays have a single chunk named 'c' + yield "c" + else: + # Multi-dimensional arrays have chunks named 'c.{key}' + for chunk_key in node.manifest.keys(): + yield f"c.{chunk_key}" def to_virtual_dataset( self, diff --git a/virtualizarr/tests/test_manifests/test_store.py b/virtualizarr/tests/test_manifests/test_store.py index 2e571725..0fc11168 100644 --- a/virtualizarr/tests/test_manifests/test_store.py +++ b/virtualizarr/tests/test_manifests/test_store.py @@ -326,7 +326,15 @@ async def test_pickling(self, local_store): async def test_list_dir(self, manifest_store, request) -> None: store = request.getfixturevalue(manifest_store) observed = await _collect_aiterator(store.list_dir("")) - assert observed == ("zarr.json", "foo", "bar", "scalar") + assert observed == ("zarr.json", "foo", "bar", "scalar", "subgroup") + observed = await _collect_aiterator(store.list_dir("scalar")) + assert observed == ("zarr.json", "c") + observed = await _collect_aiterator(store.list_dir("scalar/d")) + assert observed == () + observed = await _collect_aiterator(store.list_dir("foo/")) + assert observed == ("zarr.json", "c.0.0", "c.0.1", "c.1.0", "c.1.1") + observed = await _collect_aiterator(store.list_dir("subgroup/foo/")) + assert observed == ("zarr.json", "c.0.0", "c.0.1", "c.1.0", "c.1.1") @pytest.mark.asyncio async def test_store_raises(self, local_store) -> None: From 437a5d87f1d3d74dfe57988ab57d1ab4be36ccf8 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Fri, 14 Nov 2025 12:31:40 -0500 Subject: [PATCH 4/7] Feat: add top-level open_virtual_datatree --- pyproject.toml | 1 + virtualizarr/__init__.py | 7 +- virtualizarr/manifests/group.py | 32 ++++++ virtualizarr/manifests/store.py | 40 ++++++++ virtualizarr/tests/test_parsers/test_tiff.py | 24 ++++- virtualizarr/xarray.py | 
102 ++++++++++++++++++- 6 files changed, 201 insertions(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 9108b15a..898c7da6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -101,6 +101,7 @@ upstream = [ 's3fs @ git+https://github.com/fsspec/s3fs', 'kerchunk @ git+https://github.com/fsspec/kerchunk', 'icechunk @ git+https://github.com/earth-mover/icechunk#subdirectory=icechunk-python', + 'virtual_tiff @ git+https://github.com/virtual-zarr/virtual-tiff', ] docs = [ "mkdocs-material[imaging]>=9.6.14", diff --git a/virtualizarr/__init__.py b/virtualizarr/__init__.py index 34264206..8ffc6e71 100644 --- a/virtualizarr/__init__.py +++ b/virtualizarr/__init__.py @@ -4,7 +4,11 @@ VirtualiZarrDatasetAccessor, VirtualiZarrDataTreeAccessor, ) -from virtualizarr.xarray import open_virtual_dataset, open_virtual_mfdataset +from virtualizarr.xarray import ( + open_virtual_dataset, + open_virtual_datatree, + open_virtual_mfdataset, +) try: __version__ = _version("virtualizarr") @@ -18,4 +22,5 @@ "VirtualiZarrDataTreeAccessor", "open_virtual_dataset", "open_virtual_mfdataset", + "open_virtual_datatree", ] diff --git a/virtualizarr/manifests/group.py b/virtualizarr/manifests/group.py index 7f637fcf..3a5a6091 100644 --- a/virtualizarr/manifests/group.py +++ b/virtualizarr/manifests/group.py @@ -75,6 +75,11 @@ def groups(self) -> dict[str, "ManifestGroup"]: """Subgroups contained in this group.""" return {k: v for k, v in self._members.items() if isinstance(v, ManifestGroup)} + @property + def contains_groups(self) -> bool: + """True if this group has subgroups.""" + return any(isinstance(v, ManifestGroup) for v in self._members.values()) + def __getitem__(self, path: str) -> "ManifestArray | ManifestGroup": """Obtain a group member.""" if "/" in path: @@ -125,3 +130,30 @@ def to_virtual_dataset(self) -> xr.Dataset: coord_names=coord_names, attrs=attributes, ) + + def to_virtual_datasets(self) -> dict[str, xr.Dataset]: + """ + Create a "virtual" 
[xarray.DataTree][] containing the contents of one zarr group. + + All variables in the returned DataTree will be "virtual", i.e. they will wrap ManifestArray objects. + """ + result = {"": self.to_virtual_dataset()} + + # Recursively process all subgroups + for group_name, subgroup in self.groups.items(): + subgroup_datasets = subgroup.to_virtual_datasets() + + # Add the subgroup's datasets with proper path prefixes + for subpath, dataset in subgroup_datasets.items(): + if subpath == "": + # Direct child group + full_path = group_name + else: + # Nested subgroup + full_path = f"{group_name}/{subpath}" + result[full_path] = dataset + return result + + def to_virtual_datatree(self) -> xr.DataTree: + datasets = self.to_virtual_datasets() + return xr.DataTree.from_dict(datasets) diff --git a/virtualizarr/manifests/store.py b/virtualizarr/manifests/store.py index 874a253e..d72b70e7 100644 --- a/virtualizarr/manifests/store.py +++ b/virtualizarr/manifests/store.py @@ -374,6 +374,46 @@ def to_virtual_dataset( decode_times=decode_times, ) + def to_virtual_datatree( + self, + group="", + *, + drop_variables: Iterable[str] | None = None, + loadable_variables: Iterable[str] | None = None, + decode_times: bool | None = None, + ) -> "xr.DataTree": + """ + Create a "virtual" [xarray.Datatree][] containing the contents of a zarr group. Default is the root group and all sub-groups. + + Will ignore the contents of any other groups in the store. + + Requires xarray. + + Parameters + ---------- + group : Group to convert to a virtual DataTree + drop_variables + Variables in the data source to drop before returning. + loadable_variables + Variables in the data source to load as Dask/NumPy arrays instead of as virtual arrays. + decode_times + Bool that is passed into [xarray.open_dataset][]. Allows time to be decoded into a datetime object. 
+ + Returns + ------- + vdt : xarray.Datatree + """ + + from virtualizarr.xarray import construct_virtual_datatree + + return construct_virtual_datatree( + manifest_store=self, + group=group, + loadable_variables=loadable_variables, + decode_times=decode_times, + drop_variables=drop_variables, + ) + def _transform_byte_range( byte_range: ByteRequest | None, *, chunk_start: int, chunk_end_exclusive: int diff --git a/virtualizarr/tests/test_parsers/test_tiff.py b/virtualizarr/tests/test_parsers/test_tiff.py index 38ea85a0..02397c0c 100644 --- a/virtualizarr/tests/test_parsers/test_tiff.py +++ b/virtualizarr/tests/test_parsers/test_tiff.py @@ -1,8 +1,8 @@ import pytest from obstore.store import S3Store -from xarray import Dataset +from xarray import Dataset, DataTree -from virtualizarr import open_virtual_dataset +from virtualizarr import open_virtual_dataset, open_virtual_datatree from virtualizarr.registry import ObjectStoreRegistry from virtualizarr.tests import requires_network, requires_tiff @@ -11,7 +11,25 @@ @requires_tiff @requires_network -def test_virtual_tiff() -> None: +def test_virtual_tiff_datatree() -> None: + store = S3Store("sentinel-cogs", region="us-west-2", skip_signature=True) + registry = ObjectStoreRegistry({"s3://sentinel-cogs/": store}) + url = "s3://sentinel-cogs/sentinel-s2-l2a-cogs/12/S/UF/2022/6/S2B_12SUF_20220609_0_L2A/B04.tif" + parser = virtual_tiff.VirtualTIFF(ifd_layout="nested") + with open_virtual_datatree(url=url, parser=parser, registry=registry) as vdt: + assert isinstance(vdt, DataTree) + assert list(vdt["0"].ds.variables) == ["0"] + var = vdt["0"].ds["0"].variable + assert var.sizes == {"y": 10980, "x": 10980} + assert var.dtype == " None: store = S3Store("sentinel-cogs", region="us-west-2", skip_signature=True) registry = ObjectStoreRegistry({"s3://sentinel-cogs/": store}) url = "s3://sentinel-cogs/sentinel-s2-l2a-cogs/12/S/UF/2022/6/S2B_12SUF_20220609_0_L2A/B04.tif" diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py 
index bcd63b20..e868589e 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -21,7 +21,7 @@ from xarray.core.types import NestedSequence from xarray.structure.combine import _infer_concat_order_from_positions, _nested_combine -from virtualizarr.manifests import ManifestStore +from virtualizarr.manifests import ManifestArray, ManifestGroup, ManifestStore from virtualizarr.manifests.manifest import validate_and_normalize_path_to_uri from virtualizarr.parallel import get_executor from virtualizarr.parsers.typing import Parser @@ -35,6 +35,68 @@ ) +def open_virtual_datatree( + url: str, + registry: ObjectStoreRegistry, + parser: Parser, + *, + drop_variables: Iterable[str] | None = None, + loadable_variables: Iterable[str] | None = None, + decode_times: bool | None = None, +) -> xr.DataTree: + """ + Open an archival data source as an [xarray.Datatree][] wrapping virtualized zarr arrays. + + No data variables will be loaded unless specified in the ``loadable_variables`` kwarg (in which case they will open as lazily indexed arrays using xarray's standard lazy indexing classes). + + Xarray indexes can optionally be created (the default behaviour is to create indexes for any 1D coordinate variables). To avoid creating any xarray indexes pass ``indexes={}``. + + Parameters + ---------- + url + The url of the data source to virtualize. The URL should include a scheme. For example: + + - `url="file:///Users/my-name/Documents/my-project/my-data.nc"` for a local data source. + - `url="s3://my-bucket/my-project/my-data.nc"` for a remote data source on an S3 compatible cloud. + + registry + An [ObjectStoreRegistry][virtualizarr.registry.ObjectStoreRegistry] for resolving urls and reading data. + parser + A parser to use for the given data source. For example: + + - [virtualizarr.parsers.HDFParser][] for virtualizing NetCDF4 or HDF5 files. + - [virtualizarr.parsers.FITSParser][] for virtualizing FITS files. 
+ - [virtualizarr.parsers.NetCDF3Parser][] for virtualizing NetCDF3 files. + - [virtualizarr.parsers.KerchunkJSONParser][] for re-opening Kerchunk JSONs. + - [virtualizarr.parsers.KerchunkParquetParser][] for re-opening Kerchunk Parquets. + - [virtualizarr.parsers.ZarrParser][] for virtualizing Zarr stores. + - [virtualizarr.parsers.ZarrParser][] for virtualizing Zarr stores. + drop_variables + Variables in the data source to drop before returning. + loadable_variables + Variables in the data source to load as Dask/NumPy arrays instead of as virtual arrays. + decode_times + Bool that is passed into [xarray.open_dataset][]. Allows time to be decoded into a datetime object. + + Returns + ------- + vds + An [xarray.DataTree][] containing virtual chunk references for all variables. + """ + filepath = validate_and_normalize_path_to_uri(url, fs_root=Path.cwd().as_uri()) + + manifest_store = parser( + url=filepath, + registry=registry, + ) + + return manifest_store.to_virtual_datatree( + loadable_variables=loadable_variables, + decode_times=decode_times, + drop_variables=drop_variables, + ) + + def open_virtual_dataset( url: str, registry: ObjectStoreRegistry, @@ -354,6 +416,44 @@ def construct_virtual_dataset( ) +def construct_virtual_datatree( + manifest_store: ManifestStore, + group: str = "", + *, + drop_variables: Iterable[str] | None = None, + loadable_variables: Iterable[str] | None = None, + decode_times: bool | None = None, +) -> xr.DataTree: + """ + Construct a fully or partly virtual datatree from a ManifestStore. 
+ """ + fully_loadable_datatree = xr.open_datatree( + manifest_store, # type: ignore[arg-type] + group=group, + engine="zarr", + consolidated=False, + zarr_format=3, + decode_times=decode_times, + ) + if group: + node = manifest_store._group[group] + else: + node = manifest_store._group + if isinstance(node, ManifestArray): + node = ManifestGroup(arrays={group: node}, attributes={}) + fully_virtual_datatree = node.to_virtual_datatree() + + partially_loaded_datasets = {} + for name, virtual_node in fully_virtual_datatree.subtree_with_keys: + loadable_node = fully_loadable_datatree[name] + node_dataset = replace_virtual_with_loadable_vars( + virtual_node.to_dataset(), loadable_node.to_dataset(), loadable_variables + ) + node_dataset = node_dataset.drop_vars(list(drop_variables or ())) + partially_loaded_datasets[name] = node_dataset + return xr.DataTree.from_dict(partially_loaded_datasets) + + def replace_virtual_with_loadable_vars( fully_virtual_ds: xr.Dataset, loadable_ds: xr.Dataset, From 5c35e1b3f922a0255ea943723d1d06f97a238281 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Fri, 14 Nov 2025 12:36:04 -0500 Subject: [PATCH 5/7] Remove unused function --- virtualizarr/manifests/group.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/virtualizarr/manifests/group.py b/virtualizarr/manifests/group.py index 3a5a6091..e51a0141 100644 --- a/virtualizarr/manifests/group.py +++ b/virtualizarr/manifests/group.py @@ -75,11 +75,6 @@ def groups(self) -> dict[str, "ManifestGroup"]: """Subgroups contained in this group.""" return {k: v for k, v in self._members.items() if isinstance(v, ManifestGroup)} - @property - def contains_groups(self) -> bool: - """True if this group has subgroups.""" - return any(isinstance(v, ManifestGroup) for v in self._members.values()) - def __getitem__(self, path: str) -> "ManifestArray | ManifestGroup": """Obtain a group member.""" if "/" in path: From e563aa0d2e70a06e1ea038492f215a5ed32cb35b Mon 
Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Fri, 14 Nov 2025 12:50:43 -0500 Subject: [PATCH 6/7] Fix cross-reference --- virtualizarr/manifests/store.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/virtualizarr/manifests/store.py b/virtualizarr/manifests/store.py index d72b70e7..a5dff589 100644 --- a/virtualizarr/manifests/store.py +++ b/virtualizarr/manifests/store.py @@ -383,7 +383,7 @@ def to_virtual_datatree( decode_times: bool | None = None, ) -> "xr.DataTree": """ - Create a "virtual" [xarray.Datatree][] containing the contents of a zarr group. Default is the root group and all sub-groups. + Create a "virtual" [xarray.DataTree][] containing the contents of a zarr group. Default is the root group and all sub-groups. Will ignore the contents of any other groups in the store. @@ -401,7 +401,7 @@ def to_virtual_datatree( Returns ------- - vdt : xarray.Datatree + vdt : xarray.DataTree """ from virtualizarr.xarray import construct_virtual_datatree From 0f45437033808482a88ab09b23127280b651ce6b Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Fri, 14 Nov 2025 12:54:12 -0500 Subject: [PATCH 7/7] Fixup docstrings --- virtualizarr/manifests/group.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/virtualizarr/manifests/group.py b/virtualizarr/manifests/group.py index e51a0141..d226e844 100644 --- a/virtualizarr/manifests/group.py +++ b/virtualizarr/manifests/group.py @@ -128,9 +128,11 @@ def to_virtual_dataset(self) -> xr.Dataset: def to_virtual_datasets(self) -> dict[str, xr.Dataset]: """ - Create a "virtual" [xarray.DataTree][] containing the contents of one zarr group. + Create a dictionary containing virtual datasets for all the sub-groups of a ManifestGroup. All the + variables in the datasets will be "virtual", i.e., they will wrap ManifestArray objects. - All variables in the returned DataTree will be "virtual", i.e. 
 they will wrap ManifestArray objects. + It is convenient to have a separate `to_virtual_datasets` function from `to_virtual_datatree` so that + it can be called recursively without needing to use `DataTree.to_dict()` and `.from_dict()` repeatedly. """ result = {"": self.to_virtual_dataset()} @@ -150,5 +152,10 @@ def to_virtual_datasets(self) -> dict[str, xr.Dataset]: return result def to_virtual_datatree(self) -> xr.DataTree: + """ + Create a "virtual" [xarray.DataTree][] containing the contents of one zarr group. + + All variables in the returned DataTree will be "virtual", i.e. they will wrap ManifestArray objects. + """ datasets = self.to_virtual_datasets() return xr.DataTree.from_dict(datasets)