Skip to content

Commit adc7f01

Browse files
authored
Refactor ManifestGroup (#518)
* improve type hinting by just using Mapping * rename manifest array attribute * add metadata property * make ManifestGroup a Mapping * rename manifest_group -> group * typing * improve the repr * test new group API
1 parent 810d4d0 commit adc7f01

File tree

5 files changed

+141
-52
lines changed

5 files changed

+141
-52
lines changed

virtualizarr/manifests/__init__.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# Note: This directory is named "manifests" rather than "manifest".
22
# This is just to avoid conflicting with some type of file called manifest that .gitignore recommends ignoring.
33

4-
from .array import ManifestArray # type: ignore # noqa
5-
from .group import ManifestGroup # type: ignore # noqa
6-
from .manifest import ChunkEntry, ChunkManifest # type: ignore # noqa
7-
from .store import ManifestStore # type: ignore # noqa
4+
from virtualizarr.manifests.array import ManifestArray # type: ignore # noqa
5+
from virtualizarr.manifests.group import ManifestGroup # type: ignore # noqa
6+
from virtualizarr.manifests.manifest import ChunkEntry, ChunkManifest # type: ignore # noqa
7+
from virtualizarr.manifests.store import ManifestStore # type: ignore # noqa

virtualizarr/manifests/group.py

Lines changed: 86 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,37 +1,107 @@
1-
from typing import Mapping, TypeAlias
1+
from __future__ import annotations
2+
3+
import textwrap
4+
from typing import Iterator, Mapping
25

36
from zarr.core.group import GroupMetadata
47

58
from virtualizarr.manifests import ManifestArray
69

7-
ManifestArrayVariableMapping: TypeAlias = dict[str, ManifestArray]
8-
910

10-
class ManifestGroup:
11+
class ManifestGroup(
12+
Mapping[str, "ManifestArray | ManifestGroup"],
13+
):
1114
"""
12-
Virtualized representation of multiple ManifestArrays as a Zarr Group.
15+
Immutable representation of a single virtual zarr group.
1316
"""
1417

15-
# TODO: Consider refactoring according to https://github.com/zarr-developers/VirtualiZarr/pull/490#discussion_r2007805272
16-
_manifest_arrays: Mapping[str, ManifestArray]
18+
_members: Mapping[str, "ManifestArray | ManifestGroup"]
1719
_metadata: GroupMetadata
1820

1921
def __init__(
2022
self,
21-
manifest_arrays: ManifestArrayVariableMapping,
22-
attributes: dict,
23+
arrays: Mapping[str, ManifestArray] | None = None,
24+
groups: Mapping[str, "ManifestGroup"] | None = None,
25+
attributes: dict | None = None,
2326
) -> None:
2427
"""
25-
Create a ManifestGroup from the dictionary of ManifestArrays and the group / dataset level metadata
28+
Create a ManifestGroup containing ManifestArrays and/or sub-groups, as well as any group-level metadata.
2629
2730
Parameters
2831
----------
29-
attributes : attributes to include in Group metadata
30-
manifest_dict : ManifestArrayVariableMapping
32+
arrays : Mapping[str, ManifestArray], optional
33+
ManifestArray objects to represent virtual zarr arrays.
34+
groups : Mapping[str, ManifestGroup], optional
35+
ManifestGroup objects to represent virtual zarr subgroups.
36+
attributes : dict, optional
37+
Zarr attributes to add as zarr group metadata.
3138
"""
32-
3339
self._metadata = GroupMetadata(attributes=attributes)
34-
self._manifest_arrays = manifest_arrays
3540

36-
def __str__(self) -> str:
37-
return f"ManifestGroup(manifest_arrays={self._manifest_arrays}, metadata={self._metadata})"
41+
_arrays: Mapping[str, ManifestArray] = {} if arrays is None else arrays
42+
43+
if groups:
44+
# TODO add support for nested groups
45+
raise NotImplementedError
46+
else:
47+
_groups: Mapping[str, ManifestGroup] = {} if groups is None else groups
48+
49+
for name, arr in _arrays.items():
50+
if not isinstance(arr, ManifestArray):
51+
raise TypeError(
52+
f"ManifestGroup can only wrap ManifestArray objects, but array {name} passed is of type {type(arr)}"
53+
)
54+
55+
# TODO type check groups passed
56+
57+
# TODO check that all arrays have the same shapes or dimensions?
58+
# Technically that's allowed by the zarr model, so we should theoretically only check that upon converting to xarray
59+
60+
colliding_names = set(_arrays.keys()).intersection(set(_groups.keys()))
61+
if colliding_names:
62+
raise ValueError(
63+
f"Some names collide as they are present in both the array and group keys: {colliding_names}"
64+
)
65+
66+
self._members = {**_arrays, **_groups}
67+
68+
@property
69+
def metadata(self) -> GroupMetadata:
70+
"""Zarr group metadata."""
71+
return self._metadata
72+
73+
@property
74+
def arrays(self) -> dict[str, ManifestArray]:
75+
"""ManifestArrays contained in this group."""
76+
return {k: v for k, v in self._members.items() if isinstance(v, ManifestArray)}
77+
78+
@property
79+
def groups(self) -> dict[str, "ManifestGroup"]:
80+
"""Subgroups contained in this group."""
81+
return {k: v for k, v in self._members.items() if isinstance(v, ManifestGroup)}
82+
83+
def __getitem__(self, path: str) -> "ManifestArray | ManifestGroup":
84+
"""Obtain a group member."""
85+
if "/" in path:
86+
raise ValueError(
87+
f"ManifestGroup.__getitem__ can only be used to get immediate subgroups and subarrays, but received multi-part path {path}"
88+
)
89+
90+
return self._members[path]
91+
92+
def __iter__(self) -> Iterator[str]:
93+
return iter(self._members.keys())
94+
95+
def __len__(self) -> int:
96+
return len(self._members)
97+
98+
def __repr__(self) -> str:
99+
return textwrap.dedent(
100+
f"""
101+
ManifestGroup(
102+
arrays={self.arrays},
103+
groups={self.groups},
104+
metadata={self.metadata},
105+
)
106+
"""
107+
)

virtualizarr/manifests/store.py

Lines changed: 25 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
import pickle
44
from collections.abc import Iterable
5-
from typing import TYPE_CHECKING, Any
5+
from typing import TYPE_CHECKING, Any, Mapping
66
from urllib.parse import urlparse
77

88
from zarr.abc.store import (
@@ -15,6 +15,7 @@
1515
from zarr.core.buffer import Buffer
1616
from zarr.core.buffer.core import BufferPrototype
1717

18+
from virtualizarr.manifests.array import ManifestArray
1819
from virtualizarr.manifests.group import ManifestGroup
1920

2021
if TYPE_CHECKING:
@@ -27,6 +28,7 @@
2728

2829
__all__ = ["ManifestStore"]
2930

31+
3032
_ALLOWED_EXCEPTIONS: tuple[type[Exception], ...] = (
3133
FileNotFoundError,
3234
IsADirectoryError,
@@ -39,7 +41,6 @@
3941

4042
from zarr.core.buffer import default_buffer_prototype
4143

42-
from virtualizarr.manifests.group import ManifestArrayVariableMapping
4344
from virtualizarr.vendor.zarr.metadata import dict_to_buffer
4445

4546
if TYPE_CHECKING:
@@ -59,20 +60,20 @@ class StoreRequest:
5960

6061

6162
async def list_dir_from_manifest_arrays(
62-
manifest_arrays: ManifestArrayVariableMapping, prefix: str
63+
arrays: Mapping[str, ManifestArray], prefix: str
6364
) -> AsyncGenerator[str]:
6465
"""Create the expected results for Zarr's `store.list_dir()` from an Xarray DataArrray or Dataset
6566
6667
Parameters
6768
----------
68-
manifest_arrays : ManifestArrayVariableMapping
69+
arrays : Mapping[str, ManifestArrays]
6970
prefix : str
7071
71-
7272
Returns
7373
-------
7474
AsyncIterator[str]
7575
"""
76+
# TODO shouldn't this just accept a ManifestGroup instead?
7677
# Start with expected group level metadata
7778
raise NotImplementedError
7879

@@ -99,11 +100,11 @@ def get_zarr_metadata(manifest_group: ManifestGroup, key: str) -> Buffer:
99100
# If requesting the root metadata, return the standard group metadata with additional dataset specific attributes
100101

101102
if key == "zarr.json":
102-
metadata = manifest_group._metadata.to_dict()
103+
metadata = manifest_group.metadata.to_dict()
103104
return dict_to_buffer(metadata, prototype=default_buffer_prototype())
104105
else:
105106
var, _ = key.split("/")
106-
metadata = manifest_group._manifest_arrays[var].metadata.to_dict()
107+
metadata = manifest_group.arrays[var].metadata.to_dict()
107108
return dict_to_buffer(metadata, prototype=default_buffer_prototype())
108109

109110

@@ -170,16 +171,17 @@ def find_matching_store(stores: StoreDict, request_key: str) -> StoreRequest:
170171

171172

172173
class ManifestStore(Store):
173-
"""A read-only Zarr store that uses obstore to access data on AWS, GCP, Azure. The requests
174+
"""
175+
A read-only Zarr store that uses obstore to access data on AWS, GCP, Azure. The requests
174176
from the Zarr API are redirected using the :class:`virtualizarr.manifests.ManifestGroup` containing
175177
multiple :class:`virtualizarr.manifests.ManifestArray`,
176178
allowing for virtually interfacing with underlying data in other file format.
177179
178-
179180
Parameters
180181
----------
181-
manifest_group : ManifestGroup
182-
Manifest Group containing Group metadata and mapping variable names to ManifestArrays
182+
group : ManifestGroup
183+
Root group of the store.
184+
Contains group metadata, ManifestArrays, and any subgroups.
183185
stores : dict[prefix, :class:`obstore.store.ObjectStore`]
184186
A mapping of url prefixes to obstore Store instances set up with the proper credentials.
185187
@@ -196,15 +198,15 @@ class ManifestStore(Store):
196198
Modified from https://github.com/zarr-developers/zarr-python/pull/1661
197199
"""
198200

199-
_manifest_group: ManifestGroup
201+
_group: ManifestGroup
200202
_stores: StoreDict
201203

202204
def __eq__(self, value: object):
203205
NotImplementedError
204206

205207
def __init__(
206208
self,
207-
manifest_group: ManifestGroup,
209+
group: ManifestGroup,
208210
*,
209211
stores: StoreDict, # TODO: Consider using a sequence of tuples rather than a dict (see https://github.com/zarr-developers/VirtualiZarr/pull/490#discussion_r2010717898).
210212
) -> None:
@@ -223,14 +225,17 @@ def __init__(
223225
for store in stores.values():
224226
if not store.__class__.__module__.startswith("obstore"):
225227
raise TypeError(f"expected ObjectStore class, got {store!r}")
228+
226229
# TODO: Don't allow stores with prefix
227-
# TODO: Type check the manifest arrays
230+
if not isinstance(group, ManifestGroup):
231+
raise TypeError
232+
228233
super().__init__(read_only=True)
229234
self._stores = stores
230-
self._manifest_group = manifest_group
235+
self._group = group
231236

232237
def __str__(self) -> str:
233-
return f"ManifestStore({self._manifest_group}, {self._stores})"
238+
return f"ManifestStore(group={self._group}, stores={self._stores})"
234239

235240
def __getstate__(self) -> dict[Any, Any]:
236241
state = self.__dict__.copy()
@@ -257,10 +262,10 @@ async def get(
257262
import obstore as obs
258263

259264
if key.endswith("zarr.json"):
260-
return get_zarr_metadata(self._manifest_group, key)
265+
return get_zarr_metadata(self._group, key)
261266
var, chunk_key = parse_manifest_index(key)
262-
marr = self._manifest_group._manifest_arrays[var]
263-
manifest = marr._manifest
267+
marr = self._group.arrays[var]
268+
manifest = marr.manifest
264269

265270
path = manifest._paths[*chunk_key]
266271
offset = manifest._offsets[*chunk_key]
@@ -345,7 +350,7 @@ def list_prefix(self, prefix: str) -> AsyncGenerator[str, None]:
345350
async def list_dir(self, prefix: str) -> AsyncGenerator[str, None]:
346351
# docstring inherited
347352
yield "zarr.json"
348-
for k in self._manifest_group._manifest_arrays.keys():
353+
for k in self._group.arrays.keys():
349354
yield k
350355

351356

virtualizarr/tests/test_manifests/test_group.py

Lines changed: 23 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import textwrap
2+
13
import pytest
24
from zarr.core.group import GroupMetadata
35

@@ -20,17 +22,28 @@ def manifest_array(array_v3_metadata):
2022

2123

2224
class TestManifestGroup:
23-
def test_manifest_array(self, array_v3_metadata, manifest_array):
25+
def test_group_containing_array(self, manifest_array):
2426
var = "foo"
25-
manifest_group = ManifestGroup(
26-
manifest_arrays={var: manifest_array}, attributes={}
27-
)
28-
assert isinstance(manifest_group._manifest_arrays, dict)
29-
assert isinstance(manifest_group._manifest_arrays[var], ManifestArray)
30-
assert isinstance(manifest_group._metadata, GroupMetadata)
27+
manifest_group = ManifestGroup(arrays={var: manifest_array}, attributes={})
28+
29+
assert manifest_group.arrays == {var: manifest_array}
30+
assert manifest_group.groups == {}
31+
assert isinstance(manifest_group[var], ManifestArray)
32+
with pytest.raises(KeyError):
33+
manifest_group["bar"]
34+
assert isinstance(manifest_group.metadata, GroupMetadata)
35+
assert len(manifest_group) == 1
36+
assert list(manifest_group) == [var]
3137

3238
def test_manifest_repr(self, manifest_array):
33-
manifest_group = ManifestGroup(
34-
manifest_arrays={"foo": manifest_array}, attributes={}
39+
manifest_group = ManifestGroup(arrays={"foo": manifest_array}, attributes={})
40+
expected_repr = textwrap.dedent(
41+
"""
42+
ManifestGroup(
43+
arrays={'foo': ManifestArray<shape=(5, 2, 20), dtype=int32, chunks=(5, 1, 10)>},
44+
groups={},
45+
metadata=GroupMetadata(attributes={}, zarr_format=3, consolidated_metadata=None, node_type='group'),
46+
)
47+
"""
3548
)
36-
assert str(manifest_group)
49+
assert repr(manifest_group) == expected_repr

virtualizarr/tests/test_manifests/test_store.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -71,9 +71,10 @@ def _generate_manifest_store(
7171
array_metadata = array_v3_metadata(shape=shape, chunks=chunks)
7272
manifest_array = ManifestArray(metadata=array_metadata, chunkmanifest=manifest)
7373
manifest_group = ManifestGroup(
74-
{"foo": manifest_array, "bar": manifest_array}, attributes={"Zarr": "Hooray!"}
74+
arrays={"foo": manifest_array, "bar": manifest_array},
75+
attributes={"Zarr": "Hooray!"},
7576
)
76-
return ManifestStore(stores={prefix: store}, manifest_group=manifest_group)
77+
return ManifestStore(stores={prefix: store}, group=manifest_group)
7778

7879

7980
@pytest.fixture()

0 commit comments

Comments
 (0)