Skip to content

Commit e5fff19

Browse files
Add methods to create virtual dataset from manifeststore (#522)
* improve type hinting by just using Mapping
* rename manifest array attribute
* add metadata property
* make ManifestGroup a Mapping
* rename manifest_group -> group
* typing
* improve the repr
* test new group API
* sketch out conversion method
* more sketching
* add to_virtual_variable
* add ManifestGroup.to_virtual_dataset()
* linting
* consolidate duplicated manifest_array fixtures
* ensure coordinates are understood
* move test to store-level instead of group-level
* [pre-commit.ci] auto fixes from pre-commit.com hooks (for more information, see https://pre-commit.ci)
* docs
* Un-comment out icechunk upstream

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 8b67060 commit e5fff19

File tree

17 files changed: 193 additions and 41 deletions.

conftest.py

Lines changed: 11 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,7 @@
33
# Standard library imports
44
import itertools
55
from pathlib import Path
6-
from typing import Any, Callable, Mapping, Optional
6+
from typing import Any, Callable, Iterable, Mapping, Optional
77

88
# Third-party imports
99
import h5py # type: ignore[import]
@@ -266,6 +266,8 @@ def _create_metadata(
266266
data_type: np.dtype = np.dtype("int32"),
267267
codecs: list[dict] | None = None,
268268
fill_value: int | float | None = None,
269+
attributes: dict | None = None,
270+
dimension_names: Iterable[str] | None = None,
269271
):
270272
codecs = codecs or [{"configuration": {"endian": "little"}, "name": "bytes"}]
271273
return create_v3_array_metadata(
@@ -274,6 +276,8 @@ def _create_metadata(
274276
data_type=data_type,
275277
codecs=codecs,
276278
fill_value=fill_value or 0,
279+
attributes=attributes,
280+
dimension_names=dimension_names,
277281
)
278282

279283
return _create_metadata
@@ -288,11 +292,14 @@ def manifest_array(array_v3_metadata):
288292
"""
289293

290294
def _manifest_array(
291-
shape: tuple = (5, 5),
292-
chunks: tuple = (5, 5),
295+
shape: tuple = (5, 2),
296+
chunks: tuple = (5, 2),
293297
codecs: list[dict] | None = [ARRAYBYTES_CODEC, ZLIB_CODEC],
298+
dimension_names: Iterable[str] | None = None,
294299
):
295-
metadata = array_v3_metadata(shape=shape, chunks=chunks, codecs=codecs)
300+
metadata = array_v3_metadata(
301+
shape=shape, chunks=chunks, codecs=codecs, dimension_names=dimension_names
302+
)
296303
entries = _generate_chunk_entries(shape, chunks, _entry_from_chunk_key)
297304
chunkmanifest = ChunkManifest(entries=entries)
298305
return ManifestArray(chunkmanifest=chunkmanifest, metadata=metadata)

docs/releases.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@ New Features
1010
~~~~~~~~~~~~
1111

1212
- Added experimental ManifestStore (:pull:`490`).
13+
- Added :py:meth:`ManifestStore.to_virtual_dataset()` method (:pull:`522`).
14+
By `Tom Nicholas <https://github.com/TomNicholas>`_.
1315
- Added experimental :py:func:`open_virtual_mfdataset` function (:issue:`345`, :pull:`349`).
1416
By `Tom Nicholas <https://github.com/TomNicholas>`_.
1517

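File renamed without changes. (The file name was lost in extraction; judging by the import updates in the reader modules below, this is presumably virtualizarr/readers/common.py → virtualizarr/common.py.)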

virtualizarr/manifests/array.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from typing import Any, Callable, Union
33

44
import numpy as np
5+
import xarray as xr
56
from zarr.core.metadata.v3 import ArrayV3Metadata, RegularChunkGrid
67

78
import virtualizarr.manifests.utils as utils
@@ -277,6 +278,30 @@ def rename_paths(
277278
renamed_manifest = self.manifest.rename_paths(new)
278279
return ManifestArray(metadata=self.metadata, chunkmanifest=renamed_manifest)
279280

281+
def to_virtual_variable(self) -> xr.Variable:
282+
"""
283+
Create a "virtual" xarray.Variable containing the contents of one zarr array.
284+
285+
The returned variable will be "virtual", i.e. it will wrap a single ManifestArray object.
286+
"""
287+
288+
# The xarray data model stores dimension names and arbitrary extra metadata outside of the wrapped array class,
289+
# so to avoid that information being duplicated we strip it from the ManifestArray before wrapping it.
290+
dims = self.metadata.dimension_names
291+
attrs = self.metadata.attributes
292+
stripped_metadata = utils.copy_and_replace_metadata(
293+
self.metadata, new_dimension_names=None, new_attributes={}
294+
)
295+
stripped_marr = ManifestArray(
296+
chunkmanifest=self.manifest, metadata=stripped_metadata
297+
)
298+
299+
return xr.Variable(
300+
data=stripped_marr,
301+
dims=dims,
302+
attrs=attrs,
303+
)
304+
280305

281306
def _possibly_expand_trailing_ellipsis(key, ndim: int):
282307
if key[-1] == ...:

virtualizarr/manifests/group.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import textwrap
44
from typing import Iterator, Mapping
55

6+
import xarray as xr
67
from zarr.core.group import GroupMetadata
78

89
from virtualizarr.manifests import ManifestArray
@@ -105,3 +106,28 @@ def __repr__(self) -> str:
105106
)
106107
"""
107108
)
109+
110+
def to_virtual_dataset(self) -> xr.Dataset:
111+
"""
112+
Create a "virtual" xarray.Dataset containing the contents of one zarr group.
113+
114+
All variables in the returned Dataset will be "virtual", i.e. they will wrap ManifestArray objects.
115+
"""
116+
117+
from virtualizarr.common import construct_fully_virtual_dataset
118+
119+
# The xarray data model stores coordinate names outside of the arbitrary extra metadata it can store on a Dataset,
120+
# so to avoid that information being duplicated we strip it from the zarr group attributes before storing it.
121+
metadata_dict = self.metadata.to_dict()
122+
attributes = metadata_dict["attributes"]
123+
coord_names = attributes.pop("coordinates", [])
124+
125+
virtual_vars = {
126+
name: marr.to_virtual_variable() for name, marr in self.arrays.items()
127+
}
128+
129+
return construct_fully_virtual_dataset(
130+
virtual_vars=virtual_vars,
131+
coord_names=coord_names,
132+
attrs=attributes,
133+
)

virtualizarr/manifests/store.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from typing import TYPE_CHECKING, Any, Mapping
66
from urllib.parse import urlparse
77

8+
import xarray as xr
89
from zarr.abc.store import (
910
ByteRequest,
1011
OffsetByteRequest,
@@ -351,6 +352,32 @@ async def list_dir(self, prefix: str) -> AsyncGenerator[str, None]:
351352
for k in self._group.arrays.keys():
352353
yield k
353354

355+
def to_virtual_dataset(self, group="") -> xr.Dataset:
356+
"""
357+
Create a "virtual" xarray dataset containing the contents of one zarr group.
358+
359+
All variables in the returned Dataset will be "virtual", i.e. they will wrap ManifestArray objects.
360+
361+
Will ignore the contents of any other groups in the store.
362+
363+
Parameters
364+
----------
365+
group : str
366+
367+
Returns
368+
-------
369+
vds : xarray.Dataset
370+
"""
371+
372+
if group:
373+
raise NotImplementedError(
374+
"ManifestStore does not yet support nested groups"
375+
)
376+
else:
377+
manifestgroup = self._group
378+
379+
return manifestgroup.to_virtual_dataset()
380+
354381

355382
def _transform_byte_range(
356383
byte_range: ByteRequest | None, *, chunk_start: int, chunk_end_exclusive: int

virtualizarr/manifests/utils.py

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,13 @@
1-
from typing import TYPE_CHECKING, Any, Dict, Iterable, Optional, Union
1+
from typing import TYPE_CHECKING, Any, Dict, Iterable, Literal, Optional, Union
22

33
import numpy as np
44
from zarr import Array
55
from zarr.core.chunk_key_encodings import ChunkKeyEncodingLike
6-
from zarr.core.metadata.v3 import ArrayV3Metadata
6+
from zarr.core.metadata.v3 import (
7+
ArrayV3Metadata,
8+
parse_dimension_names,
9+
parse_shapelike,
10+
)
711

812
from virtualizarr.codecs import convert_to_codec_pipeline, get_codecs
913

@@ -19,7 +23,7 @@ def create_v3_array_metadata(
1923
fill_value: Any = None,
2024
codecs: Optional[list[Dict[str, Any]]] = None,
2125
attributes: Optional[Dict[str, Any]] = None,
22-
dimension_names: Optional[tuple[str, ...]] = None,
26+
dimension_names: Iterable[str] | None = None,
2327
) -> ArrayV3Metadata:
2428
"""
2529
Create an ArrayV3Metadata instance with standard configuration.
@@ -198,17 +202,29 @@ def copy_and_replace_metadata(
198202
old_metadata: ArrayV3Metadata,
199203
new_shape: list[int] | None = None,
200204
new_chunks: list[int] | None = None,
205+
new_dimension_names: Iterable[str] | None | Literal["default"] = "default",
206+
new_attributes: dict | None = None,
201207
) -> ArrayV3Metadata:
202208
"""
203209
Update metadata to reflect a new shape and/or chunk shape.
204210
"""
211+
# TODO this should really be upstreamed into zarr-python
212+
205213
metadata_copy = old_metadata.to_dict().copy()
206-
metadata_copy["shape"] = new_shape # type: ignore[assignment]
214+
215+
if new_shape is not None:
216+
metadata_copy["shape"] = parse_shapelike(new_shape) # type: ignore[assignment]
207217
if new_chunks is not None:
208218
metadata_copy["chunk_grid"] = {
209219
"name": "regular",
210220
"configuration": {"chunk_shape": tuple(new_chunks)},
211221
}
222+
if new_dimension_names != "default":
223+
# need the option to use the literal string "default" as a sentinel value because None is a valid choice for zarr dimension_names
224+
metadata_copy["dimension_names"] = parse_dimension_names(new_dimension_names)
225+
if new_attributes is not None:
226+
metadata_copy["attributes"] = new_attributes
227+
212228
# ArrayV3Metadata.from_dict removes extra keys zarr_format and node_type
213229
new_metadata = ArrayV3Metadata.from_dict(metadata_copy)
214230
return new_metadata

virtualizarr/readers/fits.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,10 @@
33

44
from xarray import Dataset, Index
55

6+
from virtualizarr.common import construct_fully_virtual_dataset
67
from virtualizarr.readers.api import (
78
VirtualBackend,
89
)
9-
from virtualizarr.readers.common import construct_fully_virtual_dataset
1010
from virtualizarr.translators.kerchunk import (
1111
extract_group,
1212
virtual_vars_and_metadata_from_kerchunk_refs,

virtualizarr/readers/hdf/hdf.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,10 @@
1919
from xarray.backends.zarr import FillValueCoder
2020

2121
from virtualizarr.codecs import numcodec_config_to_configurable
22+
from virtualizarr.common import (
23+
construct_fully_virtual_dataset,
24+
replace_virtual_with_loadable_vars,
25+
)
2226
from virtualizarr.manifests import (
2327
ChunkEntry,
2428
ChunkManifest,
@@ -27,10 +31,6 @@
2731
from virtualizarr.manifests.manifest import validate_and_normalize_path_to_uri
2832
from virtualizarr.manifests.utils import create_v3_array_metadata
2933
from virtualizarr.readers.api import VirtualBackend
30-
from virtualizarr.readers.common import (
31-
construct_fully_virtual_dataset,
32-
replace_virtual_with_loadable_vars,
33-
)
3434
from virtualizarr.readers.hdf.filters import cfcodec_from_dataset, codecs_from_dataset
3535
from virtualizarr.types import ChunkKey
3636
from virtualizarr.utils import _FsspecFSFromFilepath, soft_import

virtualizarr/readers/hdf5.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,11 @@
33

44
from xarray import Dataset, Index
55

6-
from virtualizarr.readers.api import VirtualBackend
7-
from virtualizarr.readers.common import (
6+
from virtualizarr.common import (
87
construct_fully_virtual_dataset,
98
replace_virtual_with_loadable_vars,
109
)
10+
from virtualizarr.readers.api import VirtualBackend
1111
from virtualizarr.translators.kerchunk import (
1212
extract_group,
1313
virtual_vars_and_metadata_from_kerchunk_refs,

0 commit comments

Comments
 (0)