diff --git a/virtualizarr/manifests/group.py b/virtualizarr/manifests/group.py index c3caf685..8b624c16 100644 --- a/virtualizarr/manifests/group.py +++ b/virtualizarr/manifests/group.py @@ -1,13 +1,17 @@ from __future__ import annotations import textwrap -from typing import Iterator, Mapping +from time import datetime +from typing import TYPE_CHECKING, Iterator, Mapping, Optional import xarray as xr from zarr.core.group import GroupMetadata from virtualizarr.manifests import ManifestArray +if TYPE_CHECKING: + from icechunk import IcechunkStore # type: ignore[import-not-found] + class ManifestGroup( Mapping[str, "ManifestArray | ManifestGroup"], @@ -131,3 +135,70 @@ def to_virtual_dataset(self) -> xr.Dataset: coord_names=coord_names, attrs=attributes, ) + + @classmethod + def from_virtual_dataset( + cls, + vds: xr.Dataset, + ) -> "ManifestGroup": + """ + Create a new ManifestGroup from a virtual dataset object. + + The virtual dataset should contain only virtual variables, i.e. those backed by ManifestArrays. + + Parameters + ---------- + vds: xr.Dataset + Virtual dataset, containing only virtual variables. + """ + + for name, var in vds.variables.items(): + if not isinstance(var.data, ManifestArray): + raise TypeError( + f"Cannot convert a dataset containing a loadable variable directly to a ManifestGroup, but found variable {name} has type {type(var.data)}" + ) + + manifestarrays = {name: var.data for name, var in vds.variables.items()} + + attributes = vds.attrs + # TODO test this is correct + attributes["dimension_names"] = " ".join(list(vds.dims)) + # TODO test this constructor round-trips coordinates + attributes["coordinates"] = " ".join(list(vds.coords)) + + return cls(arrays=manifestarrays, attributes=attributes) + + def to_icechunk( + store: "IcechunkStore", + group: Optional[str] = None, + append_dim: Optional[str] = None, + last_updated_at: Optional[datetime] = None, + ) -> None: + """ + Write the contents of this ManifestGroup into an Icechunk Store as virtual chunks. + + Parameters + ---------- + store: IcechunkStore + Store to write dataset into. + group: str, optional + Path of the group to write the dataset into (default: the root group). + append_dim: str, optional + Dimension along which to append the virtual dataset. + last_updated_at: datetime, optional + Datetime to use as a checksum for any virtual chunks written to the store + with this operation. When not provided, no check is performed. + + Raises + ------ + ValueError + If the store is read-only. + """ + from virtualizarr.writers.icechunk import write_manifestgroup_to_icechunk + + write_manifestgroup_to_icechunk( + store=store, + group=group, + append_dim=append_dim, + last_updated_at=last_updated_at, + ) diff --git a/virtualizarr/writers/icechunk.py b/virtualizarr/writers/icechunk.py index 536fe727..9a4ef74b 100644 --- a/virtualizarr/writers/icechunk.py +++ b/virtualizarr/writers/icechunk.py @@ -6,10 +6,9 @@ from xarray.backends.zarr import encode_zarr_attr_value from zarr import Array, Group -from virtualizarr.codecs import get_codecs -from virtualizarr.manifests import ChunkManifest, ManifestArray +from virtualizarr.codecs import extract_codecs, get_codecs +from virtualizarr.manifests import ChunkManifest, ManifestArray, ManifestGroup from virtualizarr.manifests.utils import ( - check_compatible_encodings, check_same_chunk_shapes, check_same_codecs, check_same_dtypes, @@ -294,20 +293,43 @@ def write_virtual_dataset_to_icechunk_group( append_dim=append_dim, ) + # TODO make sure coordinates get set + fully_virtual_ds = xr.Dataset(loadable_variables, attrs=vds.attrs) + manifest_group = ManifestGroup.from_virtual_dataset(fully_virtual_ds) + + # note: this calls write_manifestgroup_to_icechunk + manifest_group.to_icechunk( + store=store, + group=group.name, + append_dim=append_dim, + last_updated_at=last_updated_at, + ) + + +def write_manifestgroup_to_icechunk( + manifest_group: ManifestGroup, + store: "IcechunkStore", + *, + group: Optional[str] = None, + append_dim: Optional[str] = None, + last_updated_at: Optional[datetime] = None, +) -> None: + """Write the contents of a single ManifestGroup into an Icechunk Store as virtual chunks.""" + # Then write the virtual variables to the same group - for name, var in virtual_variables.items(): - write_virtual_variable_to_icechunk( + for name, marr in manifest_group.arrays(): + write_manifestarray_to_icechunk( store=store, group=group, name=name, # type: ignore[arg-type] - var=var, + marr=marr, append_dim=append_dim, last_updated_at=last_updated_at, ) # finish by writing group-level attributes # note: group attributes must be set after writing individual variables else it gets overwritten - update_attributes(group, vds.attrs, coords=vds.coords) + update_attributes(group, group.metadata.attributes) def update_attributes( @@ -370,23 +392,19 @@ def check_compatible_arrays( check_same_shapes_except_on_concat_axis(arr_shapes, append_axis) -def write_virtual_variable_to_icechunk( +def write_manifestarray_to_icechunk( store: "IcechunkStore", group: "Group", name: str, - var: xr.Variable, + ma: ManifestArray, append_dim: Optional[str] = None, last_updated_at: Optional[datetime] = None, ) -> None: """Write a single virtual variable into an icechunk store""" - from zarr import Array - from virtualizarr.codecs import extract_codecs - - ma = cast(ManifestArray, var.data) metadata = ma.metadata - dims: list[str] = cast(list[str], list(var.dims)) + dims: list[str] = cast(list[str], list(ma.metadata.dimension_names)) existing_num_chunks = 0 if append_dim and append_dim in dims: # TODO: MRP - zarr, or icechunk zarr, array assignment to a variable doesn't work to point to the same object @@ -397,7 +415,9 @@ def write_virtual_variable_to_icechunk( # check if arrays can be concatenated check_compatible_arrays(ma, group[name], append_axis) # type: ignore[arg-type] - check_compatible_encodings(var.encoding, group[name].attrs) + + # TODO how to check encodings for manifestarrays here? + # check_compatible_encodings(var.encoding, group[name].attrs) # determine number of existing chunks along the append axis existing_num_chunks = num_chunks( @@ -427,6 +447,7 @@ def write_virtual_variable_to_icechunk( fill_value=metadata.fill_value, ) + # TODO update_attributes(arr, var.attrs, encoding=var.encoding) write_manifest_virtual_refs(