Add group param to virtualize Dataset accessor (#391)

chuckwondo · web-flow · commit e8ade460d887 · 2025-01-27T11:10:36.000-05:00
diff --git a/.gitignore b/.gitignore
@@ -161,3 +161,22 @@ cython_debug/
 virtualizarr/_version.py
 docs/generated/
 examples/
+
+# Created by https://www.toptal.com/developers/gitignore/api/visualstudiocode
+# Edit at https://www.toptal.com/developers/gitignore?templates=visualstudiocode
+
+### VisualStudioCode ###
+.vscode
+
+# Local History for Visual Studio Code
+.history/
+
+# Built Visual Studio Code Extensions
+*.vsix
+
+### VisualStudioCode Patch ###
+# Ignore all local history of files
+.history
+.ionide
+
+# End of https://www.toptal.com/developers/gitignore/api/visualstudiocode
diff --git a/docs/releases.rst b/docs/releases.rst
@@ -15,9 +15,10 @@ New Features
   for the `to_icechunk` method to add timestamps as checksums when writing virtual references to an icechunk store. This
   is useful for ensuring that virtual references are not stale when reading from an icechunk store, which can happen if the
   underlying data has changed since the virtual references were written.
-- Add ``group=None`` keyword-only parameter to ``dataset_to_icechunk`` function to
-  allow writing to a nested group at the specified path (root group, if not specified).
-  (:issue:`341`) By `Chuck Daniels <https://github.com/chuckwondo>`_.
+- Add ``group=None`` keyword-only parameter to the
+  ``VirtualiZarrDatasetAccessor.to_icechunk`` method to allow writing to a nested group
+  at a specified group path (the root group is used when no group is specified).
+  (:issue:`341`) By `Chuck Daniels <https://github.com/chuckwondo>`_.
 
 Breaking changes
 ~~~~~~~~~~~~~~~~
@@ -29,10 +30,11 @@ Breaking changes
   Also a warning is no longer thrown when ``indexes=None`` is passed to ``open_virtual_dataset``, and the recommendations in the docs updated to match.
   This also means that ``xarray.combine_by_coords`` will now work when the necessary dimension coordinates are specified in ``loadable_variables``.
   (:issue:`18`, :pull:`357`, :pull:`358`) By `Tom Nicholas <https://github.com/TomNicholas>`_.
-- For function ``dataset_to_icechunk``, parameters ``append_dim`` and ``last_updated_at``
-  are now keyword-only parameters, rather than positional or keyword.  This change is
-  breaking _only_ where arguments for these parameters are currently given positionally.
-  (:issue:`341`) By `Chuck Daniels <https://github.com/chuckwondo>`_.
+- The ``append_dim`` and ``last_updated_at`` parameters of the
+  ``VirtualiZarrDatasetAccessor.to_icechunk`` method are now keyword-only parameters,
+  rather than positional or keyword.  This change is breaking *only* where arguments for
+  these parameters are currently given positionally.  (:issue:`341`) By
+  `Chuck Daniels <https://github.com/chuckwondo>`_.
 
 Deprecations
 ~~~~~~~~~~~~
diff --git a/virtualizarr/accessor.py b/virtualizarr/accessor.py
@@ -1,6 +1,6 @@
 from datetime import datetime
 from pathlib import Path
-from typing import TYPE_CHECKING, Callable, Literal, Optional, overload
+from typing import TYPE_CHECKING, Callable, Literal, overload
 
 from xarray import Dataset, register_dataset_accessor
 
@@ -42,44 +42,67 @@ def to_zarr(self, storepath: str) -> None:
     def to_icechunk(
         self,
         store: "IcechunkStore",
-        append_dim: Optional[str] = None,
-        last_updated_at: Optional[datetime] = None,
+        *,
+        group: str | None = None,
+        append_dim: str | None = None,
+        last_updated_at: datetime | None = None,
     ) -> None:
         """
         Write an xarray dataset to an Icechunk store.
 
-        Any variables backed by ManifestArray objects will be be written as virtual references, any other variables will be loaded into memory before their binary chunk data is written into the store.
+        Any variables backed by ManifestArray objects will be written as virtual
+        references. Any other variables will be loaded into memory before their binary
+        chunk data is written into the store.
 
-        If `append_dim` is provided, the virtual dataset will be appended to the existing IcechunkStore along the `append_dim` dimension.
+        If `append_dim` is provided, the virtual dataset will be appended to the
+        existing IcechunkStore along the `append_dim` dimension.
 
-        If `last_updated_at` is provided, it will be used as a checksum for any virtual chunks written to the store with this operation.
-        At read time, if any of the virtual chunks have been updated since this provided datetime, an error will be raised.
-        This protects against reading outdated virtual chunks that have been updated since the last read. When not provided, no check is performed.
-        This value is stored in Icechunk with seconds precision, so be sure to take that into account when providing this value.
+        If `last_updated_at` is provided, it will be used as a checksum for any virtual
+        chunks written to the store with this operation.  At read time, if any of the
+        virtual chunks have been updated since this provided datetime, an error will be
+        raised.  This protects against reading outdated virtual chunks that have been
+        updated since the last read.  When not provided, no check is performed.  This
+        value is stored in Icechunk with seconds precision, so be sure to take that into
+        account when providing this value.
 
         Parameters
         ----------
         store: IcechunkStore
+            Store to write dataset into.
+        group: str, optional
+            Path of the group to write the dataset into (default: the root group).
         append_dim: str, optional
-            When provided, specifies the dimension along which to append the virtual dataset.
+            Dimension along which to append the virtual dataset.
         last_updated_at: datetime, optional
-            When provided, uses provided datetime as a checksum for any virtual chunks written to the store with this operation.
-            When not provided (default), no check is performed.
+            Datetime to use as a checksum for any virtual chunks written to the store
+            with this operation.  When not provided, no check is performed.
+
+        Raises
+        ------
+        ValueError
+            If the store is read-only.
 
         Examples
         --------
-        To ensure an error is raised if the files containing referenced virtual chunks are modified at any time from now on, pass the current time to ``last_updated_at``.
+        To ensure an error is raised if the files containing referenced virtual chunks
+        are modified at any time from now on, pass the current time to
+        ``last_updated_at``.
 
         >>> from datetime import datetime
-        >>>
-        >>> vds.virtualize.to_icechunk(
+        >>> vds.virtualize.to_icechunk(  # doctest: +SKIP
         ...     icechunkstore,
         ...     last_updated_at=datetime.now(),
         ... )
         """
         from virtualizarr.writers.icechunk import dataset_to_icechunk
 
-        dataset_to_icechunk(self.ds, store, append_dim=append_dim)
+        dataset_to_icechunk(
+            self.ds,
+            store,
+            group=group,
+            append_dim=append_dim,
+            last_updated_at=last_updated_at,
+        )
 
     @overload
     def to_kerchunk(
diff --git a/virtualizarr/tests/test_writers/test_icechunk.py b/virtualizarr/tests/test_writers/test_icechunk.py
@@ -16,7 +16,7 @@
 
 from virtualizarr.manifests import ChunkManifest, ManifestArray
 from virtualizarr.readers.common import separate_coords
-from virtualizarr.writers.icechunk import dataset_to_icechunk, generate_chunk_key
+from virtualizarr.writers.icechunk import generate_chunk_key
 from virtualizarr.zarr import ZArray
 
 if TYPE_CHECKING:
@@ -47,8 +47,8 @@ def test_invalid_kwarg_type(
 ):
     name, value = kwarg
     with pytest.raises(TypeError, match=name):
-        dataset_to_icechunk(
-            vds_with_manifest_arrays, icechunk_filestore, **{name: value}
+        vds_with_manifest_arrays.virtualize.to_icechunk(
+            icechunk_filestore, **{name: value}
         )
 
 
@@ -60,7 +60,7 @@ def test_write_new_virtual_variable(
 ):
     vds = vds_with_manifest_arrays
 
-    dataset_to_icechunk(vds, icechunk_filestore, group=group_path)
+    vds.virtualize.to_icechunk(icechunk_filestore, group=group_path)
 
     # check attrs
     group = zarr.group(store=icechunk_filestore, path=group_path)
@@ -121,7 +121,7 @@ def test_set_single_virtual_ref_without_encoding(
         {"foo": foo},
     )
 
-    dataset_to_icechunk(vds, icechunk_filestore)
+    vds.virtualize.to_icechunk(icechunk_filestore)
 
     root_group = zarr.group(store=icechunk_filestore)
     array = root_group["foo"]
@@ -175,7 +175,7 @@ def test_set_single_virtual_ref_with_encoding(
         )
         vds = xr.Dataset({"air": air}, attrs=expected_ds.attrs)
 
-        dataset_to_icechunk(vds, icechunk_filestore)
+        vds.virtualize.to_icechunk(icechunk_filestore)
 
         root_group = zarr.group(store=icechunk_filestore)
         air_array = root_group["air"]
@@ -239,7 +239,7 @@ def test_set_grid_virtual_refs(icechunk_filestore: "IcechunkStore", netcdf4_file
         {"air": air},
     )
 
-    dataset_to_icechunk(vds, icechunk_filestore)
+    vds.virtualize.to_icechunk(icechunk_filestore)
 
     root_group = zarr.group(store=icechunk_filestore)
     air_array = root_group["air"]
@@ -298,7 +298,7 @@ def test_write_loadable_variable(
     # Icechunk checksums currently store with second precision, so we need to make sure
     # the checksum_date is at least one second in the future
     checksum_date = datetime.now(timezone.utc) + timedelta(seconds=1)
-    dataset_to_icechunk(vds, icechunk_filestore, last_updated_at=checksum_date)
+    vds.virtualize.to_icechunk(icechunk_filestore, last_updated_at=checksum_date)
 
     root_group = zarr.group(store=icechunk_filestore)
     air_array = root_group["air"]
@@ -354,11 +354,11 @@ def test_checksum(
     # Icechunk checksums currently store with second precision, so we need to make sure
     # the checksum_date is at least one second in the future
     checksum_date = datetime.now(timezone.utc) + timedelta(seconds=1)
-    dataset_to_icechunk(vds, icechunk_filestore, last_updated_at=checksum_date)
+    vds.virtualize.to_icechunk(icechunk_filestore, last_updated_at=checksum_date)
 
     # Fail if anything but None or a datetime is passed to last_updated_at
     with pytest.raises(TypeError):
-        dataset_to_icechunk(vds, icechunk_filestore, last_updated_at="not a datetime")  # type: ignore
+        vds.virtualize.to_icechunk(icechunk_filestore, last_updated_at="not a datetime")  # type: ignore
 
     root_group = zarr.group(store=icechunk_filestore)
     pres_array = root_group["pres"]
@@ -547,18 +547,18 @@ def test_append_virtual_ref_without_encoding(
         # create the icechunk store and commit the first virtual dataset
         repo = Repository.create(storage=icechunk_storage)
         session = repo.writable_session("main")
-        dataset_to_icechunk(vds, session.store)
+        vds.virtualize.to_icechunk(session.store)
         session.commit(
             "test commit"
         )  # need to commit it in order to append to it in the next lines
 
         # Append the same dataset to the same store
         icechunk_filestore_append = repo.writable_session("main")
-        dataset_to_icechunk(vds, icechunk_filestore_append.store, append_dim="x")
+        vds.virtualize.to_icechunk(icechunk_filestore_append.store, append_dim="x")
         icechunk_filestore_append.commit("appended data")
 
         icechunk_filestore_append = repo.writable_session("main")
-        dataset_to_icechunk(vds, icechunk_filestore_append.store, append_dim="x")
+        vds.virtualize.to_icechunk(icechunk_filestore_append.store, append_dim="x")
         icechunk_filestore_append.commit("appended data again")
 
         with (
@@ -608,14 +608,14 @@ def test_append_virtual_ref_with_encoding(
         # create the icechunk store and commit the first virtual dataset
         icechunk_repo = Repository.create(storage=icechunk_storage)
         icechunk_filestore = icechunk_repo.writable_session("main")
-        dataset_to_icechunk(vds1, icechunk_filestore.store)
+        vds1.virtualize.to_icechunk(icechunk_filestore.store)
         icechunk_filestore.commit(
             "test commit"
         )  # need to commit it in order to append to it in the next lines
 
         # Append the same dataset to the same store
         icechunk_filestore_append = icechunk_repo.writable_session("main")
-        dataset_to_icechunk(vds2, icechunk_filestore_append.store, append_dim="time")
+        vds2.virtualize.to_icechunk(icechunk_filestore_append.store, append_dim="time")
         icechunk_filestore_append.commit("appended data")
 
         with (
@@ -716,7 +716,7 @@ async def test_append_with_multiple_root_arrays(
         # create the icechunk store and commit the first virtual dataset
         icechunk_repo = Repository.create(storage=icechunk_storage)
         icechunk_filestore = icechunk_repo.writable_session("main")
-        dataset_to_icechunk(vds1, icechunk_filestore.store)
+        vds1.virtualize.to_icechunk(icechunk_filestore.store)
         icechunk_filestore.commit(
             "test commit"
         )  # need to commit it in order to append to it in the next lines
@@ -726,7 +726,7 @@ async def test_append_with_multiple_root_arrays(
 
         # Append the same dataset to the same store
         icechunk_filestore_append = icechunk_repo.writable_session("main")
-        dataset_to_icechunk(vds2, icechunk_filestore_append.store, append_dim="time")
+        vds2.virtualize.to_icechunk(icechunk_filestore_append.store, append_dim="time")
         icechunk_filestore_append.commit("appended data")
         assert (
             await icechunk_filestore_append.store.get(
@@ -795,12 +795,12 @@ def test_append_with_compression_succeeds(
         # Create icechunk store and commit the compressed dataset
         icechunk_repo = Repository.create(storage=icechunk_storage)
         icechunk_filestore = icechunk_repo.writable_session("main")
-        dataset_to_icechunk(vds1, icechunk_filestore.store)
+        vds1.virtualize.to_icechunk(icechunk_filestore.store)
         icechunk_filestore.commit("test commit")
 
         # Append another dataset with compatible compression
         icechunk_filestore_append = icechunk_repo.writable_session("main")
-        dataset_to_icechunk(vds2, icechunk_filestore_append.store, append_dim="time")
+        vds2.virtualize.to_icechunk(icechunk_filestore_append.store, append_dim="time")
         icechunk_filestore_append.commit("appended data")
         with (
             xr.open_zarr(
@@ -825,7 +825,7 @@ def test_append_with_different_chunking_fails(
         # Create icechunk store and commit the dataset
         icechunk_repo = Repository.create(storage=icechunk_storage)
         icechunk_filestore = icechunk_repo.writable_session("main")
-        dataset_to_icechunk(vds, icechunk_filestore.store)
+        vds.virtualize.to_icechunk(icechunk_filestore.store)
         icechunk_filestore.commit("test commit")
 
         # Try to append dataset with different chunking, expect failure
@@ -836,8 +836,8 @@ def test_append_with_different_chunking_fails(
         with pytest.raises(
             ValueError, match="Cannot concatenate arrays with inconsistent chunk shapes"
         ):
-            dataset_to_icechunk(
-                vds_different_chunking, icechunk_filestore_append.store, append_dim="x"
+            vds_different_chunking.virtualize.to_icechunk(
+                icechunk_filestore_append.store, append_dim="x"
             )
 
     ## When encoding is different it fails
@@ -857,7 +857,7 @@ def test_append_with_different_encoding_fails(
         # Create icechunk store and commit the first dataset
         icechunk_repo = Repository.create(storage=icechunk_storage)
         icechunk_filestore = icechunk_repo.writable_session("main")
-        dataset_to_icechunk(vds1, icechunk_filestore.store)
+        vds1.virtualize.to_icechunk(icechunk_filestore.store)
         icechunk_filestore.commit("test commit")
 
         # Try to append with different encoding, expect failure
@@ -866,7 +866,7 @@ def test_append_with_different_encoding_fails(
             ValueError,
             match="Cannot concatenate arrays with different values for encoding",
         ):
-            dataset_to_icechunk(vds2, icechunk_filestore_append.store, append_dim="x")
+            vds2.virtualize.to_icechunk(icechunk_filestore_append.store, append_dim="x")
 
     def test_dimensions_do_not_align(
         self, icechunk_storage: "Storage", simple_netcdf4: str
@@ -888,13 +888,13 @@ def test_dimensions_do_not_align(
         # Create icechunk store and commit the first dataset
         icechunk_repo = Repository.create(storage=icechunk_storage)
         icechunk_filestore = icechunk_repo.writable_session("main")
-        dataset_to_icechunk(vds1, icechunk_filestore.store)
+        vds1.virtualize.to_icechunk(icechunk_filestore.store)
         icechunk_filestore.commit("test commit")
 
         # Attempt to append dataset with different length in non-append dimension, expect failure
         icechunk_filestore_append = icechunk_repo.writable_session("main")
         with pytest.raises(ValueError, match="Cannot concatenate arrays with shapes"):
-            dataset_to_icechunk(vds2, icechunk_filestore_append.store, append_dim="y")
+            vds2.virtualize.to_icechunk(icechunk_filestore_append.store, append_dim="y")
 
     def test_append_dim_not_in_dims_raises_error(
         self, icechunk_storage: "Storage", simple_netcdf4: str
@@ -910,7 +910,7 @@ def test_append_dim_not_in_dims_raises_error(
 
         icechunk_repo = Repository.create(storage=icechunk_storage)
         icechunk_filestore = icechunk_repo.writable_session("main")
-        dataset_to_icechunk(vds, icechunk_filestore.store)
+        vds.virtualize.to_icechunk(icechunk_filestore.store)
         icechunk_filestore.commit("initial commit")
 
         # Attempt to append using a non-existent append_dim "z"
@@ -920,7 +920,7 @@ def test_append_dim_not_in_dims_raises_error(
             ValueError,
             match="append_dim 'z' does not match any existing dataset dimensions",
         ):
-            dataset_to_icechunk(vds, icechunk_filestore_append.store, append_dim="z")
+            vds.virtualize.to_icechunk(icechunk_filestore_append.store, append_dim="z")
 
 
 # TODO test with S3 / minio