Skip to content

Commit e8ade46

Browse files
authored
Add group param to virtualize Dataset accessor (#391)
1 parent 0db5ff8 commit e8ade46

File tree

4 files changed

+95
-51
lines changed

4 files changed

+95
-51
lines changed

.gitignore

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,3 +161,22 @@ cython_debug/
161161
virtualizarr/_version.py
162162
docs/generated/
163163
examples/
164+
165+
# Created by https://www.toptal.com/developers/gitignore/api/visualstudiocode
166+
# Edit at https://www.toptal.com/developers/gitignore?templates=visualstudiocode
167+
168+
### VisualStudioCode ###
169+
.vscode
170+
171+
# Local History for Visual Studio Code
172+
.history/
173+
174+
# Built Visual Studio Code Extensions
175+
*.vsix
176+
177+
### VisualStudioCode Patch ###
178+
# Ignore all local history of files
179+
.history
180+
.ionide
181+
182+
# End of https://www.toptal.com/developers/gitignore/api/visualstudiocode

docs/releases.rst

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,10 @@ New Features
1515
for the `to_icechunk` method to add timestamps as checksums when writing virtual references to an icechunk store. This
1616
is useful for ensuring that virtual references are not stale when reading from an icechunk store, which can happen if the
1717
underlying data has changed since the virtual references were written.
18-
- Add ``group=None`` keyword-only parameter to ``dataset_to_icechunk`` function to
19-
allow writing to a nested group at the specified path (root group, if not specified).
20-
(:issue:`341`) By `Chuck Daniels <https://github.com/chuckwondo>`_.
18+
- Add ``group=None`` keyword-only parameter to the
19+
``VirtualiZarrDatasetAccessor.to_icechunk`` method to allow writing to a nested group
20+
at a specified group path (the root group is still used when no group is
21+
specified). (:issue:`341`) By `Chuck Daniels <https://github.com/chuckwondo>`_.
2122

2223
Breaking changes
2324
~~~~~~~~~~~~~~~~
@@ -29,10 +30,11 @@ Breaking changes
2930
Also a warning is no longer thrown when ``indexes=None`` is passed to ``open_virtual_dataset``, and the recommendations in the docs updated to match.
3031
This also means that ``xarray.combine_by_coords`` will now work when the necessary dimension coordinates are specified in ``loadable_variables``.
3132
(:issue:`18`, :pull:`357`, :pull:`358`) By `Tom Nicholas <https://github.com/TomNicholas>`_.
32-
- For function ``dataset_to_icechunk``, parameters ``append_dim`` and ``last_updated_at``
33-
are now keyword-only parameters, rather than positional or keyword. This change is
34-
breaking _only_ where arguments for these parameters are currently given positionally.
35-
(:issue:`341`) By `Chuck Daniels <https://github.com/chuckwondo>`_.
33+
- The ``append_dim`` and ``last_updated_at`` parameters of the
34+
``VirtualiZarrDatasetAccessor.to_icechunk`` method are now keyword-only parameters,
35+
rather than positional or keyword. This change is breaking *only* where arguments for
36+
these parameters are currently given positionally. (:issue:`341`) By
37+
`Chuck Daniels <https://github.com/chuckwondo>`_.
3638

3739
Deprecations
3840
~~~~~~~~~~~~

virtualizarr/accessor.py

Lines changed: 39 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from datetime import datetime
22
from pathlib import Path
3-
from typing import TYPE_CHECKING, Callable, Literal, Optional, overload
3+
from typing import TYPE_CHECKING, Callable, Literal, overload
44

55
from xarray import Dataset, register_dataset_accessor
66

@@ -42,44 +42,67 @@ def to_zarr(self, storepath: str) -> None:
4242
def to_icechunk(
4343
self,
4444
store: "IcechunkStore",
45-
append_dim: Optional[str] = None,
46-
last_updated_at: Optional[datetime] = None,
45+
*,
46+
group: str | None = None,
47+
append_dim: str | None = None,
48+
last_updated_at: datetime | None = None,
4749
) -> None:
4850
"""
4951
Write an xarray dataset to an Icechunk store.
5052
51-
Any variables backed by ManifestArray objects will be be written as virtual references, any other variables will be loaded into memory before their binary chunk data is written into the store.
53+
Any variables backed by ManifestArray objects will be written as virtual
54+
references. Any other variables will be loaded into memory before their binary
55+
chunk data is written into the store.
5256
53-
If `append_dim` is provided, the virtual dataset will be appended to the existing IcechunkStore along the `append_dim` dimension.
57+
If `append_dim` is provided, the virtual dataset will be appended to the
58+
existing IcechunkStore along the `append_dim` dimension.
5459
55-
If `last_updated_at` is provided, it will be used as a checksum for any virtual chunks written to the store with this operation.
56-
At read time, if any of the virtual chunks have been updated since this provided datetime, an error will be raised.
57-
This protects against reading outdated virtual chunks that have been updated since the last read. When not provided, no check is performed.
58-
This value is stored in Icechunk with seconds precision, so be sure to take that into account when providing this value.
60+
If `last_updated_at` is provided, it will be used as a checksum for any virtual
61+
chunks written to the store with this operation. At read time, if any of the
62+
virtual chunks have been updated since this provided datetime, an error will be
63+
raised. This protects against reading outdated virtual chunks that have been
64+
updated since the last read. When not provided, no check is performed. This
65+
value is stored in Icechunk with seconds precision, so be sure to take that into
66+
account when providing this value.
5967
6068
Parameters
6169
----------
6270
store: IcechunkStore
71+
Store to write dataset into.
72+
group: str, optional
73+
Path of the group to write the dataset into (default: the root group).
6374
append_dim: str, optional
64-
When provided, specifies the dimension along which to append the virtual dataset.
75+
Dimension along which to append the virtual dataset.
6576
last_updated_at: datetime, optional
66-
When provided, uses provided datetime as a checksum for any virtual chunks written to the store with this operation.
67-
When not provided (default), no check is performed.
77+
Datetime to use as a checksum for any virtual chunks written to the store
78+
with this operation. When not provided, no check is performed.
79+
80+
Raises
81+
------
82+
ValueError
83+
If the store is read-only.
6884
6985
Examples
7086
--------
71-
To ensure an error is raised if the files containing referenced virtual chunks are modified at any time from now on, pass the current time to ``last_updated_at``.
87+
To ensure an error is raised if the files containing referenced virtual chunks
88+
are modified at any time from now on, pass the current time to
89+
``last_updated_at``.
7290
7391
>>> from datetime import datetime
74-
>>>
75-
>>> vds.virtualize.to_icechunk(
92+
>>> vds.virtualize.to_icechunk( # doctest: +SKIP
7693
... icechunkstore,
7794
... last_updated_at=datetime.now(),
7895
... )
7996
"""
8097
from virtualizarr.writers.icechunk import dataset_to_icechunk
8198

82-
dataset_to_icechunk(self.ds, store, append_dim=append_dim)
99+
dataset_to_icechunk(
100+
self.ds,
101+
store,
102+
group=group,
103+
append_dim=append_dim,
104+
last_updated_at=last_updated_at,
105+
)
83106

84107
@overload
85108
def to_kerchunk(

virtualizarr/tests/test_writers/test_icechunk.py

Lines changed: 28 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616

1717
from virtualizarr.manifests import ChunkManifest, ManifestArray
1818
from virtualizarr.readers.common import separate_coords
19-
from virtualizarr.writers.icechunk import dataset_to_icechunk, generate_chunk_key
19+
from virtualizarr.writers.icechunk import generate_chunk_key
2020
from virtualizarr.zarr import ZArray
2121

2222
if TYPE_CHECKING:
@@ -47,8 +47,8 @@ def test_invalid_kwarg_type(
4747
):
4848
name, value = kwarg
4949
with pytest.raises(TypeError, match=name):
50-
dataset_to_icechunk(
51-
vds_with_manifest_arrays, icechunk_filestore, **{name: value}
50+
vds_with_manifest_arrays.virtualize.to_icechunk(
51+
icechunk_filestore, **{name: value}
5252
)
5353

5454

@@ -60,7 +60,7 @@ def test_write_new_virtual_variable(
6060
):
6161
vds = vds_with_manifest_arrays
6262

63-
dataset_to_icechunk(vds, icechunk_filestore, group=group_path)
63+
vds.virtualize.to_icechunk(icechunk_filestore, group=group_path)
6464

6565
# check attrs
6666
group = zarr.group(store=icechunk_filestore, path=group_path)
@@ -121,7 +121,7 @@ def test_set_single_virtual_ref_without_encoding(
121121
{"foo": foo},
122122
)
123123

124-
dataset_to_icechunk(vds, icechunk_filestore)
124+
vds.virtualize.to_icechunk(icechunk_filestore)
125125

126126
root_group = zarr.group(store=icechunk_filestore)
127127
array = root_group["foo"]
@@ -175,7 +175,7 @@ def test_set_single_virtual_ref_with_encoding(
175175
)
176176
vds = xr.Dataset({"air": air}, attrs=expected_ds.attrs)
177177

178-
dataset_to_icechunk(vds, icechunk_filestore)
178+
vds.virtualize.to_icechunk(icechunk_filestore)
179179

180180
root_group = zarr.group(store=icechunk_filestore)
181181
air_array = root_group["air"]
@@ -239,7 +239,7 @@ def test_set_grid_virtual_refs(icechunk_filestore: "IcechunkStore", netcdf4_file
239239
{"air": air},
240240
)
241241

242-
dataset_to_icechunk(vds, icechunk_filestore)
242+
vds.virtualize.to_icechunk(icechunk_filestore)
243243

244244
root_group = zarr.group(store=icechunk_filestore)
245245
air_array = root_group["air"]
@@ -298,7 +298,7 @@ def test_write_loadable_variable(
298298
# Icechunk checksums currently store with second precision, so we need to make sure
299299
# the checksum_date is at least one second in the future
300300
checksum_date = datetime.now(timezone.utc) + timedelta(seconds=1)
301-
dataset_to_icechunk(vds, icechunk_filestore, last_updated_at=checksum_date)
301+
vds.virtualize.to_icechunk(icechunk_filestore, last_updated_at=checksum_date)
302302

303303
root_group = zarr.group(store=icechunk_filestore)
304304
air_array = root_group["air"]
@@ -354,11 +354,11 @@ def test_checksum(
354354
# Icechunk checksums currently store with second precision, so we need to make sure
355355
# the checksum_date is at least one second in the future
356356
checksum_date = datetime.now(timezone.utc) + timedelta(seconds=1)
357-
dataset_to_icechunk(vds, icechunk_filestore, last_updated_at=checksum_date)
357+
vds.virtualize.to_icechunk(icechunk_filestore, last_updated_at=checksum_date)
358358

359359
# Fail if anything but None or a datetime is passed to last_updated_at
360360
with pytest.raises(TypeError):
361-
dataset_to_icechunk(vds, icechunk_filestore, last_updated_at="not a datetime") # type: ignore
361+
vds.virtualize.to_icechunk(icechunk_filestore, last_updated_at="not a datetime") # type: ignore
362362

363363
root_group = zarr.group(store=icechunk_filestore)
364364
pres_array = root_group["pres"]
@@ -547,18 +547,18 @@ def test_append_virtual_ref_without_encoding(
547547
# create the icechunk store and commit the first virtual dataset
548548
repo = Repository.create(storage=icechunk_storage)
549549
session = repo.writable_session("main")
550-
dataset_to_icechunk(vds, session.store)
550+
vds.virtualize.to_icechunk(session.store)
551551
session.commit(
552552
"test commit"
553553
) # need to commit it in order to append to it in the next lines
554554

555555
# Append the same dataset to the same store
556556
icechunk_filestore_append = repo.writable_session("main")
557-
dataset_to_icechunk(vds, icechunk_filestore_append.store, append_dim="x")
557+
vds.virtualize.to_icechunk(icechunk_filestore_append.store, append_dim="x")
558558
icechunk_filestore_append.commit("appended data")
559559

560560
icechunk_filestore_append = repo.writable_session("main")
561-
dataset_to_icechunk(vds, icechunk_filestore_append.store, append_dim="x")
561+
vds.virtualize.to_icechunk(icechunk_filestore_append.store, append_dim="x")
562562
icechunk_filestore_append.commit("appended data again")
563563

564564
with (
@@ -608,14 +608,14 @@ def test_append_virtual_ref_with_encoding(
608608
# create the icechunk store and commit the first virtual dataset
609609
icechunk_repo = Repository.create(storage=icechunk_storage)
610610
icechunk_filestore = icechunk_repo.writable_session("main")
611-
dataset_to_icechunk(vds1, icechunk_filestore.store)
611+
vds1.virtualize.to_icechunk(icechunk_filestore.store)
612612
icechunk_filestore.commit(
613613
"test commit"
614614
) # need to commit it in order to append to it in the next lines
615615

616616
# Append the same dataset to the same store
617617
icechunk_filestore_append = icechunk_repo.writable_session("main")
618-
dataset_to_icechunk(vds2, icechunk_filestore_append.store, append_dim="time")
618+
vds2.virtualize.to_icechunk(icechunk_filestore_append.store, append_dim="time")
619619
icechunk_filestore_append.commit("appended data")
620620

621621
with (
@@ -716,7 +716,7 @@ async def test_append_with_multiple_root_arrays(
716716
# create the icechunk store and commit the first virtual dataset
717717
icechunk_repo = Repository.create(storage=icechunk_storage)
718718
icechunk_filestore = icechunk_repo.writable_session("main")
719-
dataset_to_icechunk(vds1, icechunk_filestore.store)
719+
vds1.virtualize.to_icechunk(icechunk_filestore.store)
720720
icechunk_filestore.commit(
721721
"test commit"
722722
) # need to commit it in order to append to it in the next lines
@@ -726,7 +726,7 @@ async def test_append_with_multiple_root_arrays(
726726

727727
# Append the same dataset to the same store
728728
icechunk_filestore_append = icechunk_repo.writable_session("main")
729-
dataset_to_icechunk(vds2, icechunk_filestore_append.store, append_dim="time")
729+
vds2.virtualize.to_icechunk(icechunk_filestore_append.store, append_dim="time")
730730
icechunk_filestore_append.commit("appended data")
731731
assert (
732732
await icechunk_filestore_append.store.get(
@@ -795,12 +795,12 @@ def test_append_with_compression_succeeds(
795795
# Create icechunk store and commit the compressed dataset
796796
icechunk_repo = Repository.create(storage=icechunk_storage)
797797
icechunk_filestore = icechunk_repo.writable_session("main")
798-
dataset_to_icechunk(vds1, icechunk_filestore.store)
798+
vds1.virtualize.to_icechunk(icechunk_filestore.store)
799799
icechunk_filestore.commit("test commit")
800800

801801
# Append another dataset with compatible compression
802802
icechunk_filestore_append = icechunk_repo.writable_session("main")
803-
dataset_to_icechunk(vds2, icechunk_filestore_append.store, append_dim="time")
803+
vds2.virtualize.to_icechunk(icechunk_filestore_append.store, append_dim="time")
804804
icechunk_filestore_append.commit("appended data")
805805
with (
806806
xr.open_zarr(
@@ -825,7 +825,7 @@ def test_append_with_different_chunking_fails(
825825
# Create icechunk store and commit the dataset
826826
icechunk_repo = Repository.create(storage=icechunk_storage)
827827
icechunk_filestore = icechunk_repo.writable_session("main")
828-
dataset_to_icechunk(vds, icechunk_filestore.store)
828+
vds.virtualize.to_icechunk(icechunk_filestore.store)
829829
icechunk_filestore.commit("test commit")
830830

831831
# Try to append dataset with different chunking, expect failure
@@ -836,8 +836,8 @@ def test_append_with_different_chunking_fails(
836836
with pytest.raises(
837837
ValueError, match="Cannot concatenate arrays with inconsistent chunk shapes"
838838
):
839-
dataset_to_icechunk(
840-
vds_different_chunking, icechunk_filestore_append.store, append_dim="x"
839+
vds_different_chunking.virtualize.to_icechunk(
840+
icechunk_filestore_append.store, append_dim="x"
841841
)
842842

843843
## When encoding is different it fails
@@ -857,7 +857,7 @@ def test_append_with_different_encoding_fails(
857857
# Create icechunk store and commit the first dataset
858858
icechunk_repo = Repository.create(storage=icechunk_storage)
859859
icechunk_filestore = icechunk_repo.writable_session("main")
860-
dataset_to_icechunk(vds1, icechunk_filestore.store)
860+
vds1.virtualize.to_icechunk(icechunk_filestore.store)
861861
icechunk_filestore.commit("test commit")
862862

863863
# Try to append with different encoding, expect failure
@@ -866,7 +866,7 @@ def test_append_with_different_encoding_fails(
866866
ValueError,
867867
match="Cannot concatenate arrays with different values for encoding",
868868
):
869-
dataset_to_icechunk(vds2, icechunk_filestore_append.store, append_dim="x")
869+
vds2.virtualize.to_icechunk(icechunk_filestore_append.store, append_dim="x")
870870

871871
def test_dimensions_do_not_align(
872872
self, icechunk_storage: "Storage", simple_netcdf4: str
@@ -888,13 +888,13 @@ def test_dimensions_do_not_align(
888888
# Create icechunk store and commit the first dataset
889889
icechunk_repo = Repository.create(storage=icechunk_storage)
890890
icechunk_filestore = icechunk_repo.writable_session("main")
891-
dataset_to_icechunk(vds1, icechunk_filestore.store)
891+
vds1.virtualize.to_icechunk(icechunk_filestore.store)
892892
icechunk_filestore.commit("test commit")
893893

894894
# Attempt to append dataset with different length in non-append dimension, expect failure
895895
icechunk_filestore_append = icechunk_repo.writable_session("main")
896896
with pytest.raises(ValueError, match="Cannot concatenate arrays with shapes"):
897-
dataset_to_icechunk(vds2, icechunk_filestore_append.store, append_dim="y")
897+
vds2.virtualize.to_icechunk(icechunk_filestore_append.store, append_dim="y")
898898

899899
def test_append_dim_not_in_dims_raises_error(
900900
self, icechunk_storage: "Storage", simple_netcdf4: str
@@ -910,7 +910,7 @@ def test_append_dim_not_in_dims_raises_error(
910910

911911
icechunk_repo = Repository.create(storage=icechunk_storage)
912912
icechunk_filestore = icechunk_repo.writable_session("main")
913-
dataset_to_icechunk(vds, icechunk_filestore.store)
913+
vds.virtualize.to_icechunk(icechunk_filestore.store)
914914
icechunk_filestore.commit("initial commit")
915915

916916
# Attempt to append using a non-existent append_dim "z"
@@ -920,7 +920,7 @@ def test_append_dim_not_in_dims_raises_error(
920920
ValueError,
921921
match="append_dim 'z' does not match any existing dataset dimensions",
922922
):
923-
dataset_to_icechunk(vds, icechunk_filestore_append.store, append_dim="z")
923+
vds.virtualize.to_icechunk(icechunk_filestore_append.store, append_dim="z")
924924

925925

926926
# TODO test with S3 / minio

0 commit comments

Comments
 (0)