Skip to content

Commit afb1336

Browse files
Make virtual chunk staleness check opt out (#480)
* change virtual chunk staleness check to be opt-out * change test to match * ds->vds * ensure timestamp is one second in the future * move sleep(1) to test * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Clarify supported and non-supported filetypes in FAQ (#577) * fix tests and behaviour to account for limited precision of icechunk's timestamps * remove evidence of bad merge from main * release note * correct bad merge from main * Re-trigger CI --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 3c97895 commit afb1336

File tree

4 files changed

+26
-32
lines changed

4 files changed

+26
-32
lines changed

docs/releases.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@
3434
([#335](https://github.com/zarr-developers/VirtualiZarr/issues/335), [#477](https://github.com/zarr-developers/VirtualiZarr/pull/477)) by [Tom Nicholas](https://github.com/TomNicholas).
3535
- Moved `ChunkManifest`, `ManifestArray` etc. to be behind a dedicated `.manifests` namespace. ([#620](https://github.com/zarr-developers/VirtualiZarr/issues/620), [#624](https://github.com/zarr-developers/VirtualiZarr/pull/624))
3636
By [Tom Nicholas](https://github.com/TomNicholas).
37+
- Now by default when writing virtual chunks to Icechunk, the `last_updated_time` for the chunk will be set to the current time. This helps protect users against reading from stale or overwritten chunks stored in Icechunk, by default.
38+
([#436](https://github.com/zarr-developers/VirtualiZarr/issues/436), [#480](https://github.com/zarr-developers/VirtualiZarr/pull/480)) by [Tom Nicholas](https://github.com/TomNicholas).
3739

3840
### Deprecations
3941

virtualizarr/accessor.py

Lines changed: 2 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ def to_icechunk(
4545
chunks written to the store with this operation. At read time, if any of the
4646
virtual chunks have been updated since this provided datetime, an error will be
4747
raised. This protects against reading outdated virtual chunks that have been
48-
updated since the last read. When not provided, no check is performed. This
48+
updated since the last read. When not provided, the current time is used. This
4949
value is stored in Icechunk with seconds precision, so be sure to take that into
5050
account when providing this value.
5151
@@ -59,24 +59,12 @@ def to_icechunk(
5959
Dimension along which to append the virtual dataset.
6060
last_updated_at
6161
Datetime to use as a checksum for any virtual chunks written to the store
62-
with this operation. When not provided, no check is performed.
62+
with this operation. When not provided, the current time is used.
6363
6464
Raises
6565
------
6666
ValueError
6767
If the store is read-only.
68-
69-
Examples
70-
--------
71-
To ensure an error is raised if the files containing referenced virtual chunks
72-
are modified at any time from now on, pass the current time to
73-
``last_updated_at``.
74-
75-
>>> from datetime import datetime
76-
>>> vds.virtualize.to_icechunk( # doctest: +SKIP
77-
... icechunkstore,
78-
... last_updated_at=datetime.now(),
79-
... )
8068
"""
8169
from virtualizarr.writers.icechunk import virtual_dataset_to_icechunk
8270

virtualizarr/tests/test_writers/test_icechunk.py

Lines changed: 13 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,9 @@
1010
import numpy as np
1111
import numpy.testing as npt
1212
import xarray as xr
13+
import xarray.testing as xrt
1314
import zarr
15+
from zarr.core.buffer import default_buffer_prototype
1416
from zarr.core.metadata import ArrayV3Metadata
1517

1618
from virtualizarr.manifests import ChunkManifest, ManifestArray
@@ -103,7 +105,6 @@ def test_write_new_virtual_variable(
103105
def test_set_single_virtual_ref_without_encoding(
104106
icechunk_filestore: "IcechunkStore", simple_netcdf4: Path, array_v3_metadata
105107
):
106-
import xarray.testing as xrt
107108
# TODO kerchunk doesn't work with zarr-python v3 yet so we can't use open_virtual_dataset and icechunk together!
108109
# vds = open_virtual_dataset(netcdf4_file, indexes={})
109110

@@ -139,6 +140,7 @@ def test_set_single_virtual_ref_without_encoding(
139140
xr.open_dataset(simple_netcdf4) as expected_ds,
140141
):
141142
expected_array = expected_ds["foo"].to_numpy()
143+
142144
npt.assert_equal(array, expected_array)
143145
xrt.assert_identical(ds.foo, expected_ds.foo)
144146

@@ -149,8 +151,6 @@ def test_set_single_virtual_ref_without_encoding(
149151
def test_set_single_virtual_ref_with_encoding(
150152
icechunk_filestore: "IcechunkStore", netcdf4_file: Path, array_v3_metadata
151153
):
152-
import xarray.testing as xrt
153-
154154
with xr.open_dataset(netcdf4_file) as ds:
155155
# We drop the coordinates because we don't have them in the zarr test case
156156
expected_ds = ds.drop_vars(["lon", "lat", "time"])
@@ -348,10 +348,12 @@ def test_checksum(
348348

349349
vds = xr.Dataset({"pressure": ma_v})
350350

351-
# Icechunk checksums currently store with second precision, so we need to make sure
352-
# the checksum_date is at least one second in the future
353-
checksum_date = datetime.now(timezone.utc) + timedelta(seconds=1)
354-
vds.virtualize.to_icechunk(icechunk_filestore, last_updated_at=checksum_date)
351+
# default behaviour is to create a checksum based on the current time
352+
vds.virtualize.to_icechunk(icechunk_filestore)
353+
354+
# Make sure the checksum_date is at least one second in the past before trying to overwrite referenced file with new data
355+
# This represents someone coming back much later and overwriting archival data
356+
time.sleep(1)
355357

356358
# Fail if anything but None or a datetime is passed to last_updated_at
357359
with pytest.raises(TypeError):
@@ -372,9 +374,12 @@ def test_checksum(
372374
arr = np.arange(12, dtype=np.dtype("int32")).reshape(3, 4) * 2
373375
var = xr.Variable(data=arr, dims=["x", "y"])
374376
ds = xr.Dataset({"foo": var})
375-
time.sleep(1) # Make sure the checksum_date is at least one second in the future
376377
ds.to_netcdf(netcdf_path)
377378

379+
# TODO assert that icechunk knows the correct last_updated_at for this chunk
380+
# TODO ideally use icechunk's get_chunk_ref to directly interrogate the last_updated_time
381+
# however this is currently only available in rust
382+
378383
# Now if we try to read the data back in, it should fail because the checksum_date
379384
# is newer than the last_updated_at
380385
with pytest.raises(IcechunkError):
@@ -463,8 +468,6 @@ def test_append_virtual_ref_without_encoding(
463468
simple_netcdf4: str,
464469
virtual_dataset: Callable,
465470
):
466-
import xarray.testing as xrt
467-
468471
# generate virtual dataset
469472
vds = virtual_dataset(file_uri=simple_netcdf4)
470473
# Commit the first virtual dataset
@@ -499,8 +502,6 @@ def test_append_virtual_ref_with_encoding(
499502
netcdf4_files_factory: Callable,
500503
virtual_dataset: Callable,
501504
):
502-
import xarray.testing as xrt
503-
504505
scale_factor = 0.01
505506
encoding = {"air": {"scale_factor": scale_factor}}
506507
filepath1, filepath2 = netcdf4_files_factory(encoding=encoding)
@@ -562,9 +563,6 @@ async def test_append_with_multiple_root_arrays(
562563
virtual_variable: Callable,
563564
virtual_dataset: Callable,
564565
):
565-
import xarray.testing as xrt
566-
from zarr.core.buffer import default_buffer_prototype
567-
568566
filepath1, filepath2 = netcdf4_files_factory(
569567
encoding={"air": {"dtype": "float64", "chunksizes": (1460, 25, 53)}}
570568
)
@@ -675,8 +673,6 @@ def test_append_with_compression_succeeds(
675673
netcdf4_files_factory: Callable,
676674
virtual_dataset: Callable,
677675
):
678-
import xarray.testing as xrt
679-
680676
encoding = {
681677
"air": {
682678
"zlib": True,

virtualizarr/writers/icechunk.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from datetime import datetime
1+
from datetime import datetime, timedelta, timezone
22
from typing import TYPE_CHECKING, List, Optional, Union, cast
33

44
import numpy as np
@@ -414,6 +414,14 @@ def write_manifest_virtual_refs(
414414
op_flags=[["readonly"]] * 3, # type: ignore
415415
)
416416

417+
if last_updated_at is None:
418+
# Icechunk rounds timestamps to the nearest second, but filesystems have higher precision,
419+
# so we need to add a buffer, so that if you immediately read data back from this icechunk store,
420+
# and the referenced data was literally just created (<1s ago),
421+
# you don't get an IcechunkError warning you that your referenced chunk has changed.
422+
# In practice this should only really come up in synthetic examples, e.g. tests and docs.
423+
last_updated_at = datetime.now(timezone.utc) + timedelta(seconds=1)
424+
417425
virtual_chunk_spec_list = [
418426
VirtualChunkSpec(
419427
index=generate_chunk_key(it.multi_index, append_axis, existing_num_chunks),

0 commit comments

Comments
 (0)