diff --git a/docs/releases.md b/docs/releases.md index e2043436..79bc56d7 100644 --- a/docs/releases.md +++ b/docs/releases.md @@ -34,6 +34,8 @@ ([#335](https://github.com/zarr-developers/VirtualiZarr/issues/335), [#477](https://github.com/zarr-developers/VirtualiZarr/pull/477)) by [Tom Nicholas](https://github.com/TomNicholas). - Moved `ChunkManifest`, `ManifestArray` etc. to be behind a dedicated `.manifests` namespace. ([#620](https://github.com/zarr-developers/VirtualiZarr/issues/620), [#624](https://github.com/zarr-developers/VirtualiZarr/pull/624)) By [Tom Nicholas](https://github.com/TomNicholas). +- Now by default when writing virtual chunks to Icechunk, the `last_updated_time` for the chunk will be set to the current time. This helps protect users against reading from stale or overwritten chunks stored in Icechunk, by default. + ([#436](https://github.com/zarr-developers/VirtualiZarr/issues/436), [#480](https://github.com/zarr-developers/VirtualiZarr/pull/480)) by [Tom Nicholas](https://github.com/TomNicholas). ### Deprecations diff --git a/virtualizarr/accessor.py b/virtualizarr/accessor.py index dceb6c6e..c48f35a3 100644 --- a/virtualizarr/accessor.py +++ b/virtualizarr/accessor.py @@ -45,7 +45,7 @@ def to_icechunk( chunks written to the store with this operation. At read time, if any of the virtual chunks have been updated since this provided datetime, an error will be raised. This protects against reading outdated virtual chunks that have been - updated since the last read. When not provided, no check is performed. This + updated since the last read. When not provided, the current time is used. This value is stored in Icechunk with seconds precision, so be sure to take that into account when providing this value. @@ -59,24 +59,12 @@ def to_icechunk( Dimension along which to append the virtual dataset. last_updated_at Datetime to use as a checksum for any virtual chunks written to the store - with this operation. When not provided, no check is performed. + with this operation. When not provided, the current time is used. Raises ------ ValueError If the store is read-only. - - Examples - -------- - To ensure an error is raised if the files containing referenced virtual chunks - are modified at any time from now on, pass the current time to - ``last_updated_at``. - - >>> from datetime import datetime - >>> vds.virtualize.to_icechunk( # doctest: +SKIP - ... icechunkstore, - ... last_updated_at=datetime.now(), - ... ) """ from virtualizarr.writers.icechunk import virtual_dataset_to_icechunk diff --git a/virtualizarr/tests/test_writers/test_icechunk.py b/virtualizarr/tests/test_writers/test_icechunk.py index b7ad1b60..a4dc2377 100644 --- a/virtualizarr/tests/test_writers/test_icechunk.py +++ b/virtualizarr/tests/test_writers/test_icechunk.py @@ -10,7 +10,9 @@ import numpy as np import numpy.testing as npt import xarray as xr +import xarray.testing as xrt import zarr +from zarr.core.buffer import default_buffer_prototype from zarr.core.metadata import ArrayV3Metadata from virtualizarr.manifests import ChunkManifest, ManifestArray @@ -103,7 +105,6 @@ def test_write_new_virtual_variable( def test_set_single_virtual_ref_without_encoding( icechunk_filestore: "IcechunkStore", simple_netcdf4: Path, array_v3_metadata ): - import xarray.testing as xrt # TODO kerchunk doesn't work with zarr-python v3 yet so we can't use open_virtual_dataset and icechunk together! # vds = open_virtual_dataset(netcdf4_file, indexes={}) @@ -139,6 +140,7 @@ def test_set_single_virtual_ref_without_encoding( xr.open_dataset(simple_netcdf4) as expected_ds, ): expected_array = expected_ds["foo"].to_numpy() + npt.assert_equal(array, expected_array) xrt.assert_identical(ds.foo, expected_ds.foo) @@ -149,8 +151,6 @@ def test_set_single_virtual_ref_without_encoding( def test_set_single_virtual_ref_with_encoding( icechunk_filestore: "IcechunkStore", netcdf4_file: Path, array_v3_metadata ): - import xarray.testing as xrt - with xr.open_dataset(netcdf4_file) as ds: # We drop the coordinates because we don't have them in the zarr test case expected_ds = ds.drop_vars(["lon", "lat", "time"]) @@ -348,10 +348,12 @@ def test_checksum( vds = xr.Dataset({"pressure": ma_v}) - # Icechunk checksums currently store with second precision, so we need to make sure - # the checksum_date is at least one second in the future - checksum_date = datetime.now(timezone.utc) + timedelta(seconds=1) - vds.virtualize.to_icechunk(icechunk_filestore, last_updated_at=checksum_date) + # default behaviour is to create a checksum based on the current time + vds.virtualize.to_icechunk(icechunk_filestore) + + # Make sure the checksum_date is at least one second in the past before trying to overwrite referenced file with new data + # This represents someone coming back much later and overwriting archival data + time.sleep(1) # Fail if anything but None or a datetime is passed to last_updated_at with pytest.raises(TypeError): @@ -372,9 +374,12 @@ def test_checksum( arr = np.arange(12, dtype=np.dtype("int32")).reshape(3, 4) * 2 var = xr.Variable(data=arr, dims=["x", "y"]) ds = xr.Dataset({"foo": var}) - time.sleep(1) # Make sure the checksum_date is at least one second in the future ds.to_netcdf(netcdf_path) + # TODO assert that icechunk knows the correct last_updated_at for this chunk + # TODO ideally use icechunk's get_chunk_ref to directly interrogate the last_updated_time + # however this is currently only available in rust + # Now if we try to read the data back in, it should fail because the checksum_date # is newer than the last_updated_at with pytest.raises(IcechunkError): @@ -463,8 +468,6 @@ def test_append_virtual_ref_without_encoding( simple_netcdf4: str, virtual_dataset: Callable, ): - import xarray.testing as xrt - # generate virtual dataset vds = virtual_dataset(file_uri=simple_netcdf4) # Commit the first virtual dataset @@ -499,8 +502,6 @@ def test_append_virtual_ref_with_encoding( netcdf4_files_factory: Callable, virtual_dataset: Callable, ): - import xarray.testing as xrt - scale_factor = 0.01 encoding = {"air": {"scale_factor": scale_factor}} filepath1, filepath2 = netcdf4_files_factory(encoding=encoding) @@ -562,9 +563,6 @@ async def test_append_with_multiple_root_arrays( virtual_variable: Callable, virtual_dataset: Callable, ): - import xarray.testing as xrt - from zarr.core.buffer import default_buffer_prototype - filepath1, filepath2 = netcdf4_files_factory( encoding={"air": {"dtype": "float64", "chunksizes": (1460, 25, 53)}} ) @@ -675,8 +673,6 @@ def test_append_with_compression_succeeds( netcdf4_files_factory: Callable, virtual_dataset: Callable, ): - import xarray.testing as xrt - encoding = { "air": { "zlib": True, diff --git a/virtualizarr/writers/icechunk.py b/virtualizarr/writers/icechunk.py index cbce92ab..85f85162 100644 --- a/virtualizarr/writers/icechunk.py +++ b/virtualizarr/writers/icechunk.py @@ -1,4 +1,4 @@ -from datetime import datetime +from datetime import datetime, timedelta, timezone from typing import TYPE_CHECKING, List, Optional, Union, cast import numpy as np @@ -414,6 +414,14 @@ def write_manifest_virtual_refs( op_flags=[["readonly"]] * 3, # type: ignore ) + if last_updated_at is None: + # Icechunk rounds timestamps to the nearest second, but filesystems have higher precision, + # so we need to add a buffer, so that if you immediately read data back from this icechunk store, + # and the referenced data was literally just created (<1s ago), + # you don't get an IcechunkError warning you that your referenced chunk has changed. + # In practice this should only really come up in synthetic examples, e.g. tests and docs. + last_updated_at = datetime.now(timezone.utc) + timedelta(seconds=1) + virtual_chunk_spec_list = [ VirtualChunkSpec( index=generate_chunk_key(it.multi_index, append_axis, existing_num_chunks),