2 changes: 2 additions & 0 deletions docs/releases.md
@@ -34,6 +34,8 @@
([#335](https://github.com/zarr-developers/VirtualiZarr/issues/335), [#477](https://github.com/zarr-developers/VirtualiZarr/pull/477)) by [Tom Nicholas](https://github.com/TomNicholas).
- Moved `ChunkManifest`, `ManifestArray` etc. to be behind a dedicated `.manifests` namespace. ([#620](https://github.com/zarr-developers/VirtualiZarr/issues/620), [#624](https://github.com/zarr-developers/VirtualiZarr/pull/624))
By [Tom Nicholas](https://github.com/TomNicholas).
- When writing virtual chunks to Icechunk, the `last_updated_time` for each chunk now defaults to the current time. This protects users against reading stale or overwritten chunks stored in Icechunk (see the sketch below).
([#436](https://github.com/zarr-developers/VirtualiZarr/issues/436), [#480](https://github.com/zarr-developers/VirtualiZarr/pull/480)) by [Tom Nicholas](https://github.com/TomNicholas).
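A minimal sketch of the new default, assuming `vds` is a virtual `xarray.Dataset` and `icechunk_store` is a writable Icechunk store; both names are placeholders:

```python
from datetime import datetime, timezone

# Default: the checksum timestamp is set to the current time at write time.
vds.virtualize.to_icechunk(icechunk_store)

# Or pin the checksum explicitly, e.g. to when the referenced files were last modified.
vds.virtualize.to_icechunk(
    icechunk_store,
    last_updated_at=datetime(2024, 1, 1, tzinfo=timezone.utc),
)
```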

### Deprecations

16 changes: 2 additions & 14 deletions virtualizarr/accessor.py
@@ -45,7 +45,7 @@ def to_icechunk(
chunks written to the store with this operation. At read time, if any of the
virtual chunks have been updated since this provided datetime, an error will be
raised. This protects against reading outdated virtual chunks that have been
updated since the last read. When not provided, no check is performed. This
updated since the last read. When not provided, the current time is used. This
value is stored in Icechunk with seconds precision, so be sure to take that into
account when providing this value.

@@ -59,24 +59,12 @@
Dimension along which to append the virtual dataset.
last_updated_at
Datetime to use as a checksum for any virtual chunks written to the store
with this operation. When not provided, no check is performed.
with this operation. When not provided, the current time is used.

Raises
------
ValueError
If the store is read-only.

Examples
--------
To ensure an error is raised if the files containing referenced virtual chunks
are modified at any time from now on, pass the current time to
``last_updated_at``.

>>> from datetime import datetime
>>> vds.virtualize.to_icechunk( # doctest: +SKIP
... icechunkstore,
... last_updated_at=datetime.now(),
... )
"""
from virtualizarr.writers.icechunk import virtual_dataset_to_icechunk

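The docstring above now documents a current-time default for `last_updated_at`; here is a hedged sketch of passing an explicit timestamp instead, for example when appending to previously written data. The names `vds2` and `icechunk_store` are placeholders, and the append keyword is assumed to be `append_dim` as described in the docstring:

```python
from datetime import datetime, timezone

# Pin the checksum to a known modification time instead of relying on the
# current-time default; reading back any chunk whose referenced file changed
# after this moment should raise an IcechunkError.
vds2.virtualize.to_icechunk(
    icechunk_store,
    append_dim="time",
    last_updated_at=datetime(2024, 6, 1, tzinfo=timezone.utc),
)
```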
30 changes: 13 additions & 17 deletions virtualizarr/tests/test_writers/test_icechunk.py
@@ -10,7 +10,9 @@
import numpy as np
import numpy.testing as npt
import xarray as xr
import xarray.testing as xrt
import zarr
from zarr.core.buffer import default_buffer_prototype
from zarr.core.metadata import ArrayV3Metadata

from virtualizarr.manifests import ChunkManifest, ManifestArray
@@ -103,7 +105,6 @@ def test_write_new_virtual_variable(
def test_set_single_virtual_ref_without_encoding(
icechunk_filestore: "IcechunkStore", simple_netcdf4: Path, array_v3_metadata
):
import xarray.testing as xrt
# TODO kerchunk doesn't work with zarr-python v3 yet so we can't use open_virtual_dataset and icechunk together!
# vds = open_virtual_dataset(netcdf4_file, indexes={})

@@ -139,6 +140,7 @@ def test_set_single_virtual_ref_without_encoding(
xr.open_dataset(simple_netcdf4) as expected_ds,
):
expected_array = expected_ds["foo"].to_numpy()

npt.assert_equal(array, expected_array)
xrt.assert_identical(ds.foo, expected_ds.foo)

@@ -149,8 +151,6 @@
def test_set_single_virtual_ref_with_encoding(
icechunk_filestore: "IcechunkStore", netcdf4_file: Path, array_v3_metadata
):
import xarray.testing as xrt

with xr.open_dataset(netcdf4_file) as ds:
# We drop the coordinates because we don't have them in the zarr test case
expected_ds = ds.drop_vars(["lon", "lat", "time"])
@@ -348,10 +348,12 @@ def test_checksum(

vds = xr.Dataset({"pressure": ma_v})

# Icechunk checksums currently store with second precision, so we need to make sure
# the checksum_date is at least one second in the future
checksum_date = datetime.now(timezone.utc) + timedelta(seconds=1)
vds.virtualize.to_icechunk(icechunk_filestore, last_updated_at=checksum_date)
# default behaviour is to create a checksum based on the current time
vds.virtualize.to_icechunk(icechunk_filestore)

# Make sure the checksum_date is at least one second in the past before trying to overwrite the referenced file with new data
# This represents someone coming back much later and overwriting archival data
time.sleep(1)

# Fail if anything but None or a datetime is passed to last_updated_at
with pytest.raises(TypeError):
@@ -372,9 +374,12 @@
arr = np.arange(12, dtype=np.dtype("int32")).reshape(3, 4) * 2
var = xr.Variable(data=arr, dims=["x", "y"])
ds = xr.Dataset({"foo": var})
time.sleep(1)  # Make sure the file is overwritten after the stored checksum_date
ds.to_netcdf(netcdf_path)

# TODO assert that icechunk knows the correct last_updated_at for this chunk
# TODO ideally use icechunk's get_chunk_ref to directly interrogate the last_updated_time
# however, this is currently only available in Rust

# Now if we try to read the data back in, it should fail because the checksum_date
# is newer than the last_updated_at
with pytest.raises(IcechunkError):
@@ -463,8 +468,6 @@ def test_append_virtual_ref_without_encoding(
simple_netcdf4: str,
virtual_dataset: Callable,
):
import xarray.testing as xrt

# generate virtual dataset
vds = virtual_dataset(file_uri=simple_netcdf4)
# Commit the first virtual dataset
@@ -499,8 +502,6 @@ def test_append_virtual_ref_with_encoding(
netcdf4_files_factory: Callable,
virtual_dataset: Callable,
):
import xarray.testing as xrt

scale_factor = 0.01
encoding = {"air": {"scale_factor": scale_factor}}
filepath1, filepath2 = netcdf4_files_factory(encoding=encoding)
@@ -562,9 +563,6 @@ async def test_append_with_multiple_root_arrays(
virtual_variable: Callable,
virtual_dataset: Callable,
):
import xarray.testing as xrt
from zarr.core.buffer import default_buffer_prototype

filepath1, filepath2 = netcdf4_files_factory(
encoding={"air": {"dtype": "float64", "chunksizes": (1460, 25, 53)}}
)
@@ -675,8 +673,6 @@ def test_append_with_compression_succeeds(
netcdf4_files_factory: Callable,
virtual_dataset: Callable,
):
import xarray.testing as xrt

encoding = {
"air": {
"zlib": True,
10 changes: 9 additions & 1 deletion virtualizarr/writers/icechunk.py
@@ -1,4 +1,4 @@
from datetime import datetime
from datetime import datetime, timedelta, timezone
from typing import TYPE_CHECKING, List, Optional, Union, cast

import numpy as np
@@ -414,6 +414,14 @@ def write_manifest_virtual_refs(
op_flags=[["readonly"]] * 3, # type: ignore
)

if last_updated_at is None:
# Icechunk rounds timestamps to the nearest second, but filesystems have higher precision,
# so we add a one-second buffer. Otherwise, if you immediately read data back from this
# icechunk store and the referenced data was created less than a second ago,
# you would get an IcechunkError warning that your referenced chunk has changed.
# In practice this should only come up in synthetic examples, e.g. tests and docs.
last_updated_at = datetime.now(timezone.utc) + timedelta(seconds=1)

virtual_chunk_spec_list = [
VirtualChunkSpec(
index=generate_chunk_key(it.multi_index, append_axis, existing_num_chunks),
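A standalone restatement of the default applied in `write_manifest_virtual_refs` above; `default_last_updated_at` is a hypothetical helper written only to illustrate the one-second buffer, not a function in the library:

```python
from datetime import datetime, timedelta, timezone


def default_last_updated_at() -> datetime:
    # Icechunk stores the checksum with second precision, while filesystems
    # record sub-second mtimes, so a file created an instant before the write
    # could otherwise look newer than the checksum. The one-second buffer keeps
    # an immediate read-back from raising a spurious IcechunkError.
    return datetime.now(timezone.utc) + timedelta(seconds=1)


if __name__ == "__main__":
    print(default_last_updated_at().isoformat())
```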