Skip to content

Commit 2b32174

Browse files
Merge branch 'main' into ab/fix-fill-value-dmrpp
2 parents 01a7010 + bd010c4 commit 2b32174

File tree

14 files changed

+278
-121
lines changed

14 files changed

+278
-121
lines changed

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ repos:
1111

1212
- repo: https://github.com/astral-sh/ruff-pre-commit
1313
# Ruff version.
14-
rev: "v0.8.1"
14+
rev: "v0.8.6"
1515
hooks:
1616
# Run the linter.
1717
- id: ruff

ci/upstream.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,6 @@ dependencies:
2828
- fsspec
2929
- pip
3030
- pip:
31-
- icechunk>=0.1.0a7 # Installs zarr v3 as dependency
31+
- icechunk>=0.1.0a8 # Installs zarr v3 as dependency
3232
# - git+https://github.com/fsspec/kerchunk@main # kerchunk is currently incompatible with zarr-python v3 (https://github.com/fsspec/kerchunk/pull/516)
3333
- imagecodecs-numcodecs==2024.6.1

docs/releases.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,10 @@ New Features
1111

1212
- Added a ``.nbytes`` accessor method which displays the bytes needed to hold the virtual references in memory.
1313
(:issue:`167`, :pull:`227`) By `Tom Nicholas <https://github.com/TomNicholas>`_.
14+
- Sync with Icechunk v0.1.0a8 (:pull:`368`) By `Matthew Iannucci <https://github.com/mpiannucci>`_. This also adds support
15+
for the `to_icechunk` method to add timestamps as checksums when writing virtual references to an icechunk store. This
16+
is useful for ensuring that virtual references are not stale when reading from an icechunk store, which can happen if the
17+
underlying data has changed since the virtual references were written.
1418

1519
Breaking changes
1620
~~~~~~~~~~~~~~~~
@@ -33,6 +37,8 @@ Bug fixes
3337
(:issue:`336`, :pull:`338`) By `Tom Nicholas <https://github.com/TomNicholas>`_.
3438
- Fix bug in dmrpp reader so _FillValue is passed to variables' encodings.
3539
(:pull:`369`) By `Aimee Barciauskas <https://github.com/abarciauskas-bgse>`_.
40+
- Fix bug passing arguments to FITS reader, and test it on Hubble Space Telescope data.
41+
(:pull:`363`) By `Tom Nicholas <https://github.com/TomNicholas>`_.
3642

3743
Documentation
3844
~~~~~~~~~~~~~

docs/usage.md

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -421,14 +421,16 @@ By default references are placed in separate parquet file when the total number
421421
We can also write these references out as an [IcechunkStore](https://icechunk.io/). `Icechunk` is an open-source, cloud-native transactional tensor storage engine that is compatible with zarr version 3. To export our virtual dataset to an `Icechunk` Store, we simply use the {py:meth}`vds.virtualize.to_icechunk <virtualizarr.VirtualiZarrDatasetAccessor.to_icechunk>` accessor method.
422422

423423
```python
424-
# create an icechunk store
425-
from icechunk import IcechunkStore, StorageConfig, StoreConfig, VirtualRefConfig
426-
storage = StorageConfig.filesystem(str('combined'))
427-
store = IcechunkStore.create(storage=storage, mode="w", config=StoreConfig(
428-
virtual_ref_config=VirtualRefConfig.s3_anonymous(region='us-east-1'),
429-
))
430-
431-
combined_vds.virtualize.to_icechunk(store)
424+
# create an icechunk repository, session and write the virtual dataset to the session
425+
from icechunk import Repository, Storage, VirtualChunkContainer, local_filesystem_storage
426+
storage = local_filesystem_storage(str('combined'))
427+
428+
# By default, local virtual references and public remote virtual references can be read without extra configuration.
429+
repo = Repository.create(storage=storage)
430+
session = repo.writeable_session("main")
431+
432+
# write the virtual dataset to the session with the IcechunkStore
433+
combined_vds.virtualize.to_icechunk(session.store)
432434
```
433435

434436
See the [Icechunk documentation](https://icechunk.io/icechunk-python/virtual/#creating-a-virtual-dataset-with-virtualizarr) for more details.

pyproject.toml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ hdf_reader = [
3939
"numcodecs"
4040
]
4141
icechunk = [
42-
"icechunk>=0.1.0a7",
42+
"icechunk>=0.1.0a8",
4343
]
4444
test = [
4545
"codecov",
@@ -103,6 +103,10 @@ ignore_missing_imports = true
103103
module = "ujson.*"
104104
ignore_missing_imports = true
105105

106+
[[tool.mypy.overrides]]
107+
module = "zarr.*"
108+
ignore_missing_imports = true
109+
106110
[tool.ruff]
107111
# Same as Black.
108112
line-length = 88

virtualizarr/accessor.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from datetime import datetime
12
from pathlib import Path
23
from typing import TYPE_CHECKING, Callable, Literal, Optional, overload
34

@@ -39,7 +40,10 @@ def to_zarr(self, storepath: str) -> None:
3940
dataset_to_zarr(self.ds, storepath)
4041

4142
def to_icechunk(
42-
self, store: "IcechunkStore", append_dim: Optional[str] = None
43+
self,
44+
store: "IcechunkStore",
45+
append_dim: Optional[str] = None,
46+
last_updated_at: Optional[datetime] = None,
4347
) -> None:
4448
"""
4549
Write an xarray dataset to an Icechunk store.
@@ -48,10 +52,30 @@ def to_icechunk(
4852
4953
If `append_dim` is provided, the virtual dataset will be appended to the existing IcechunkStore along the `append_dim` dimension.
5054
55+
If `last_updated_at` is provided, it will be used as a checksum for any virtual chunks written to the store with this operation.
56+
At read time, if any of the virtual chunks have been updated since this provided datetime, an error will be raised.
57+
This protects against reading outdated virtual chunks that have been updated since the last read. When not provided, no check is performed.
58+
This value is stored in Icechunk with seconds precision, so be sure to take that into account when providing this value.
59+
5160
Parameters
5261
----------
5362
store: IcechunkStore
5463
append_dim: str, optional
64+
When provided, specifies the dimension along which to append the virtual dataset.
65+
last_updated_at: datetime, optional
66+
When provided, uses provided datetime as a checksum for any virtual chunks written to the store with this operation.
67+
When not provided (default), no check is performed.
68+
69+
Examples
70+
--------
71+
To ensure an error is raised if the files containing referenced virtual chunks are modified at any time from now on, pass the current time to ``last_updated_at``.
72+
73+
>>> from datetime import datetime
74+
>>>
75+
>>> vds.virtualize.to_icechunk(
76+
... icechunkstore,
77+
... last_updated_at=datetime.now(),
78+
... )
5579
"""
5680
from virtualizarr.writers.icechunk import dataset_to_icechunk
5781

virtualizarr/backend.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,7 @@ def open_virtual_dataset(
128128
----------
129129
filepath : str, default None
130130
File path to open as a set of virtualized zarr arrays.
131-
filetype : FileType, default None
131+
filetype : FileType or str, default None
132132
Type of file to be opened. Used to determine which kerchunk file format backend to use.
133133
Can be one of {'netCDF3', 'netCDF4', 'HDF', 'TIFF', 'GRIB', 'FITS', 'dmrpp', 'zarr_v3', 'kerchunk'}.
134134
If not provided will attempt to automatically infer the correct filetype from header bytes.
@@ -182,13 +182,16 @@ def open_virtual_dataset(
182182
if backend and filetype:
183183
raise ValueError("Cannot pass both a filetype and an explicit VirtualBackend")
184184

185-
if filetype is not None:
186-
# if filetype is user defined, convert to FileType
187-
filetype = FileType(filetype)
188-
else:
185+
if filetype is None:
189186
filetype = automatically_determine_filetype(
190187
filepath=filepath, reader_options=reader_options
191188
)
189+
elif isinstance(filetype, str):
190+
# if filetype is a user defined string, convert to FileType
191+
filetype = FileType(filetype.lower())
192+
elif not isinstance(filetype, FileType):
193+
raise ValueError("Filetype must be a valid string or FileType")
194+
192195
if backend:
193196
backend_cls = backend
194197
else:

virtualizarr/readers/fits.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ def open_virtual_dataset(
4242

4343
# TODO This wouldn't work until either you had an xarray backend for FITS installed, or issue #124 is implemented to load data from ManifestArrays directly
4444
# TODO Once we have one of those we can use ``maybe_open_loadable_vars_and_indexes`` here
45-
if loadable_variables != [] or indexes != {} or decode_times:
45+
if loadable_variables or indexes:
4646
raise NotImplementedError(
4747
"Cannot load variables or indexes from FITS files as there is no xarray backend engine for FITS"
4848
)

virtualizarr/tests/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from virtualizarr.manifests.manifest import join
1010
from virtualizarr.zarr import ZArray, ceildiv
1111

12-
network = pytest.mark.network
12+
requires_network = pytest.mark.network
1313

1414

1515
def _importorskip(

virtualizarr/tests/test_backend.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,8 @@
1515
from virtualizarr.readers.hdf import HDFVirtualBackend
1616
from virtualizarr.tests import (
1717
has_astropy,
18-
network,
1918
requires_kerchunk,
19+
requires_network,
2020
requires_s3fs,
2121
requires_scipy,
2222
)
@@ -193,7 +193,7 @@ def test_var_attr_coords(self, netcdf4_file_with_2d_coords):
193193
assert set(vds.coords) == set(expected_coords)
194194

195195

196-
@network
196+
@requires_network
197197
@requires_s3fs
198198
class TestReadFromS3:
199199
@pytest.mark.parametrize(
@@ -216,7 +216,7 @@ def test_anon_read_s3(self, indexes, hdf_backend):
216216
assert isinstance(vds[var].data, ManifestArray), var
217217

218218

219-
@network
219+
@requires_network
220220
@pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
221221
class TestReadFromURL:
222222
@pytest.mark.parametrize(
@@ -383,9 +383,14 @@ def test_explicit_filetype(self, netcdf4_file):
383383
with pytest.raises(ValueError):
384384
open_virtual_dataset(netcdf4_file, filetype="unknown")
385385

386+
with pytest.raises(ValueError):
387+
open_virtual_dataset(netcdf4_file, filetype=ManifestArray)
388+
386389
with pytest.raises(NotImplementedError):
387390
open_virtual_dataset(netcdf4_file, filetype="grib")
388391

392+
open_virtual_dataset(netcdf4_file, filetype="netCDF4")
393+
389394
def test_explicit_filetype_and_backend(self, netcdf4_file):
390395
with pytest.raises(ValueError):
391396
open_virtual_dataset(

0 commit comments

Comments
 (0)