Skip to content

Commit a2fe1a7

Browse files
authored
Merge branch 'main' into fix_subgroup_dims_HDF
2 parents 2f1f637 + 0d2d6ab commit a2fe1a7

File tree

16 files changed

+323
-148
lines changed

16 files changed

+323
-148
lines changed

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ repos:
1111

1212
- repo: https://github.com/astral-sh/ruff-pre-commit
1313
# Ruff version.
14-
rev: "v0.8.1"
14+
rev: "v0.8.6"
1515
hooks:
1616
# Run the linter.
1717
- id: ruff

ci/upstream.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ channels:
33
- conda-forge
44
- nodefaults
55
dependencies:
6-
- xarray>=2024.10.0
6+
- xarray>=2024.10.0,<2025.0.0
77
- h5netcdf
88
- h5py
99
- hdf5
@@ -28,6 +28,6 @@ dependencies:
2828
- fsspec
2929
- pip
3030
- pip:
31-
- icechunk>=0.1.0a7 # Installs zarr v3 as dependency
31+
- icechunk==0.1.0a8 # Installs zarr v3 beta 3 as dependency
3232
# - git+https://github.com/fsspec/kerchunk@main # kerchunk is currently incompatible with zarr-python v3 (https://github.com/fsspec/kerchunk/pull/516)
3333
- imagecodecs-numcodecs==2024.6.1

docs/releases.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,10 @@ New Features
1111

1212
- Added a ``.nbytes`` accessor method which displays the bytes needed to hold the virtual references in memory.
1313
(:issue:`167`, :pull:`227`) By `Tom Nicholas <https://github.com/TomNicholas>`_.
14+
- Sync with Icechunk v0.1.0a8 (:pull:`368`) By `Matthew Iannucci <https://github.com/mpiannucci>`_. This also adds support
15+
for the `to_icechunk` method to add timestamps as checksums when writing virtual references to an icechunk store. This
16+
is useful for ensuring that virtual references are not stale when reading from an icechunk store, which can happen if the
17+
underlying data has changed since the virtual references were written.
1418

1519
Breaking changes
1620
~~~~~~~~~~~~~~~~
@@ -33,6 +37,10 @@ Bug fixes
3337
(:issue:`336`, :pull:`338`) By `Tom Nicholas <https://github.com/TomNicholas>`_.
3438
- Fix bug in HDF reader where dimension names of dimensions in a subgroup would be incorrect.
3539
(:issue:`364`, :pull:`366`) By `Tom Nicholas <https://github.com/TomNicholas>`_.
40+
- Fix bug in dmrpp reader so _FillValue is included in variables' encodings.
41+
(:pull:`369`) By `Aimee Barciauskas <https://github.com/abarciauskas-bgse>`_.
42+
- Fix bug passing arguments to FITS reader, and test it on Hubble Space Telescope data.
43+
(:pull:`363`) By `Tom Nicholas <https://github.com/TomNicholas>`_.
3644

3745
Documentation
3846
~~~~~~~~~~~~~

docs/usage.md

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -421,14 +421,16 @@ By default references are placed in separate parquet file when the total number
421421
We can also write these references out as an [IcechunkStore](https://icechunk.io/). `Icechunk` is a Open-source, cloud-native transactional tensor storage engine that is compatible with zarr version 3. To export our virtual dataset to an `Icechunk` Store, we simply use the {py:meth}`vds.virtualize.to_icechunk <virtualizarr.VirtualiZarrDatasetAccessor.to_icechunk>` accessor method.
422422

423423
```python
424-
# create an icechunk store
425-
from icechunk import IcechunkStore, StorageConfig, StoreConfig, VirtualRefConfig
426-
storage = StorageConfig.filesystem(str('combined'))
427-
store = IcechunkStore.create(storage=storage, mode="w", config=StoreConfig(
428-
virtual_ref_config=VirtualRefConfig.s3_anonymous(region='us-east-1'),
429-
))
430-
431-
combined_vds.virtualize.to_icechunk(store)
424+
# create an icechunk repository, session and write the virtual dataset to the session
425+
from icechunk import Repository, Storage, VirtualChunkContainer, local_filesystem_storage
426+
storage = local_filesystem_storage(str('combined'))
427+
428+
# By default, local virtual references and public remote virtual references can be read without extra configuration.
429+
repo = Repository.create(storage=storage)
430+
session = repo.writeable_session("main")
431+
432+
# write the virtual dataset to the session with the IcechunkStore
433+
combined_vds.virtualize.to_icechunk(session.store)
432434
```
433435

434436
See the [Icechunk documentation](https://icechunk.io/icechunk-python/virtual/#creating-a-virtual-dataset-with-virtualizarr) for more details.

pyproject.toml

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ classifiers = [
2121
requires-python = ">=3.10"
2222
dynamic = ["version"]
2323
dependencies = [
24-
"xarray>=2024.10.0",
24+
"xarray>=2024.10.0,<2025.0.0",
2525
"numpy>=2.0.0",
2626
"packaging",
2727
"universal-pathlib",
@@ -39,7 +39,7 @@ hdf_reader = [
3939
"numcodecs"
4040
]
4141
icechunk = [
42-
"icechunk>=0.1.0a7",
42+
"icechunk==0.1.0a8",
4343
]
4444
test = [
4545
"codecov",
@@ -103,6 +103,10 @@ ignore_missing_imports = true
103103
module = "ujson.*"
104104
ignore_missing_imports = true
105105

106+
[[tool.mypy.overrides]]
107+
module = "zarr.*"
108+
ignore_missing_imports = true
109+
106110
[tool.ruff]
107111
# Same as Black.
108112
line-length = 88

virtualizarr/accessor.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from datetime import datetime
12
from pathlib import Path
23
from typing import TYPE_CHECKING, Callable, Literal, Optional, overload
34

@@ -39,7 +40,10 @@ def to_zarr(self, storepath: str) -> None:
3940
dataset_to_zarr(self.ds, storepath)
4041

4142
def to_icechunk(
42-
self, store: "IcechunkStore", append_dim: Optional[str] = None
43+
self,
44+
store: "IcechunkStore",
45+
append_dim: Optional[str] = None,
46+
last_updated_at: Optional[datetime] = None,
4347
) -> None:
4448
"""
4549
Write an xarray dataset to an Icechunk store.
@@ -48,10 +52,30 @@ def to_icechunk(
4852

4953
If `append_dim` is provided, the virtual dataset will be appended to the existing IcechunkStore along the `append_dim` dimension.
5054

55+
If `last_updated_at` is provided, it will be used as a checksum for any virtual chunks written to the store with this operation.
56+
At read time, if any of the virtual chunks have been updated since this provided datetime, an error will be raised.
57+
This protects against reading outdated virtual chunks that have been updated since the last read. When not provided, no check is performed.
58+
This value is stored in Icechunk with seconds precision, so be sure to take that into account when providing this value.
59+
5160
Parameters
5261
----------
5362
store: IcechunkStore
5463
append_dim: str, optional
64+
When provided, specifies the dimension along which to append the virtual dataset.
65+
last_updated_at: datetime, optional
66+
When provided, uses provided datetime as a checksum for any virtual chunks written to the store with this operation.
67+
When not provided (default), no check is performed.
68+
69+
Examples
70+
--------
71+
To ensure an error is raised if the files containing referenced virtual chunks are modified at any time from now on, pass the current time to ``last_updated_at``.
72+
73+
>>> from datetime import datetime
74+
>>>
75+
>>> vds.virtualize.to_icechunk(
76+
... icechunkstore,
77+
... last_updated_at=datetime.now(),
78+
... )
5579
"""
5680
from virtualizarr.writers.icechunk import dataset_to_icechunk
5781

virtualizarr/backend.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,7 @@ def open_virtual_dataset(
128128
----------
129129
filepath : str, default None
130130
File path to open as a set of virtualized zarr arrays.
131-
filetype : FileType, default None
131+
filetype : FileType or str, default None
132132
Type of file to be opened. Used to determine which kerchunk file format backend to use.
133133
Can be one of {'netCDF3', 'netCDF4', 'HDF', 'TIFF', 'GRIB', 'FITS', 'dmrpp', 'zarr_v3', 'kerchunk'}.
134134
If not provided will attempt to automatically infer the correct filetype from header bytes.
@@ -182,13 +182,16 @@ def open_virtual_dataset(
182182
if backend and filetype:
183183
raise ValueError("Cannot pass both a filetype and an explicit VirtualBackend")
184184

185-
if filetype is not None:
186-
# if filetype is user defined, convert to FileType
187-
filetype = FileType(filetype)
188-
else:
185+
if filetype is None:
189186
filetype = automatically_determine_filetype(
190187
filepath=filepath, reader_options=reader_options
191188
)
189+
elif isinstance(filetype, str):
190+
# if filetype is a user defined string, convert to FileType
191+
filetype = FileType(filetype.lower())
192+
elif not isinstance(filetype, FileType):
193+
raise ValueError("Filetype must be a valid string or FileType")
194+
192195
if backend:
193196
backend_cls = backend
194197
else:

virtualizarr/readers/dmrpp.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -411,7 +411,8 @@ def _parse_variable(self, var_tag: ET.Element) -> Variable:
411411
attrs: dict[str, Any] = {}
412412
for attr_tag in var_tag.iterfind("dap:Attribute", self._NS):
413413
attrs.update(self._parse_attribute(attr_tag))
414-
# Fill value is placed in encoding and thus removed from attributes
414+
# Fill value is placed in zarr array's fill_value and variable encoding and removed from attributes
415+
encoding = {k: attrs.get(k) for k in self._ENCODING_KEYS if k in attrs}
415416
fill_value = attrs.pop("_FillValue", None)
416417
# create ManifestArray and ZArray
417418
zarray = ZArray(
@@ -423,7 +424,6 @@ def _parse_variable(self, var_tag: ET.Element) -> Variable:
423424
shape=shape,
424425
)
425426
marr = ManifestArray(zarray=zarray, chunkmanifest=chunkmanifest)
426-
encoding = {k: attrs.get(k) for k in self._ENCODING_KEYS if k in attrs}
427427
return Variable(dims=dims.keys(), data=marr, attrs=attrs, encoding=encoding)
428428

429429
def _parse_attribute(self, attr_tag: ET.Element) -> dict[str, Any]:

virtualizarr/readers/fits.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ def open_virtual_dataset(
4242

4343
# TODO This wouldn't work until either you had an xarray backend for FITS installed, or issue #124 is implemented to load data from ManifestArrays directly
4444
# TODO Once we have one of those we can use ``maybe_open_loadable_vars_and_indexes`` here
45-
if loadable_variables != [] or indexes != {} or decode_times:
45+
if loadable_variables or indexes:
4646
raise NotImplementedError(
4747
"Cannot load variables or indexes from FITS files as there is no xarray backend engine for FITS"
4848
)

virtualizarr/readers/hdf/hdf.py

Lines changed: 30 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,18 @@
11
import math
22
from pathlib import Path
3-
from typing import TYPE_CHECKING, Dict, Iterable, List, Mapping, Optional, Union
3+
from typing import (
4+
TYPE_CHECKING,
5+
Any,
6+
Dict,
7+
Iterable,
8+
List,
9+
Mapping,
10+
Optional,
11+
Union,
12+
)
413

514
import numpy as np
615
import xarray as xr
7-
from xarray import Dataset, Index, Variable
816

917
from virtualizarr.manifests import (
1018
ChunkEntry,
@@ -22,17 +30,15 @@
2230
from virtualizarr.utils import _FsspecFSFromFilepath, check_for_collisions, soft_import
2331
from virtualizarr.zarr import ZArray
2432

25-
if TYPE_CHECKING:
26-
import h5py # type: ignore
27-
from h5py import Dataset, Group # type: ignore
28-
2933
h5py = soft_import("h5py", "For reading hdf files", strict=False)
30-
if h5py:
31-
Dataset = h5py.Dataset # type: ignore
32-
Group = h5py.Group # type: ignore
34+
35+
36+
if TYPE_CHECKING:
37+
from h5py import Dataset as H5Dataset # type: ignore[import-untyped]
38+
from h5py import Group as H5Group # type: ignore[import-untyped]
3339
else:
34-
Dataset = dict() # type: ignore
35-
Group = dict() # type: ignore
40+
H5Dataset: Any = None
41+
H5Group: Any = None
3642

3743

3844
class HDFVirtualBackend(VirtualBackend):
@@ -43,7 +49,7 @@ def open_virtual_dataset(
4349
drop_variables: Iterable[str] | None = None,
4450
loadable_variables: Iterable[str] | None = None,
4551
decode_times: bool | None = None,
46-
indexes: Mapping[str, Index] | None = None,
52+
indexes: Mapping[str, xr.Index] | None = None,
4753
virtual_backend_kwargs: Optional[dict] = None,
4854
reader_options: Optional[dict] = None,
4955
) -> xr.Dataset:
@@ -92,7 +98,10 @@ def open_virtual_dataset(
9298
)
9399

94100
@staticmethod
95-
def _dataset_chunk_manifest(path: str, dataset: Dataset) -> Optional[ChunkManifest]:
101+
def _dataset_chunk_manifest(
102+
path: str,
103+
dataset: H5Dataset,
104+
) -> Optional[ChunkManifest]:
96105
"""
97106
Generate ChunkManifest for HDF5 dataset.
98107

@@ -116,7 +125,7 @@ def _dataset_chunk_manifest(path: str, dataset: Dataset) -> Optional[ChunkManife
116125
key_list = [0] * (len(dataset.shape) or 1)
117126
key = ".".join(map(str, key_list))
118127

119-
chunk_entry = ChunkEntry.with_validation(
128+
chunk_entry: ChunkEntry = ChunkEntry.with_validation( # type: ignore[attr-defined]
120129
path=path, offset=dsid.get_offset(), length=dsid.get_storage_size()
121130
)
122131
chunk_key = ChunkKey(key)
@@ -160,7 +169,7 @@ def add_chunk_info(blob):
160169
return chunk_manifest
161170

162171
@staticmethod
163-
def _dataset_dims(dataset: Dataset, group: str = "") -> List[str]:
172+
def _dataset_dims(dataset: H5Dataset, group: str = "") -> List[str]:
164173
"""
165174
Get a list of dimension scale names attached to input HDF5 dataset.
166175

@@ -208,7 +217,7 @@ def _dataset_dims(dataset: Dataset, group: str = "") -> List[str]:
208217
return [dim.removeprefix(group) for dim in dims]
209218

210219
@staticmethod
211-
def _extract_attrs(h5obj: Union[Dataset, Group]):
220+
def _extract_attrs(h5obj: Union[H5Dataset, H5Group]):
212221
"""
213222
Extract attributes from an HDF5 group or dataset.
214223

@@ -256,7 +265,7 @@ def _extract_attrs(h5obj: Union[Dataset, Group]):
256265
@staticmethod
257266
def _dataset_to_variable(
258267
path: str,
259-
dataset: Dataset,
268+
dataset: H5Dataset,
260269
group: str,
261270
) -> Optional[Variable]:
262271
"""
@@ -311,9 +320,9 @@ def _dataset_to_variable(
311320
manifest = HDFVirtualBackend._dataset_chunk_manifest(path, dataset)
312321
if manifest:
313322
marray = ManifestArray(zarray=zarray, chunkmanifest=manifest)
314-
variable = Variable(data=marray, dims=dims, attrs=attrs)
323+
variable = xr.Variable(data=marray, dims=dims, attrs=attrs)
315324
else:
316-
variable = Variable(data=np.empty(dataset.shape), dims=dims, attrs=attrs)
325+
variable = xr.Variable(data=np.empty(dataset.shape), dims=dims, attrs=attrs)
317326
return variable
318327

319328
@staticmethod
@@ -324,7 +333,7 @@ def _virtual_vars_from_hdf(
324333
reader_options: Optional[dict] = {
325334
"storage_options": {"key": "", "secret": "", "anon": True}
326335
},
327-
) -> Dict[str, Variable]:
336+
) -> Dict[str, xr.Variable]:
328337
"""
329338
Extract xarray Variables with ManifestArray data from an HDF file or group
330339

@@ -364,7 +373,7 @@ def _virtual_vars_from_hdf(
364373
variables = {}
365374
for key in g.keys():
366375
if key not in drop_variables:
367-
if isinstance(g[key], Dataset):
376+
if isinstance(g[key], h5py.Dataset):
368377
variable = HDFVirtualBackend._dataset_to_variable(
369378
path=path,
370379
dataset=g[key],

0 commit comments

Comments
 (0)