-
-
Notifications
You must be signed in to change notification settings - Fork 1.2k
Compatibility for zarr-python 3.x #9552
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 20 commits
c54a052
483eb7f
40a746c
6c8d2bb
531b521
849df40
88bd64b
ef1549a
20c22bd
6087e5e
15fe55e
8e06bc7
f22100e
f8c427f
594d36d
046d37e
6b0ca62
d315583
389cc82
1fe409a
7c29ea6
efb66dd
90c0ae6
9b3c288
3717391
8d16bb2
bd978b0
34c4c24
118e50e
e6e2066
1d1d9cb
9089508
ea00308
a330e4b
bde42ee
b15705d
38f43b9
1cfc458
af1a0b8
d9d6fee
4c54371
1ce8878
0c2e260
fc2738a
0e47c3f
5b39f42
7d9fc05
0fa94ee
c2fd6f1
ac2ef29
c6be467
5b5b77f
4f07eb7
5151bc2
00c62d7
e0390a5
2e7ec07
26081d4
0350056
a38bff6
08f0594
0e81edf
55d852d
3491137
5bf5f2a
f2f9fff
a84fa79
c280f24
9fec1d6
04c017e
625591e
3795b07
ea2cb57
4f617d2
d1e3c73
45d5a78
f208c39
968217c
45a37f6
c15e856
c10bfc0
82e6a6d
d752693
0fd4103
c2a47a1
26b2661
1d73d36
be79e88
ff0f2c0
5f37042
268e3eb
1abb2ba
7682bf4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,12 +1,14 @@ | ||
from __future__ import annotations | ||
|
||
import functools | ||
import json | ||
import os | ||
import warnings | ||
from collections.abc import Callable, Iterable | ||
from typing import TYPE_CHECKING, Any | ||
from typing import TYPE_CHECKING, Any, Literal | ||
|
||
import numpy as np | ||
import packaging.version | ||
import pandas as pd | ||
|
||
from xarray import coding, conventions | ||
|
@@ -40,8 +42,20 @@ | |
from xarray.core.dataset import Dataset | ||
from xarray.core.datatree import DataTree | ||
|
||
|
||
@functools.lru_cache
def _zarr_v3() -> bool:
    """Report whether the importable zarr-python is major version 3 or newer.

    Returns ``False`` when zarr cannot be imported. The result is memoized by
    ``functools.lru_cache``, so the import and version parse run only once.
    """
    try:
        import zarr
    except ImportError:
        # zarr is not importable, so it cannot be a 3.x install.
        return False

    installed = packaging.version.parse(zarr.__version__)
    return installed.major >= 3
dcherian marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
||
|
||
# need some special secret attributes to tell us the dimensions | ||
DIMENSION_KEY = "_ARRAY_DIMENSIONS" | ||
ZarrFormat = Literal[2, 3] | ||
|
||
|
||
def encode_zarr_attr_value(value): | ||
|
@@ -75,8 +89,10 @@ def __init__(self, zarr_array): | |
self.shape = self._array.shape | ||
|
||
# preserve vlen string object dtype (GH 7328) | ||
if self._array.filters is not None and any( | ||
[filt.codec_id == "vlen-utf8" for filt in self._array.filters] | ||
if ( | ||
TomAugspurger marked this conversation as resolved.
Show resolved
Hide resolved
|
||
not _zarr_v3() | ||
and self._array.filters is not None | ||
and any([filt.codec_id == "vlen-utf8" for filt in self._array.filters]) | ||
): | ||
dtype = coding.strings.create_vlen_dtype(str) | ||
else: | ||
|
@@ -317,6 +333,7 @@ def extract_zarr_variable_encoding( | |
|
||
safe_to_drop = {"source", "original_shape"} | ||
valid_encodings = { | ||
"codecs", | ||
"chunks", | ||
"compressor", | ||
"filters", | ||
|
@@ -614,9 +631,25 @@ def open_store_variable(self, name, zarr_array=None): | |
encoding = { | ||
"chunks": zarr_array.chunks, | ||
"preferred_chunks": dict(zip(dimensions, zarr_array.chunks, strict=True)), | ||
"compressor": zarr_array.compressor, | ||
"filters": zarr_array.filters, | ||
} | ||
|
||
if _zarr_v3() and zarr_array.metadata.zarr_format == 3: | ||
encoding["codecs"] = [x.to_dict() for x in zarr_array.metadata.codecs] | ||
elif _zarr_v3(): | ||
encoding.update( | ||
{ | ||
"compressor": zarr_array.metadata.compressor, | ||
"filters": zarr_array.metadata.filters, | ||
} | ||
) | ||
else: | ||
encoding.update( | ||
{ | ||
"compressor": zarr_array.compressor, | ||
"filters": zarr_array.filters, | ||
} | ||
) | ||
|
||
# _FillValue needs to be in attributes, not encoding, so it will get | ||
# picked up by decode_cf | ||
if zarr_array.fill_value is not None: | ||
|
@@ -786,7 +819,11 @@ def store( | |
variables_to_set, check_encoding_set, writer, unlimited_dims=unlimited_dims | ||
) | ||
if self._consolidate_on_close: | ||
zarr.consolidate_metadata(self.zarr_group.store) | ||
kwargs = {} | ||
if _zarr_v3(): | ||
# https://github.com/zarr-developers/zarr-python/pull/2113#issuecomment-2386718323 | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can this be removed at some point in the future? If so, it would be good to add a TODO. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'll look more closely later, but for now I think this will be required, following a deliberate change in zarr v3 consolidated metadata. With v2 metadata, I think that consolidation happened at the store level, and was all-or-nothing. If you have two Groups with Arrays, the consolidated metadata will be placed at the store root, and will contain everything: # zarr v2
In [1]: import json, xarray as xr
In [2]: store = {}
In [3]: a = xr.tutorial.load_dataset("air_temperature")
In [4]: b = xr.tutorial.load_dataset("rasm")
In [5]: a.to_zarr(store=store, group="A")
/Users/tom/gh/zarr-developers/zarr-v2/.direnv/python-3.10/lib/python3.10/site-packages/xarray/core/dataset.py:2562: SerializationWarning: saving variable None with floating point data as an integer dtype without any _FillValue to use for NaNs
return to_zarr( # type: ignore[call-overload,misc]
Out[5]: <xarray.backends.zarr.ZarrStore at 0x11113edc0>
In [6]: b.to_zarr(store=store, group="B")
Out[6]: <xarray.backends.zarr.ZarrStore at 0x10cab2440>
In [7]: list(json.loads(store['.zmetadata'])['metadata'])
Out[7]: # contains nodes from both A and B
['.zgroup',
'A/.zattrs',
'A/.zgroup',
'A/air/.zarray',
'A/air/.zattrs',
'A/lat/.zarray',
'A/lat/.zattrs',
'A/lon/.zarray',
'A/lon/.zattrs',
'A/time/.zarray',
'A/time/.zattrs',
'B/.zattrs',
'B/.zgroup',
'B/Tair/.zarray',
'B/Tair/.zattrs',
'B/time/.zarray',
'B/time/.zattrs',
'B/xc/.zarray',
'B/xc/.zattrs',
'B/yc/.zarray',
'B/yc/.zattrs'] With v3, consolidated metadata is scoped to a Group, so we can provide the group we want to consolidate (the zarr-python API does support "consolidate everything in the store at the root", but I don't think we want that because you'd need to open it at the root when reading, and I think it's kinda weird for …) There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Potentially it would make sense to have two versions of consolidated metadata:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Agreed. zarr-developers/zarr-specs#309 has some discussion on adding a |
||
kwargs["path"] = self.zarr_group.name.lstrip("/") | ||
zarr.consolidate_metadata(self.zarr_group.store, **kwargs) | ||
|
||
def sync(self): | ||
pass | ||
|
@@ -850,9 +887,19 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No | |
# - Existing variables already have their attrs included in the consolidated metadata file. | ||
# - The size of dimensions can not be expanded, that would require a call using `append_dim` | ||
# which is mutually exclusive with `region` | ||
kwargs = {} | ||
if _zarr_v3(): | ||
kwargs["store"] = self.zarr_group.store | ||
else: | ||
kwargs["store"] = self.zarr_group.chunk_store | ||
|
||
# TODO: see if zarr should normalize these strings. | ||
zarr_array = zarr.open( | ||
store=self.zarr_group.chunk_store, | ||
path=f"{self.zarr_group.name}/{name}", | ||
**kwargs, | ||
dcherian marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
# path=f"{self.zarr_group.name}/{name}", | ||
TomAugspurger marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
path="/".join([self.zarr_group.name.rstrip("/"), name]).lstrip( | ||
"/" | ||
), | ||
write_empty_chunks=self._write_empty, | ||
) | ||
else: | ||
|
@@ -868,7 +915,10 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No | |
|
||
new_shape = list(zarr_array.shape) | ||
new_shape[append_axis] += v.shape[append_axis] | ||
zarr_array.resize(new_shape) | ||
if _zarr_v3(): | ||
zarr_array = zarr_array.resize(new_shape) | ||
else: | ||
zarr_array.resize(new_shape) | ||
|
||
zarr_shape = zarr_array.shape | ||
|
||
|
@@ -913,6 +963,10 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No | |
else: | ||
encoding["write_empty_chunks"] = self._write_empty | ||
|
||
if "codecs" in encoding: | ||
pipeline = encoding.pop("codecs") | ||
encoding["codecs"] = pipeline | ||
|
||
|
||
zarr_array = self.zarr_group.create( | ||
name, | ||
shape=shape, | ||
|
@@ -1396,40 +1450,34 @@ def _get_open_params( | |
if isinstance(store, os.PathLike): | ||
store = os.fspath(store) | ||
|
||
if zarr_version is None: | ||
# default to 2 if store doesn't specify its version (e.g. a path) | ||
zarr_version = getattr(store, "_store_version", 2) | ||
|
||
open_kwargs = dict( | ||
# mode='a-' is a handcrafted xarray specialty | ||
mode="a" if mode == "a-" else mode, | ||
synchronizer=synchronizer, | ||
path=group, | ||
) | ||
open_kwargs["storage_options"] = storage_options | ||
if zarr_version > 2: | ||
open_kwargs["zarr_version"] = zarr_version | ||
|
||
if consolidated or consolidate_on_close: | ||
raise ValueError( | ||
"consolidated metadata has not been implemented for zarr " | ||
f"version {zarr_version} yet. Set consolidated=False for " | ||
f"zarr version {zarr_version}. See also " | ||
"https://github.com/zarr-developers/zarr-specs/issues/136" | ||
) | ||
|
||
if consolidated is None: | ||
consolidated = False | ||
if _zarr_v3(): | ||
TomAugspurger marked this conversation as resolved.
Show resolved
Hide resolved
|
||
open_kwargs["zarr_format"] = zarr_version | ||
else: | ||
open_kwargs["zarr_version"] = zarr_version | ||
|
||
if chunk_store is not None: | ||
open_kwargs["chunk_store"] = chunk_store | ||
if consolidated is None: | ||
consolidated = False | ||
|
||
if _zarr_v3(): | ||
missing_exc: type[Exception] = ValueError | ||
else: | ||
missing_exc = zarr.errors.GroupNotFoundError | ||
|
||
if consolidated is None: | ||
try: | ||
zarr_group = zarr.open_consolidated(store, **open_kwargs) | ||
except KeyError: | ||
except (ValueError, KeyError): | ||
dcherian marked this conversation as resolved.
Show resolved
Hide resolved
|
||
# ValueError in zarr-python 3.x, KeyError in 2.x. | ||
try: | ||
zarr_group = zarr.open_group(store, **open_kwargs) | ||
warnings.warn( | ||
|
@@ -1447,7 +1495,7 @@ def _get_open_params( | |
RuntimeWarning, | ||
stacklevel=stacklevel, | ||
) | ||
except zarr.errors.GroupNotFoundError as err: | ||
except missing_exc as err: | ||
raise FileNotFoundError( | ||
f"No such file or directory: '{store}'" | ||
) from err | ||
|
Uh oh!
There was an error while loading. Please reload this page.