Skip to content

Commit cc50b41

Browse files
TomAugspurger and d-v-b authored
base64 encode fill value for some dtypes with zarr_format=2 (#2286)
Co-authored-by: Davis Bennett <[email protected]>
1 parent 1674282 commit cc50b41

File tree

3 files changed, with 56 additions and 3 deletions

3 files changed

+56
-3
lines changed

src/zarr/abc/metadata.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@ def to_dict(self) -> dict[str, JSON]:
2222
are instances of `Metadata`. Sequences of `Metadata` are similarly recursed into, and
2323
the output of that recursion is collected in a list.
2424
"""
25-
...
2625
out_dict = {}
2726
for field in fields(self):
2827
key = field.name

src/zarr/core/metadata/v2.py

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
from __future__ import annotations
22

3+
import base64
34
from collections.abc import Iterable
45
from enum import Enum
5-
from typing import TYPE_CHECKING
6+
from typing import TYPE_CHECKING, cast
67

78
if TYPE_CHECKING:
89
from typing import Any, Literal, Self
@@ -31,7 +32,7 @@ class ArrayV2Metadata(ArrayMetadata):
3132
shape: ChunkCoords
3233
chunk_grid: RegularChunkGrid
3334
data_type: np.dtype[Any]
34-
fill_value: None | int | float = 0
35+
fill_value: None | int | float | str | bytes = 0
3536
order: Literal["C", "F"] = "C"
3637
filters: tuple[numcodecs.abc.Codec, ...] | None = None
3738
dimension_separator: Literal[".", "/"] = "."
@@ -140,6 +141,13 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata:
140141
_data = data.copy()
141142
# check that the zarr_format attribute is correct
142143
_ = parse_zarr_format(_data.pop("zarr_format"))
144+
dtype = parse_dtype(_data["dtype"])
145+
146+
if dtype.kind in "SV":
147+
fill_value_encoded = _data.get("fill_value")
148+
if fill_value_encoded is not None:
149+
fill_value = base64.standard_b64decode(fill_value_encoded)
150+
_data["fill_value"] = fill_value
143151

144152
# zarr v2 allowed arbitrary keys here.
145153
# We don't want the ArrayV2Metadata constructor to fail just because someone put an
@@ -155,6 +163,14 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata:
155163

156164
def to_dict(self) -> dict[str, JSON]:
157165
zarray_dict = super().to_dict()
166+
167+
if self.dtype.kind in "SV" and self.fill_value is not None:
168+
# There's a relationship between self.dtype and self.fill_value
169+
# that mypy isn't aware of. The fact that we have S or V dtype here
170+
# means we should have a bytes-type fill_value.
171+
fill_value = base64.standard_b64encode(cast(bytes, self.fill_value)).decode("ascii")
172+
zarray_dict["fill_value"] = fill_value
173+
158174
_ = zarray_dict.pop("chunk_grid")
159175
zarray_dict["chunks"] = self.chunk_grid.chunk_shape
160176

tests/v3/test_v2.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import json
12
from collections.abc import Iterator
23

34
import numpy as np
@@ -6,6 +7,9 @@
67
from numcodecs.blosc import Blosc
78

89
import zarr
10+
import zarr.core.buffer.cpu
11+
import zarr.core.metadata
12+
import zarr.storage
913
from zarr import Array
1014
from zarr.storage import MemoryStore, StorePath
1115

@@ -46,3 +50,37 @@ def test_codec_pipeline() -> None:
4650
result = array[:]
4751
expected = np.ones(1)
4852
np.testing.assert_array_equal(result, expected)
53+
54+
55+
@pytest.mark.parametrize("dtype", ["|S", "|V"])
56+
async def test_v2_encode_decode(dtype):
57+
store = zarr.storage.MemoryStore(mode="w")
58+
g = zarr.group(store=store, zarr_format=2)
59+
g.create_array(
60+
name="foo",
61+
shape=(3,),
62+
chunks=(3,),
63+
dtype=dtype,
64+
fill_value=b"X",
65+
)
66+
67+
result = await store.get("foo/.zarray", zarr.core.buffer.default_buffer_prototype())
68+
assert result is not None
69+
70+
serialized = json.loads(result.to_bytes())
71+
expected = {
72+
"chunks": [3],
73+
"compressor": None,
74+
"dtype": f"{dtype}0",
75+
"fill_value": "WA==",
76+
"filters": None,
77+
"order": "C",
78+
"shape": [3],
79+
"zarr_format": 2,
80+
"dimension_separator": ".",
81+
}
82+
assert serialized == expected
83+
84+
data = zarr.open_array(store=store, path="foo")[:]
85+
expected = np.full((3,), b"X", dtype=dtype)
86+
np.testing.assert_equal(data, expected)

0 commit comments

Comments
 (0)