Skip to content

Commit 13be49c

Browse files
committed
Merge remote-tracking branch 'upstream/v3' into user/tom/fix/v2-no-fill-value
2 parents 8637c08 + 7e2be57 commit 13be49c

File tree

18 files changed

+663
-132
lines changed

18 files changed

+663
-132
lines changed

.pre-commit-config.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ default_language_version:
77
python: python3
88
repos:
99
- repo: https://github.com/astral-sh/ruff-pre-commit
10-
rev: v0.6.8
10+
rev: v0.6.9
1111
hooks:
1212
- id: ruff
1313
args: ["--fix", "--show-fixes"]
@@ -18,7 +18,7 @@ repos:
1818
- id: codespell
1919
args: ["-L", "ba,ihs,kake,nd,noe,nwo,te,fo,zar", "-S", "fixture"]
2020
- repo: https://github.com/pre-commit/pre-commit-hooks
21-
rev: v4.6.0
21+
rev: v5.0.0
2222
hooks:
2323
- id: check-yaml
2424
- repo: https://github.com/pre-commit/mirrors-mypy

src/zarr/abc/metadata.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@ def to_dict(self) -> dict[str, JSON]:
2222
are instances of `Metadata`. Sequences of `Metadata` are similarly recursed into, and
2323
the output of that recursion is collected in a list.
2424
"""
25-
...
2625
out_dict = {}
2726
for field in fields(self):
2827
key = field.name

src/zarr/codecs/__init__.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,20 @@
11
from __future__ import annotations
22

3+
from typing import TYPE_CHECKING, Any
4+
5+
if TYPE_CHECKING:
6+
import numpy as np
7+
38
from zarr.codecs.blosc import BloscCname, BloscCodec, BloscShuffle
49
from zarr.codecs.bytes import BytesCodec, Endian
510
from zarr.codecs.crc32c_ import Crc32cCodec
611
from zarr.codecs.gzip import GzipCodec
712
from zarr.codecs.pipeline import BatchedCodecPipeline
813
from zarr.codecs.sharding import ShardingCodec, ShardingCodecIndexLocation
914
from zarr.codecs.transpose import TransposeCodec
15+
from zarr.codecs.vlen_utf8 import VLenBytesCodec, VLenUTF8Codec
1016
from zarr.codecs.zstd import ZstdCodec
17+
from zarr.core.metadata.v3 import DataType
1118

1219
__all__ = [
1320
"BatchedCodecPipeline",
@@ -21,5 +28,19 @@
2128
"ShardingCodec",
2229
"ShardingCodecIndexLocation",
2330
"TransposeCodec",
31+
"VLenUTF8Codec",
32+
"VLenBytesCodec",
2433
"ZstdCodec",
2534
]
35+
36+
37+
def _get_default_array_bytes_codec(
    np_dtype: np.dtype[Any],
) -> BytesCodec | VLenUTF8Codec | VLenBytesCodec:
    """Return the default array-to-bytes codec for a NumPy dtype.

    The dtype is first converted with ``DataType.from_numpy``. A ``string``
    data type gets a ``VLenUTF8Codec``, a ``bytes`` data type gets a
    ``VLenBytesCodec``, and every other data type gets a ``BytesCodec``.
    """
    zarr_dtype = DataType.from_numpy(np_dtype)
    if zarr_dtype == DataType.string:
        return VLenUTF8Codec()
    if zarr_dtype == DataType.bytes:
        return VLenBytesCodec()
    # Anything that is not variable-length falls back to the plain bytes codec.
    return BytesCodec()

src/zarr/codecs/vlen_utf8.py

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
from __future__ import annotations
2+
3+
from dataclasses import dataclass
4+
from typing import TYPE_CHECKING
5+
6+
import numpy as np
7+
from numcodecs.vlen import VLenBytes, VLenUTF8
8+
9+
from zarr.abc.codec import ArrayBytesCodec
10+
from zarr.core.buffer import Buffer, NDBuffer
11+
from zarr.core.common import JSON, parse_named_configuration
12+
from zarr.core.strings import cast_to_string_dtype
13+
from zarr.registry import register_codec
14+
15+
if TYPE_CHECKING:
16+
from typing import Self
17+
18+
from zarr.core.array_spec import ArraySpec
19+
20+
21+
# Module-level codec instances can be shared because the numcodecs VLen codecs take no parameters
22+
_vlen_utf8_codec = VLenUTF8()
23+
_vlen_bytes_codec = VLenBytes()
24+
25+
26+
@dataclass(frozen=True)
27+
class VLenUTF8Codec(ArrayBytesCodec):
28+
@classmethod
29+
def from_dict(cls, data: dict[str, JSON]) -> Self:
30+
_, configuration_parsed = parse_named_configuration(
31+
data, "vlen-utf8", require_configuration=False
32+
)
33+
configuration_parsed = configuration_parsed or {}
34+
return cls(**configuration_parsed)
35+
36+
def to_dict(self) -> dict[str, JSON]:
37+
return {"name": "vlen-utf8", "configuration": {}}
38+
39+
def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
40+
return self
41+
42+
async def _decode_single(
43+
self,
44+
chunk_bytes: Buffer,
45+
chunk_spec: ArraySpec,
46+
) -> NDBuffer:
47+
assert isinstance(chunk_bytes, Buffer)
48+
49+
raw_bytes = chunk_bytes.as_array_like()
50+
decoded = _vlen_utf8_codec.decode(raw_bytes)
51+
assert decoded.dtype == np.object_
52+
decoded.shape = chunk_spec.shape
53+
# coming out of the code, we know this is safe, so don't issue a warning
54+
as_string_dtype = cast_to_string_dtype(decoded, safe=True)
55+
return chunk_spec.prototype.nd_buffer.from_numpy_array(as_string_dtype)
56+
57+
async def _encode_single(
58+
self,
59+
chunk_array: NDBuffer,
60+
chunk_spec: ArraySpec,
61+
) -> Buffer | None:
62+
assert isinstance(chunk_array, NDBuffer)
63+
return chunk_spec.prototype.buffer.from_bytes(
64+
_vlen_utf8_codec.encode(chunk_array.as_numpy_array())
65+
)
66+
67+
def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) -> int:
68+
# what is input_byte_length for an object dtype?
69+
raise NotImplementedError("compute_encoded_size is not implemented for VLen codecs")
70+
71+
72+
@dataclass(frozen=True)
73+
class VLenBytesCodec(ArrayBytesCodec):
74+
@classmethod
75+
def from_dict(cls, data: dict[str, JSON]) -> Self:
76+
_, configuration_parsed = parse_named_configuration(
77+
data, "vlen-bytes", require_configuration=False
78+
)
79+
configuration_parsed = configuration_parsed or {}
80+
return cls(**configuration_parsed)
81+
82+
def to_dict(self) -> dict[str, JSON]:
83+
return {"name": "vlen-bytes", "configuration": {}}
84+
85+
def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
86+
return self
87+
88+
async def _decode_single(
89+
self,
90+
chunk_bytes: Buffer,
91+
chunk_spec: ArraySpec,
92+
) -> NDBuffer:
93+
assert isinstance(chunk_bytes, Buffer)
94+
95+
raw_bytes = chunk_bytes.as_array_like()
96+
decoded = _vlen_bytes_codec.decode(raw_bytes)
97+
assert decoded.dtype == np.object_
98+
decoded.shape = chunk_spec.shape
99+
return chunk_spec.prototype.nd_buffer.from_numpy_array(decoded)
100+
101+
async def _encode_single(
102+
self,
103+
chunk_array: NDBuffer,
104+
chunk_spec: ArraySpec,
105+
) -> Buffer | None:
106+
assert isinstance(chunk_array, NDBuffer)
107+
return chunk_spec.prototype.buffer.from_bytes(
108+
_vlen_bytes_codec.encode(chunk_array.as_numpy_array())
109+
)
110+
111+
def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) -> int:
112+
# what is input_byte_length for an object dtype?
113+
raise NotImplementedError("compute_encoded_size is not implemented for VLen codecs")
114+
115+
116+
register_codec("vlen-utf8", VLenUTF8Codec)
117+
register_codec("vlen-bytes", VLenBytesCodec)

src/zarr/core/array.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212
from zarr._compat import _deprecate_positional_args
1313
from zarr.abc.store import Store, set_or_delete
14-
from zarr.codecs import BytesCodec
14+
from zarr.codecs import _get_default_array_bytes_codec
1515
from zarr.codecs._v2 import V2Compressor, V2Filters
1616
from zarr.core.attributes import Attributes
1717
from zarr.core.buffer import (
@@ -318,7 +318,11 @@ async def _create_v3(
318318
await ensure_no_existing_node(store_path, zarr_format=3)
319319

320320
shape = parse_shapelike(shape)
321-
codecs = list(codecs) if codecs is not None else [BytesCodec()]
321+
codecs = (
322+
list(codecs)
323+
if codecs is not None
324+
else [_get_default_array_bytes_codec(np.dtype(dtype))]
325+
)
322326

323327
if chunk_key_encoding is None:
324328
chunk_key_encoding = ("default", "/")

src/zarr/core/buffer/core.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -313,8 +313,6 @@ class NDBuffer:
313313
"""
314314

315315
def __init__(self, array: NDArrayLike) -> None:
316-
# assert array.ndim > 0
317-
assert array.dtype != object
318316
self._data = array
319317

320318
@classmethod
@@ -467,9 +465,11 @@ def all_equal(self, other: Any, equal_nan: bool = True) -> bool:
467465
# Handle None fill_value for Zarr V2
468466
return False
469467
# use array_equal to obtain equal_nan=True functionality
468+
# TODO: the fill value is a scalar, so there may be a faster path than broadcasting it against
469+
# the whole array every single time we have to write data.
470470
_data, other = np.broadcast_arrays(self._data, other)
471471
return np.array_equal(
472-
self._data, other, equal_nan=equal_nan if self._data.dtype.kind not in "US" else False
472+
self._data, other, equal_nan=equal_nan if self._data.dtype.kind not in "USTO" else False
473473
)
474474

475475
def fill(self, value: Any) -> None:

src/zarr/core/config.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,8 @@ def reset(self) -> None:
5858
"crc32c": "zarr.codecs.crc32c_.Crc32cCodec",
5959
"sharding_indexed": "zarr.codecs.sharding.ShardingCodec",
6060
"transpose": "zarr.codecs.transpose.TransposeCodec",
61+
"vlen-utf8": "zarr.codecs.vlen_utf8.VLenUTF8Codec",
62+
"vlen-bytes": "zarr.codecs.vlen_utf8.VLenBytesCodec",
6163
},
6264
"buffer": "zarr.core.buffer.cpu.Buffer",
6365
"ndbuffer": "zarr.core.buffer.cpu.NDBuffer",

src/zarr/core/group.py

Lines changed: 16 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -768,24 +768,20 @@ async def full(
768768
)
769769

770770
async def empty_like(
771-
self, *, name: str, prototype: async_api.ArrayLike, **kwargs: Any
771+
self, *, name: str, data: async_api.ArrayLike, **kwargs: Any
772772
) -> AsyncArray:
773-
return await async_api.empty_like(a=prototype, store=self.store_path, path=name, **kwargs)
773+
return await async_api.empty_like(a=data, store=self.store_path, path=name, **kwargs)
774774

775775
async def zeros_like(
776-
self, *, name: str, prototype: async_api.ArrayLike, **kwargs: Any
776+
self, *, name: str, data: async_api.ArrayLike, **kwargs: Any
777777
) -> AsyncArray:
778-
return await async_api.zeros_like(a=prototype, store=self.store_path, path=name, **kwargs)
778+
return await async_api.zeros_like(a=data, store=self.store_path, path=name, **kwargs)
779779

780-
async def ones_like(
781-
self, *, name: str, prototype: async_api.ArrayLike, **kwargs: Any
782-
) -> AsyncArray:
783-
return await async_api.ones_like(a=prototype, store=self.store_path, path=name, **kwargs)
780+
async def ones_like(self, *, name: str, data: async_api.ArrayLike, **kwargs: Any) -> AsyncArray:
781+
return await async_api.ones_like(a=data, store=self.store_path, path=name, **kwargs)
784782

785-
async def full_like(
786-
self, *, name: str, prototype: async_api.ArrayLike, **kwargs: Any
787-
) -> AsyncArray:
788-
return await async_api.full_like(a=prototype, store=self.store_path, path=name, **kwargs)
783+
async def full_like(self, *, name: str, data: async_api.ArrayLike, **kwargs: Any) -> AsyncArray:
784+
return await async_api.full_like(a=data, store=self.store_path, path=name, **kwargs)
789785

790786
async def move(self, source: str, dest: str) -> None:
791787
raise NotImplementedError
@@ -1171,25 +1167,17 @@ def full(
11711167
)
11721168
)
11731169

1174-
def empty_like(self, *, name: str, prototype: async_api.ArrayLike, **kwargs: Any) -> Array:
1175-
return Array(
1176-
self._sync(self._async_group.empty_like(name=name, prototype=prototype, **kwargs))
1177-
)
1170+
def empty_like(self, *, name: str, data: async_api.ArrayLike, **kwargs: Any) -> Array:
1171+
return Array(self._sync(self._async_group.empty_like(name=name, data=data, **kwargs)))
11781172

1179-
def zeros_like(self, *, name: str, prototype: async_api.ArrayLike, **kwargs: Any) -> Array:
1180-
return Array(
1181-
self._sync(self._async_group.zeros_like(name=name, prototype=prototype, **kwargs))
1182-
)
1173+
def zeros_like(self, *, name: str, data: async_api.ArrayLike, **kwargs: Any) -> Array:
1174+
return Array(self._sync(self._async_group.zeros_like(name=name, data=data, **kwargs)))
11831175

1184-
def ones_like(self, *, name: str, prototype: async_api.ArrayLike, **kwargs: Any) -> Array:
1185-
return Array(
1186-
self._sync(self._async_group.ones_like(name=name, prototype=prototype, **kwargs))
1187-
)
1176+
def ones_like(self, *, name: str, data: async_api.ArrayLike, **kwargs: Any) -> Array:
1177+
return Array(self._sync(self._async_group.ones_like(name=name, data=data, **kwargs)))
11881178

1189-
def full_like(self, *, name: str, prototype: async_api.ArrayLike, **kwargs: Any) -> Array:
1190-
return Array(
1191-
self._sync(self._async_group.full_like(name=name, prototype=prototype, **kwargs))
1192-
)
1179+
def full_like(self, *, name: str, data: async_api.ArrayLike, **kwargs: Any) -> Array:
1180+
return Array(self._sync(self._async_group.full_like(name=name, data=data, **kwargs)))
11931181

11941182
def move(self, source: str, dest: str) -> None:
11951183
return self._sync(self._async_group.move(source, dest))

src/zarr/core/metadata/v2.py

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
from __future__ import annotations
22

3+
import base64
34
from collections.abc import Iterable
45
from enum import Enum
5-
from typing import TYPE_CHECKING
6+
from typing import TYPE_CHECKING, cast
67

78
if TYPE_CHECKING:
89
from typing import Any, Literal, Self
@@ -31,7 +32,7 @@ class ArrayV2Metadata(ArrayMetadata):
3132
shape: ChunkCoords
3233
chunk_grid: RegularChunkGrid
3334
data_type: np.dtype[Any]
34-
fill_value: None | int | float = 0
35+
fill_value: None | int | float | str | bytes = 0
3536
order: Literal["C", "F"] = "C"
3637
filters: tuple[numcodecs.abc.Codec, ...] | None = None
3738
dimension_separator: Literal[".", "/"] = "."
@@ -140,6 +141,13 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata:
140141
_data = data.copy()
141142
# check that the zarr_format attribute is correct
142143
_ = parse_zarr_format(_data.pop("zarr_format"))
144+
dtype = parse_dtype(_data["dtype"])
145+
146+
if dtype.kind in "SV":
147+
fill_value_encoded = _data.get("fill_value")
148+
if fill_value_encoded is not None:
149+
fill_value = base64.standard_b64decode(fill_value_encoded)
150+
_data["fill_value"] = fill_value
143151

144152
# zarr v2 allowed arbitrary keys here.
145153
# We don't want the ArrayV2Metadata constructor to fail just because someone put an
@@ -155,6 +163,14 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata:
155163

156164
def to_dict(self) -> dict[str, JSON]:
157165
zarray_dict = super().to_dict()
166+
167+
if self.dtype.kind in "SV" and self.fill_value is not None:
168+
# There's a relationship between self.dtype and self.fill_value
169+
# that mypy isn't aware of. The fact that we have S or V dtype here
170+
# means we should have a bytes-type fill_value.
171+
fill_value = base64.standard_b64encode(cast(bytes, self.fill_value)).decode("ascii")
172+
zarray_dict["fill_value"] = fill_value
173+
158174
_ = zarray_dict.pop("chunk_grid")
159175
zarray_dict["chunks"] = self.chunk_grid.chunk_shape
160176

0 commit comments

Comments
 (0)