Skip to content

Commit 6636c10

Browse files
committed
wip
1 parent dedeaf6 commit 6636c10

File tree

10 files changed

+254
-131
lines changed

10 files changed

+254
-131
lines changed

src/zarr/abc/codec.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
from collections.abc import Mapping
55
from typing import (
66
TYPE_CHECKING,
7-
ClassVar,
87
Generic,
98
Literal,
109
Self,
@@ -14,7 +13,7 @@
1413
overload,
1514
)
1615

17-
from typing_extensions import Protocol, ReadOnly
16+
from typing_extensions import ReadOnly
1817

1918
from zarr.abc.metadata import Metadata
2019
from zarr.core.buffer import Buffer, NDBuffer

src/zarr/codecs/_v2.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@
1414
CodecJSON,
1515
CodecJSON_V2,
1616
)
17+
from zarr.core.chunk_grids import ChunkGrid
18+
from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType
1719
from zarr.registry import get_ndbuffer_class
1820

1921
if TYPE_CHECKING:
@@ -141,6 +143,40 @@ def _from_json_v3(cls, data: CodecJSON) -> Self:
141143
def compute_encoded_size(self, input_byte_length: int, chunk_spec: ArraySpec) -> int:
    """Report the encoded size for an input of ``input_byte_length`` bytes.

    This implementation does not compute a size; it always raises.

    Raises
    ------
    NotImplementedError
        Always.
    """
    raise NotImplementedError
143145

146+
def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
    """Return a codec with parameters inferred from the array metadata filled in.

    This implementation has nothing to infer, so ``array_spec`` is ignored and
    the codec itself is handed back unchanged.

    Parameters
    ----------
    array_spec : ArraySpec
        The array specification to infer parameters from (unused here).

    Returns
    -------
    Self
        This very instance.
    """
    return self
159+
160+
def validate(
    self,
    *,
    shape: tuple[int, ...],
    dtype: ZDType[TBaseDType, TBaseScalar],
    chunk_grid: ChunkGrid,
) -> None:
    """Validate that the codec configuration is compatible with the array metadata.

    This implementation performs no checks: it accepts every combination of
    arguments and returns ``None`` without raising.

    Parameters
    ----------
    shape : tuple[int, ...]
        The array shape
    dtype : ZDType[TBaseDType, TBaseScalar]
        The array data type
    chunk_grid : ChunkGrid
        The array chunk grid
    """
179+
144180
def to_array_array(self) -> NumcodecsArrayArrayCodec:
145181
"""
146182
Use the ``_codec`` attribute to create a NumcodecsArrayArrayCodec.

src/zarr/codecs/blosc.py

Lines changed: 153 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -1,59 +1,99 @@
11
from __future__ import annotations
22

33
import asyncio
4+
from collections.abc import Mapping
45
from dataclasses import dataclass, replace
5-
from enum import Enum
66
from functools import cached_property
7-
from typing import TYPE_CHECKING
7+
from typing import (
8+
TYPE_CHECKING,
9+
Final,
10+
Literal,
11+
NotRequired,
12+
TypedDict,
13+
TypeGuard,
14+
overload,
15+
)
816

917
import numcodecs
1018
from numcodecs.blosc import Blosc
1119
from packaging.version import Version
12-
13-
from zarr.abc.codec import BytesBytesCodec
14-
from zarr.core.buffer.cpu import as_numpy_array_wrapper
15-
from zarr.core.common import JSON, parse_enum, parse_named_configuration
20+
from typing_extensions import ReadOnly
21+
22+
from zarr.abc.codec import BytesBytesCodec, CodecJSON
23+
from zarr.core.common import (
24+
JSON,
25+
NamedRequiredConfig,
26+
ZarrFormat,
27+
)
1628
from zarr.core.dtype.common import HasItemSize
29+
from zarr.errors import CodecValidationError
1730

1831
if TYPE_CHECKING:
1932
from typing import Self
2033

2134
from zarr.core.array_spec import ArraySpec
2235
from zarr.core.buffer import Buffer
2336

37+
# Names of the shuffle filters understood by the blosc codec.
BloscShuffle = Literal["noshuffle", "shuffle", "bitshuffle"]
# NOTE: order is significant. The position of a name in this tuple is used as
# the integer shuffle code in the Zarr V2 JSON form and in the numcodecs config.
BLOSC_SHUFFLE: Final = (
    "noshuffle",
    "shuffle",
    "bitshuffle",
)

# Names of the compression libraries that can be selected through ``cname``.
BloscCname = Literal["lz4", "lz4hc", "blosclz", "zstd", "snappy", "zlib"]
BLOSC_CNAME: Final = (
    "lz4",
    "lz4hc",
    "blosclz",
    "zstd",
    "snappy",
    "zlib",
)
42+
43+
44+
class BloscConfigV2(TypedDict):
45+
cname: BloscCname
46+
clevel: int
47+
shuffle: int
48+
blocksize: int
49+
typesize: NotRequired[int]
2450

25-
class BloscShuffle(Enum):
51+
52+
class BloscConfigV3(TypedDict):
    """
    Contents of the ``configuration`` mapping of the Blosc codec in Zarr V3 metadata.
    """

    # Name of the compression library.
    cname: BloscCname
    # Compression level.
    clevel: int
    # Shuffle filter, stored by name in V3.
    shuffle: BloscShuffle
    # Blosc block size.
    blocksize: int
    # Item size in bytes; mandatory in V3.
    typesize: int
58+
59+
60+
class BloscJSON_V2(BloscConfigV2):
    """
    The JSON form of the Blosc codec in Zarr V2.

    Extends the V2 configuration fields with the ``id`` discriminator.
    """

    # Codec identifier; always the literal "blosc".
    id: ReadOnly[Literal["blosc"]]
4466

4567

46-
class BloscCname(Enum):
68+
class BloscJSON_V3(NamedRequiredConfig[Literal["blosc"], BloscConfigV3]):
    """
    The JSON form of the Blosc codec in Zarr V3.

    A named configuration whose name is the literal "blosc" and whose
    configuration is a ``BloscConfigV3`` mapping.
    """
5072

51-
lz4 = "lz4"
52-
lz4hc = "lz4hc"
53-
blosclz = "blosclz"
54-
zstd = "zstd"
55-
snappy = "snappy"
56-
zlib = "zlib"
73+
74+
def check_json_v2(data: CodecJSON) -> TypeGuard[BloscJSON_V2]:
75+
return (
76+
isinstance(data, Mapping)
77+
and set(data.keys()) == {"id", "clevel", "cname", "shuffle", "blocksize"}
78+
and data["id"] == "blosc"
79+
)
80+
81+
82+
def check_json_v3(data: CodecJSON) -> TypeGuard[BloscJSON_V3]:
83+
return (
84+
isinstance(data, Mapping)
85+
and set(data.keys()) == {"name", "configuration"}
86+
and data["name"] == "blosc"
87+
and isinstance(data["configuration"], Mapping)
88+
and set(data["configuration"].keys())
89+
== {"cname", "clevel", "shuffle", "blocksize", "typesize"}
90+
)
91+
92+
93+
def parse_cname(value: object) -> BloscCname:
    """Return ``value`` unchanged if it is one of the names in ``BLOSC_CNAME``.

    Raises
    ------
    ValueError
        If ``value`` is not contained in ``BLOSC_CNAME``.
    """
    if value in BLOSC_CNAME:
        return value
    raise ValueError(f"Value must be one of {BLOSC_CNAME}. Got {value} instead.")
5797

5898

5999
# See https://zarr.readthedocs.io/en/stable/user-guide/performance.html#configuring-blosc
@@ -84,31 +124,35 @@ def parse_blocksize(data: JSON) -> int:
84124
raise TypeError(f"Value should be an int. Got {type(data)} instead.")
85125

86126

127+
def parse_shuffle(data: object) -> BloscShuffle:
    """Return ``data`` unchanged if it is one of the names in ``BLOSC_SHUFFLE``.

    Raises
    ------
    TypeError
        If ``data`` is not contained in ``BLOSC_SHUFFLE``.
    """
    # NOTE(review): parse_cname signals the same kind of failure with
    # ValueError; confirm whether TypeError is intentional here.
    if data not in BLOSC_SHUFFLE:
        raise TypeError(f"Value must be one of {BLOSC_SHUFFLE}. Got {data} instead.")
    return data  # type: ignore[return-value]
131+
132+
87133
@dataclass(frozen=True)
88134
class BloscCodec(BytesBytesCodec):
89-
"""blosc codec"""
90-
91135
is_fixed_size = False
92136

93137
typesize: int | None
94-
cname: BloscCname = BloscCname.zstd
95-
clevel: int = 5
96-
shuffle: BloscShuffle | None = BloscShuffle.noshuffle
97-
blocksize: int = 0
138+
cname: BloscCname
139+
clevel: int
140+
shuffle: BloscShuffle | None
141+
blocksize: int
98142

99143
def __init__(
100144
self,
101145
*,
102146
typesize: int | None = None,
103-
cname: BloscCname | str = BloscCname.zstd,
147+
cname: BloscCname = "zstd",
104148
clevel: int = 5,
105-
shuffle: BloscShuffle | str | None = None,
149+
shuffle: BloscShuffle | None = None,
106150
blocksize: int = 0,
107151
) -> None:
108152
typesize_parsed = parse_typesize(typesize) if typesize is not None else None
109-
cname_parsed = parse_enum(cname, BloscCname)
153+
cname_parsed = parse_cname(cname)
110154
clevel_parsed = parse_clevel(clevel)
111-
shuffle_parsed = parse_enum(shuffle, BloscShuffle) if shuffle is not None else None
155+
shuffle_parsed = parse_shuffle(shuffle) if shuffle is not None else None
112156
blocksize_parsed = parse_blocksize(blocksize)
113157

114158
object.__setattr__(self, "typesize", typesize_parsed)
@@ -119,24 +163,74 @@ def __init__(
119163

120164
@classmethod
def from_dict(cls, data: dict[str, JSON]) -> Self:
    """Build a codec from its dictionary form.

    Delegates to ``from_json``, treating ``data`` as Zarr V3 JSON.
    """
    codec = cls.from_json(data, zarr_format=3)
    return codec
124167

125168
def to_dict(self) -> dict[str, JSON]:
    """Serialize the codec to its dictionary form.

    Delegates to ``to_json`` with the Zarr V3 format.
    """
    as_json = self.to_json(zarr_format=3)
    return as_json
170+
171+
@classmethod
def _from_json_v2(cls, data: CodecJSON) -> Self:
    """Create a codec from the Zarr V2 JSON form of the blosc codec.

    Parameters
    ----------
    data : CodecJSON
        The V2 JSON document, as accepted by ``check_json_v2``.

    Returns
    -------
    Self
        A codec configured from ``data``.

    Raises
    ------
    CodecValidationError
        If ``data`` does not have the structure of the V2 blosc JSON form, or
        if its ``shuffle`` code is not a recognised integer.
    """
    if not check_json_v2(data):
        msg = (
            "Invalid Zarr V2 JSON representation of the blosc codec. "
            f"Got {data!r}, expected a Mapping with keys ('id', 'cname', 'clevel', 'shuffle', 'blocksize', 'typesize')"
        )
        raise CodecValidationError(msg)

    shuffle_code = data["shuffle"]
    # Validate before indexing: a plain tuple lookup would silently turn -1
    # into "bitshuffle" via negative indexing, and would leak IndexError /
    # TypeError for anything else out of range or non-integer.
    if not isinstance(shuffle_code, int) or not -1 <= shuffle_code < len(BLOSC_SHUFFLE):
        raise CodecValidationError(
            "Invalid shuffle value in the Zarr V2 JSON representation of the blosc codec. "
            f"Got {shuffle_code!r}, expected an integer between -1 and {len(BLOSC_SHUFFLE) - 1}."
        )
    # -1 is numcodecs' AUTOSHUFFLE. Leaving shuffle unset defers the choice
    # until the codec is evolved against an array spec (presumably the same
    # item-size based rule numcodecs applies; confirm against numcodecs docs).
    shuffle = None if shuffle_code == -1 else BLOSC_SHUFFLE[shuffle_code]

    return cls(
        cname=data["cname"],
        clevel=data["clevel"],
        shuffle=shuffle,
        blocksize=data["blocksize"],
        typesize=data.get("typesize", None),
    )
186+
187+
@classmethod
def _from_json_v3(cls, data: CodecJSON) -> Self:
    """Create a codec from the Zarr V3 JSON form of the blosc codec.

    Parameters
    ----------
    data : CodecJSON
        The V3 JSON document, as accepted by ``check_json_v3``.

    Returns
    -------
    Self
        A codec configured from the ``configuration`` mapping of ``data``.

    Raises
    ------
    CodecValidationError
        If ``data`` does not have the structure of the V3 blosc JSON form.
    """
    if not check_json_v3(data):
        # The two sentences are separate literals; keep an explicit ". "
        # between them so the message does not run together.
        msg = (
            "Invalid Zarr V3 JSON representation of the blosc codec. "
            f"Got {data!r}, expected a Mapping with keys ('name', 'configuration'). "
            "Where the 'configuration' key is a Mapping with keys ('cname', 'clevel', 'shuffle', 'blocksize', 'typesize')"
        )
        raise CodecValidationError(msg)

    config = data["configuration"]
    return cls(
        typesize=config["typesize"],
        cname=config["cname"],
        clevel=config["clevel"],
        shuffle=config["shuffle"],
        blocksize=config["blocksize"],
    )
203+
204+
@overload
def to_json(self, zarr_format: Literal[2]) -> BloscJSON_V2: ...
@overload
def to_json(self, zarr_format: Literal[3]) -> BloscJSON_V3: ...

def to_json(self, zarr_format: ZarrFormat) -> BloscJSON_V2 | BloscJSON_V3:
    """Serialize the codec to its JSON form for the given Zarr format.

    Parameters
    ----------
    zarr_format : ZarrFormat
        ``2`` for the flat V2 form keyed by ``id``, ``3`` for the V3 form with
        ``name`` and ``configuration``.

    Returns
    -------
    BloscJSON_V2 | BloscJSON_V3
        The JSON-compatible mapping describing this codec.

    Raises
    ------
    ValueError
        If ``typesize`` or ``shuffle`` is still unset, or if ``zarr_format``
        is neither 2 nor 3.
    """
    # Both fields may be left as None at construction time and must be filled
    # in before the codec can be serialized.
    if self.typesize is None or self.shuffle is None:
        raise ValueError("typesize and shuffle need to be set for encoding.")
    if zarr_format == 2:
        return {
            "id": "blosc",
            "clevel": self.clevel,
            "cname": self.cname,
            # V2 stores the shuffle filter as its position in BLOSC_SHUFFLE.
            "shuffle": BLOSC_SHUFFLE.index(self.shuffle),
            "blocksize": self.blocksize,
        }
    elif zarr_format == 3:
        return {
            "name": "blosc",
            "configuration": {
                "clevel": self.clevel,
                "cname": self.cname,
                "shuffle": self.shuffle,
                "typesize": self.typesize,
                "blocksize": self.blocksize,
            },
        }
    raise ValueError(
        f"Unsupported Zarr format {zarr_format}. Expected 2 or 3."
    )  # pragma: no cover
140234

141235
def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
142236
item_size = 1
@@ -146,26 +240,18 @@ def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
146240
if new_codec.typesize is None:
147241
new_codec = replace(new_codec, typesize=item_size)
148242
if new_codec.shuffle is None:
149-
new_codec = replace(
150-
new_codec,
151-
shuffle=(BloscShuffle.bitshuffle if item_size == 1 else BloscShuffle.shuffle),
152-
)
243+
new_codec = replace(new_codec, shuffle="bitshuffle" if item_size == 1 else "shuffle")
153244

154245
return new_codec
155246

156247
@cached_property
157248
def _blosc_codec(self) -> Blosc:
158249
if self.shuffle is None:
159250
raise ValueError("`shuffle` needs to be set for decoding and encoding.")
160-
map_shuffle_str_to_int = {
161-
BloscShuffle.noshuffle: 0,
162-
BloscShuffle.shuffle: 1,
163-
BloscShuffle.bitshuffle: 2,
164-
}
165251
config_dict = {
166-
"cname": self.cname.name,
252+
"cname": self.cname,
167253
"clevel": self.clevel,
168-
"shuffle": map_shuffle_str_to_int[self.shuffle],
254+
"shuffle": BLOSC_SHUFFLE.index(self.shuffle),
169255
"blocksize": self.blocksize,
170256
}
171257
# See https://github.com/zarr-developers/numcodecs/pull/713
@@ -178,6 +264,8 @@ async def _decode_single(
178264
chunk_bytes: Buffer,
179265
chunk_spec: ArraySpec,
180266
) -> Buffer:
267+
from zarr.core.buffer.cpu import as_numpy_array_wrapper
268+
181269
return await asyncio.to_thread(
182270
as_numpy_array_wrapper, self._blosc_codec.decode, chunk_bytes, chunk_spec.prototype
183271
)

0 commit comments

Comments
 (0)