Skip to content

Commit 156134f

Browse files
committed
add to_json methods to codecs
1 parent 95c9c8b commit 156134f

File tree

8 files changed

+760
-138
lines changed

8 files changed

+760
-138
lines changed

src/zarr/codecs/blosc.py

Lines changed: 139 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,22 @@
11
from __future__ import annotations
22

33
import asyncio
4+
from collections.abc import Mapping
45
from dataclasses import dataclass, replace
5-
from enum import Enum
66
from functools import cached_property
7-
from typing import TYPE_CHECKING
7+
from typing import TYPE_CHECKING, Final, Literal, NotRequired, TypedDict, TypeGuard, overload
88

99
import numcodecs
1010
from numcodecs.blosc import Blosc
1111
from packaging.version import Version
1212

13-
from zarr.abc.codec import BytesBytesCodec
13+
from zarr.abc.codec import BytesBytesCodec, CodecJSON, CodecJSON_V2, CodecValidationError
1414
from zarr.core.buffer.cpu import as_numpy_array_wrapper
15-
from zarr.core.common import JSON, parse_enum, parse_named_configuration
15+
from zarr.core.common import (
16+
JSON,
17+
NamedRequiredConfig,
18+
ZarrFormat,
19+
)
1620
from zarr.core.dtype.common import HasItemSize
1721
from zarr.registry import register_codec
1822

@@ -22,39 +26,64 @@
2226
from zarr.core.array_spec import ArraySpec
2327
from zarr.core.buffer import Buffer
2428

29+
BloscShuffle = Literal["noshuffle", "shuffle", "bitshuffle"]
30+
BLOSC_SHUFFLE: Final = ("noshuffle", "shuffle", "bitshuffle")
2531

26-
class BloscShuffle(Enum):
27-
"""
28-
Enum for shuffle filter used by blosc.
29-
"""
32+
BloscCname = Literal["lz4", "lz4hc", "blosclz", "zstd", "snappy", "zlib"]
33+
BLOSC_CNAME: Final = ("lz4", "lz4hc", "blosclz", "zstd", "snappy", "zlib")
3034

31-
noshuffle = "noshuffle"
32-
shuffle = "shuffle"
33-
bitshuffle = "bitshuffle"
3435

35-
@classmethod
36-
def from_int(cls, num: int) -> BloscShuffle:
37-
blosc_shuffle_int_to_str = {
38-
0: "noshuffle",
39-
1: "shuffle",
40-
2: "bitshuffle",
41-
}
42-
if num not in blosc_shuffle_int_to_str:
43-
raise ValueError(f"Value must be between 0 and 2. Got {num}.")
44-
return BloscShuffle[blosc_shuffle_int_to_str[num]]
36+
class BloscConfigV2(TypedDict):
    """
    Blosc parameters as they appear in the Zarr V2 JSON form of the codec.

    Unlike the V3 configuration, ``shuffle`` is an integer here and
    ``typesize`` may be omitted.
    """

    # Name of the compressor blosc should use.
    cname: BloscCname
    # Compression level.
    clevel: int
    # Integer shuffle code; it is used as a position in ``BLOSC_SHUFFLE``.
    shuffle: int
    # Block size handed to blosc.
    blocksize: int
    # Optional key: not every V2 document carries it.
    typesize: NotRequired[int]
4542

4643

47-
class BloscCname(Enum):
44+
class BloscConfigV3(TypedDict):
    """
    Blosc parameters as they appear under ``configuration`` in the Zarr V3
    JSON form of the codec.

    All five keys are required, and ``shuffle`` is one of the string names
    in ``BloscShuffle`` rather than an integer code.
    """

    # Name of the compressor blosc should use.
    cname: BloscCname
    # Compression level.
    clevel: int
    # Shuffle filter, spelled out by name.
    shuffle: BloscShuffle
    # Block size handed to blosc.
    blocksize: int
    # Size in bytes of one element; mandatory in the V3 form.
    typesize: int
50+
51+
52+
class BloscJSON_V2(CodecJSON_V2[Literal["blosc"]], BloscConfigV2):
4853
"""
49-
Enum for compression library used by blosc.
54+
The JSON form of the Blosc codec in Zarr V2.
5055
"""
5156

52-
lz4 = "lz4"
53-
lz4hc = "lz4hc"
54-
blosclz = "blosclz"
55-
zstd = "zstd"
56-
snappy = "snappy"
57-
zlib = "zlib"
57+
58+
class BloscJSON_V3(NamedRequiredConfig[Literal["blosc"], BloscConfigV3]):
59+
"""
60+
The JSON form of the Blosc codec in Zarr V3.
61+
"""
62+
63+
64+
def check_json_v2(data: CodecJSON) -> TypeGuard[BloscJSON_V2]:
65+
return (
66+
isinstance(data, Mapping)
67+
and set(data.keys()) == {"id", "clevel", "cname", "shuffle", "blocksize"}
68+
and data["id"] == "blosc"
69+
)
70+
71+
72+
def check_json_v3(data: CodecJSON) -> TypeGuard[BloscJSON_V3]:
73+
return (
74+
isinstance(data, Mapping)
75+
and set(data.keys()) == {"name", "configuration"}
76+
and data["name"] == "blosc"
77+
and isinstance(data["configuration"], Mapping)
78+
and set(data["configuration"].keys())
79+
== {"cname", "clevel", "shuffle", "blocksize", "typesize"}
80+
)
81+
82+
83+
def parse_cname(value: object) -> BloscCname:
    """
    Validate a blosc compressor name.

    Returns ``value`` unchanged when it is one of the names in ``BLOSC_CNAME``.

    Raises
    ------
    ValueError
        If ``value`` is not one of the names in ``BLOSC_CNAME``.
    """
    if value in BLOSC_CNAME:
        return value
    raise ValueError(f"Value must be one of {BLOSC_CNAME}. Got {value} instead.")
5887

5988

6089
# See https://zarr.readthedocs.io/en/stable/user-guide/performance.html#configuring-blosc
@@ -85,31 +114,35 @@ def parse_blocksize(data: JSON) -> int:
85114
raise TypeError(f"Value should be an int. Got {type(data)} instead.")
86115

87116

117+
def parse_shuffle(data: object) -> BloscShuffle:
    """
    Validate a blosc shuffle name.

    Returns ``data`` unchanged when it is one of the names in ``BLOSC_SHUFFLE``.

    Raises
    ------
    TypeError
        If ``data`` is not one of the names in ``BLOSC_SHUFFLE``.
    """
    if data not in BLOSC_SHUFFLE:
        raise TypeError(f"Value must be one of {BLOSC_SHUFFLE}. Got {data} instead.")
    return data  # type: ignore[return-value]
121+
122+
88123
@dataclass(frozen=True)
89124
class BloscCodec(BytesBytesCodec):
90-
"""blosc codec"""
91-
92125
is_fixed_size = False
93126

94127
typesize: int | None
95-
cname: BloscCname = BloscCname.zstd
96-
clevel: int = 5
97-
shuffle: BloscShuffle | None = BloscShuffle.noshuffle
98-
blocksize: int = 0
128+
cname: BloscCname
129+
clevel: int
130+
shuffle: BloscShuffle | None
131+
blocksize: int
99132

100133
def __init__(
101134
self,
102135
*,
103136
typesize: int | None = None,
104-
cname: BloscCname | str = BloscCname.zstd,
137+
cname: BloscCname = "zstd",
105138
clevel: int = 5,
106-
shuffle: BloscShuffle | str | None = None,
139+
shuffle: BloscShuffle | None = None,
107140
blocksize: int = 0,
108141
) -> None:
109142
typesize_parsed = parse_typesize(typesize) if typesize is not None else None
110-
cname_parsed = parse_enum(cname, BloscCname)
143+
cname_parsed = parse_cname(cname)
111144
clevel_parsed = parse_clevel(clevel)
112-
shuffle_parsed = parse_enum(shuffle, BloscShuffle) if shuffle is not None else None
145+
shuffle_parsed = parse_shuffle(shuffle) if shuffle is not None else None
113146
blocksize_parsed = parse_blocksize(blocksize)
114147

115148
object.__setattr__(self, "typesize", typesize_parsed)
@@ -120,24 +153,74 @@ def __init__(
120153

121154
@classmethod
122155
def from_dict(cls, data: dict[str, JSON]) -> Self:
123-
_, configuration_parsed = parse_named_configuration(data, "blosc")
124-
return cls(**configuration_parsed) # type: ignore[arg-type]
156+
return cls.from_json(data, zarr_format=3)
125157

126158
def to_dict(self) -> dict[str, JSON]:
127-
if self.typesize is None:
128-
raise ValueError("`typesize` needs to be set for serialization.")
129-
if self.shuffle is None:
130-
raise ValueError("`shuffle` needs to be set for serialization.")
131-
return {
132-
"name": "blosc",
133-
"configuration": {
134-
"typesize": self.typesize,
135-
"cname": self.cname.value,
159+
return self.to_json(zarr_format=3)
160+
161+
@classmethod
def _from_json_v2(cls, data: CodecJSON) -> Self:
    """
    Build a codec from the Zarr V2 JSON form of the blosc codec.

    The V2 form stores ``shuffle`` as an integer code. Codes 0, 1 and 2 are
    translated to the matching name in ``BLOSC_SHUFFLE``. The code -1 is
    numcodecs' AUTOSHUFFLE ("bitshuffle for itemsize 1, shuffle otherwise");
    it is translated to ``shuffle=None`` so that the concrete filter is
    picked later by ``evolve_from_array_spec``, which applies that same rule.

    Raises
    ------
    CodecValidationError
        If ``data`` is not shaped like the V2 form, or if its ``shuffle``
        code is not one of -1, 0, 1, 2.
    """
    if not check_json_v2(data):
        msg = (
            "Invalid Zarr V2 JSON representation of the blosc codec. "
            f"Got {data!r}, expected a Mapping with keys ('id', 'cname', 'clevel', 'shuffle', 'blocksize', 'typesize')"
        )
        raise CodecValidationError(msg)

    shuffle_code = data["shuffle"]
    # Plain tuple indexing would accept negative codes (silently wrapping
    # around) and fail with IndexError/TypeError on anything else, so the
    # code is validated explicitly before it is used as an index.
    if not isinstance(shuffle_code, int) or not -1 <= shuffle_code < len(BLOSC_SHUFFLE):
        raise CodecValidationError(
            "Invalid shuffle value in the Zarr V2 JSON representation of the blosc codec. "
            f"Got {shuffle_code!r}, expected an integer between -1 and {len(BLOSC_SHUFFLE) - 1}."
        )
    shuffle = None if shuffle_code == -1 else BLOSC_SHUFFLE[shuffle_code]

    return cls(
        cname=data["cname"],
        clevel=data["clevel"],
        shuffle=shuffle,
        blocksize=data["blocksize"],
        typesize=data.get("typesize", None),
    )
176+
177+
@classmethod
def _from_json_v3(cls, data: CodecJSON) -> Self:
    """
    Build a codec from the Zarr V3 JSON form of the blosc codec.

    The five values under ``configuration`` are passed straight to the
    constructor, which performs the per-field validation.

    Raises
    ------
    CodecValidationError
        If ``data`` is not shaped like the V3 form.
    """
    if not check_json_v3(data):
        # The two halves of the sentence used to be concatenated without any
        # separator; they are now joined with ", " so the message reads
        # correctly.
        msg = (
            "Invalid Zarr V3 JSON representation of the blosc codec. "
            f"Got {data!r}, expected a Mapping with keys ('name', 'configuration'), "
            "where the 'configuration' key is a Mapping with keys "
            "('cname', 'clevel', 'shuffle', 'blocksize', 'typesize')"
        )
        raise CodecValidationError(msg)

    config = data["configuration"]
    return cls(
        typesize=config["typesize"],
        cname=config["cname"],
        clevel=config["clevel"],
        shuffle=config["shuffle"],
        blocksize=config["blocksize"],
    )
193+
194+
@overload
def to_json(self, zarr_format: Literal[2]) -> BloscJSON_V2: ...
@overload
def to_json(self, zarr_format: Literal[3]) -> BloscJSON_V3: ...

def to_json(self, zarr_format: ZarrFormat) -> BloscJSON_V2 | BloscJSON_V3:
    """
    Serialize this codec to its JSON form for the given Zarr format.

    Parameters
    ----------
    zarr_format
        ``2`` for the flat V2 form (``id`` plus parameters, with ``shuffle``
        as an integer code), ``3`` for the V3 form (``name`` plus a
        ``configuration`` mapping, with ``shuffle`` as a string).

    Raises
    ------
    ValueError
        If ``shuffle`` is unset, if ``typesize`` is unset and the V3 form is
        requested, or if ``zarr_format`` is neither 2 nor 3.
    """
    # Both forms contain the shuffle filter, so it must be known up front.
    if self.shuffle is None:
        raise ValueError("`shuffle` needs to be set for serialization.")

    if zarr_format == 2:
        # The V2 form has no ``typesize`` entry, so an unset typesize is not
        # an obstacle here.
        return {
            "id": "blosc",
            "clevel": self.clevel,
            "cname": self.cname,
            "shuffle": BLOSC_SHUFFLE.index(self.shuffle),
            "blocksize": self.blocksize,
        }

    if zarr_format == 3:
        if self.typesize is None:
            raise ValueError("`typesize` needs to be set for serialization.")
        return {
            "name": "blosc",
            "configuration": {
                "clevel": self.clevel,
                "cname": self.cname,
                "shuffle": self.shuffle,
                "typesize": self.typesize,
                "blocksize": self.blocksize,
            },
        }

    raise ValueError(
        f"Unsupported Zarr format {zarr_format}. Expected 2 or 3."
    )  # pragma: no cover
141224

142225
def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
143226
item_size = 1
@@ -147,26 +230,18 @@ def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
147230
if new_codec.typesize is None:
148231
new_codec = replace(new_codec, typesize=item_size)
149232
if new_codec.shuffle is None:
150-
new_codec = replace(
151-
new_codec,
152-
shuffle=(BloscShuffle.bitshuffle if item_size == 1 else BloscShuffle.shuffle),
153-
)
233+
new_codec = replace(new_codec, shuffle="bitshuffle" if item_size == 1 else "shuffle")
154234

155235
return new_codec
156236

157237
@cached_property
158238
def _blosc_codec(self) -> Blosc:
159239
if self.shuffle is None:
160240
raise ValueError("`shuffle` needs to be set for decoding and encoding.")
161-
map_shuffle_str_to_int = {
162-
BloscShuffle.noshuffle: 0,
163-
BloscShuffle.shuffle: 1,
164-
BloscShuffle.bitshuffle: 2,
165-
}
166241
config_dict = {
167-
"cname": self.cname.name,
242+
"cname": self.cname,
168243
"clevel": self.clevel,
169-
"shuffle": map_shuffle_str_to_int[self.shuffle],
244+
"shuffle": BLOSC_SHUFFLE.index(self.shuffle),
170245
"blocksize": self.blocksize,
171246
}
172247
# See https://github.com/zarr-developers/numcodecs/pull/713

0 commit comments

Comments
 (0)