Skip to content

Commit dedeaf6

Browse files
committed
Merge branch 'chore/handle-numcodecs-codecs' of github.com:d-v-b/zarr-python into feat/v2-v3-codecs
2 parents 666186f + f023487 commit dedeaf6

File tree

10 files changed

+133
-218
lines changed

10 files changed

+133
-218
lines changed

src/zarr/codecs/__init__.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,34 @@
44
from zarr.codecs.bytes import BytesCodec, Endian
55
from zarr.codecs.crc32c_ import Crc32cCodec
66
from zarr.codecs.gzip import GzipCodec
7+
from zarr.codecs.numcodecs import (
8+
BZ2,
9+
CRC32,
10+
CRC32C,
11+
LZ4,
12+
LZMA,
13+
ZFPY,
14+
Adler32,
15+
AsType,
16+
BitRound,
17+
Blosc,
18+
Delta,
19+
FixedScaleOffset,
20+
Fletcher32,
21+
GZip,
22+
JenkinsLookup3,
23+
PackBits,
24+
PCodec,
25+
Quantize,
26+
Shuffle,
27+
Zlib,
28+
Zstd,
29+
)
730
from zarr.codecs.sharding import ShardingCodec, ShardingCodecIndexLocation
831
from zarr.codecs.transpose import TransposeCodec
932
from zarr.codecs.vlen_utf8 import VLenBytesCodec, VLenUTF8Codec
1033
from zarr.codecs.zstd import ZstdCodec
34+
from zarr.registry import register_codec
1135

1236
__all__ = [
1337
"BloscCname",
@@ -24,3 +48,46 @@
2448
"VLenUTF8Codec",
2549
"ZstdCodec",
2650
]
51+
52+
register_codec("blosc", BloscCodec)
53+
register_codec("bytes", BytesCodec)
54+
55+
# Compatibility with earlier versions of ZEP1: "endian" is kept as an alias for the bytes codec.
56+
register_codec("endian", BytesCodec)
57+
register_codec("crc32c", Crc32cCodec)
58+
register_codec("gzip", GzipCodec)
59+
register_codec("sharding_indexed", ShardingCodec)
60+
register_codec("zstd", ZstdCodec)
61+
register_codec("vlen-utf8", VLenUTF8Codec)
62+
register_codec("vlen-bytes", VLenBytesCodec)
63+
register_codec("transpose", TransposeCodec)
64+
65+
# Register all the codecs formerly contained in numcodecs.zarr3
66+
67+
register_codec("numcodecs.bz2", BZ2, qualname="zarr.codecs.numcodecs.BZ2")
68+
register_codec("numcodecs.crc32", CRC32, qualname="zarr.codecs.numcodecs.CRC32")
69+
register_codec("numcodecs.crc32c", CRC32C, qualname="zarr.codecs.numcodecs.CRC32C")
70+
register_codec("numcodecs.lz4", LZ4, qualname="zarr.codecs.numcodecs.LZ4")
71+
register_codec("numcodecs.lzma", LZMA, qualname="zarr.codecs.numcodecs.LZMA")
72+
register_codec("numcodecs.zfpy", ZFPY, qualname="zarr.codecs.numcodecs.ZFPY")
73+
register_codec("numcodecs.adler32", Adler32, qualname="zarr.codecs.numcodecs.Adler32")
74+
register_codec("numcodecs.astype", AsType, qualname="zarr.codecs.numcodecs.AsType")
75+
register_codec("numcodecs.bitround", BitRound, qualname="zarr.codecs.numcodecs.BitRound")
76+
register_codec("numcodecs.blosc", Blosc, qualname="zarr.codecs.numcodecs.Blosc")
77+
register_codec("numcodecs.delta", Delta, qualname="zarr.codecs.numcodecs.Delta")
78+
register_codec(
79+
"numcodecs.fixedscaleoffset",
80+
FixedScaleOffset,
81+
qualname="zarr.codecs.numcodecs.FixedScaleOffset",
82+
)
83+
register_codec("numcodecs.fletcher32", Fletcher32, qualname="zarr.codecs.numcodecs.Fletcher32")
84+
register_codec("numcodecs.gzip", GZip, qualname="zarr.codecs.numcodecs.GZip")
85+
register_codec(
86+
"numcodecs.jenkins_lookup3", JenkinsLookup3, qualname="zarr.codecs.numcodecs.JenkinsLookup3"
87+
)
88+
# The qualname must spell the class name exactly as imported (`PCodec`), matching
# the convention used by every other numcodecs registration in this module.
register_codec("numcodecs.pcodec", PCodec, qualname="zarr.codecs.numcodecs.PCodec")
89+
register_codec("numcodecs.packbits", PackBits, qualname="zarr.codecs.numcodecs.PackBits")
90+
register_codec("numcodecs.quantize", Quantize, qualname="zarr.codecs.numcodecs.Quantize")
91+
register_codec("numcodecs.shuffle", Shuffle, qualname="zarr.codecs.numcodecs.Shuffle")
92+
register_codec("numcodecs.zlib", Zlib, qualname="zarr.codecs.numcodecs.Zlib")
93+
register_codec("numcodecs.zstd", Zstd, qualname="zarr.codecs.numcodecs.Zstd")

src/zarr/codecs/blosc.py

Lines changed: 65 additions & 157 deletions
Original file line numberDiff line numberDiff line change
@@ -1,100 +1,59 @@
11
from __future__ import annotations
22

33
import asyncio
4-
from collections.abc import Mapping
54
from dataclasses import dataclass, replace
5+
from enum import Enum
66
from functools import cached_property
7-
from typing import (
8-
TYPE_CHECKING,
9-
Final,
10-
Literal,
11-
NotRequired,
12-
TypedDict,
13-
TypeGuard,
14-
overload,
15-
)
7+
from typing import TYPE_CHECKING
168

179
import numcodecs
1810
from numcodecs.blosc import Blosc
1911
from packaging.version import Version
20-
from typing_extensions import ReadOnly
21-
22-
from zarr.abc.codec import BytesBytesCodec, CodecJSON
23-
from zarr.core.common import (
24-
JSON,
25-
NamedRequiredConfig,
26-
ZarrFormat,
27-
)
12+
13+
from zarr.abc.codec import BytesBytesCodec
14+
from zarr.core.buffer.cpu import as_numpy_array_wrapper
15+
from zarr.core.common import JSON, parse_enum, parse_named_configuration
2816
from zarr.core.dtype.common import HasItemSize
29-
from zarr.errors import CodecValidationError
30-
from zarr.registry import register_codec
3117

3218
if TYPE_CHECKING:
3319
from typing import Self
3420

3521
from zarr.core.array_spec import ArraySpec
3622
from zarr.core.buffer import Buffer
3723

38-
BloscShuffle = Literal["noshuffle", "shuffle", "bitshuffle"]
39-
BLOSC_SHUFFLE: Final = ("noshuffle", "shuffle", "bitshuffle")
40-
41-
BloscCname = Literal["lz4", "lz4hc", "blosclz", "zstd", "snappy", "zlib"]
42-
BLOSC_CNAME: Final = ("lz4", "lz4hc", "blosclz", "zstd", "snappy", "zlib")
43-
44-
45-
class BloscConfigV2(TypedDict):
46-
cname: BloscCname
47-
clevel: int
48-
shuffle: int
49-
blocksize: int
50-
typesize: NotRequired[int]
51-
5224

53-
class BloscConfigV3(TypedDict):
54-
cname: BloscCname
55-
clevel: int
56-
shuffle: BloscShuffle
57-
blocksize: int
58-
typesize: int
59-
60-
61-
class BloscJSON_V2(BloscConfigV2):
25+
class BloscShuffle(Enum):
6226
"""
63-
The JSON form of the Blosc codec in Zarr V2.
27+
Enum for shuffle filter used by blosc.
6428
"""
6529

66-
id: ReadOnly[Literal["blosc"]]
30+
noshuffle = "noshuffle"
31+
shuffle = "shuffle"
32+
bitshuffle = "bitshuffle"
6733

34+
@classmethod
35+
def from_int(cls, num: int) -> BloscShuffle:
36+
blosc_shuffle_int_to_str = {
37+
0: "noshuffle",
38+
1: "shuffle",
39+
2: "bitshuffle",
40+
}
41+
if num not in blosc_shuffle_int_to_str:
42+
raise ValueError(f"Value must be between 0 and 2. Got {num}.")
43+
return BloscShuffle[blosc_shuffle_int_to_str[num]]
6844

69-
class BloscJSON_V3(NamedRequiredConfig[Literal["blosc"], BloscConfigV3]):
45+
46+
class BloscCname(Enum):
7047
"""
71-
The JSON form of the Blosc codec in Zarr V3.
48+
Enum for compression library used by blosc.
7249
"""
7350

74-
75-
def check_json_v2(data: CodecJSON) -> TypeGuard[BloscJSON_V2]:
76-
return (
77-
isinstance(data, Mapping)
78-
and set(data.keys()) == {"id", "clevel", "cname", "shuffle", "blocksize"}
79-
and data["id"] == "blosc"
80-
)
81-
82-
83-
def check_json_v3(data: CodecJSON) -> TypeGuard[BloscJSON_V3]:
84-
return (
85-
isinstance(data, Mapping)
86-
and set(data.keys()) == {"name", "configuration"}
87-
and data["name"] == "blosc"
88-
and isinstance(data["configuration"], Mapping)
89-
and set(data["configuration"].keys())
90-
== {"cname", "clevel", "shuffle", "blocksize", "typesize"}
91-
)
92-
93-
94-
def parse_cname(value: object) -> BloscCname:
95-
if value not in BLOSC_CNAME:
96-
raise ValueError(f"Value must be one of {BLOSC_CNAME}. Got {value} instead.")
97-
return value
51+
lz4 = "lz4"
52+
lz4hc = "lz4hc"
53+
blosclz = "blosclz"
54+
zstd = "zstd"
55+
snappy = "snappy"
56+
zlib = "zlib"
9857

9958

10059
# See https://zarr.readthedocs.io/en/stable/user-guide/performance.html#configuring-blosc
@@ -125,35 +84,31 @@ def parse_blocksize(data: JSON) -> int:
12584
raise TypeError(f"Value should be an int. Got {type(data)} instead.")
12685

12786

128-
def parse_shuffle(data: object) -> BloscShuffle:
129-
if data in BLOSC_SHUFFLE:
130-
return data # type: ignore[return-value]
131-
raise TypeError(f"Value must be one of {BLOSC_SHUFFLE}. Got {data} instead.")
132-
133-
13487
@dataclass(frozen=True)
13588
class BloscCodec(BytesBytesCodec):
89+
"""blosc codec"""
90+
13691
is_fixed_size = False
13792

13893
typesize: int | None
139-
cname: BloscCname
140-
clevel: int
141-
shuffle: BloscShuffle | None
142-
blocksize: int
94+
cname: BloscCname = BloscCname.zstd
95+
clevel: int = 5
96+
shuffle: BloscShuffle | None = BloscShuffle.noshuffle
97+
blocksize: int = 0
14398

14499
def __init__(
145100
self,
146101
*,
147102
typesize: int | None = None,
148-
cname: BloscCname = "zstd",
103+
cname: BloscCname | str = BloscCname.zstd,
149104
clevel: int = 5,
150-
shuffle: BloscShuffle | None = None,
105+
shuffle: BloscShuffle | str | None = None,
151106
blocksize: int = 0,
152107
) -> None:
153108
typesize_parsed = parse_typesize(typesize) if typesize is not None else None
154-
cname_parsed = parse_cname(cname)
109+
cname_parsed = parse_enum(cname, BloscCname)
155110
clevel_parsed = parse_clevel(clevel)
156-
shuffle_parsed = parse_shuffle(shuffle) if shuffle is not None else None
111+
shuffle_parsed = parse_enum(shuffle, BloscShuffle) if shuffle is not None else None
157112
blocksize_parsed = parse_blocksize(blocksize)
158113

159114
object.__setattr__(self, "typesize", typesize_parsed)
@@ -164,74 +119,24 @@ def __init__(
164119

165120
@classmethod
166121
def from_dict(cls, data: dict[str, JSON]) -> Self:
167-
return cls.from_json(data, zarr_format=3)
122+
_, configuration_parsed = parse_named_configuration(data, "blosc")
123+
return cls(**configuration_parsed) # type: ignore[arg-type]
168124

169125
def to_dict(self) -> dict[str, JSON]:
170-
return self.to_json(zarr_format=3)
171-
172-
@classmethod
173-
def _from_json_v2(cls, data: CodecJSON) -> Self:
174-
if check_json_v2(data):
175-
return cls(
176-
cname=data["cname"],
177-
clevel=data["clevel"],
178-
shuffle=BLOSC_SHUFFLE[data["shuffle"]],
179-
blocksize=data["blocksize"],
180-
typesize=data.get("typesize", None),
181-
)
182-
msg = (
183-
"Invalid Zarr V2 JSON representation of the blosc codec. "
184-
f"Got {data!r}, expected a Mapping with keys ('id', 'cname', 'clevel', 'shuffle', 'blocksize', 'typesize')"
185-
)
186-
raise CodecValidationError(msg)
187-
188-
@classmethod
189-
def _from_json_v3(cls, data: CodecJSON) -> Self:
190-
if check_json_v3(data):
191-
return cls(
192-
typesize=data["configuration"]["typesize"],
193-
cname=data["configuration"]["cname"],
194-
clevel=data["configuration"]["clevel"],
195-
shuffle=data["configuration"]["shuffle"],
196-
blocksize=data["configuration"]["blocksize"],
197-
)
198-
msg = (
199-
"Invalid Zarr V3 JSON representation of the blosc codec. "
200-
f"Got {data!r}, expected a Mapping with keys ('name', 'configuration')"
201-
"Where the 'configuration' key is a Mapping with keys ('cname', 'clevel', 'shuffle', 'blocksize', 'typesize')"
202-
)
203-
raise CodecValidationError(msg)
204-
205-
@overload
206-
def to_json(self, zarr_format: Literal[2]) -> BloscJSON_V2: ...
207-
@overload
208-
def to_json(self, zarr_format: Literal[3]) -> BloscJSON_V3: ...
209-
210-
def to_json(self, zarr_format: ZarrFormat) -> BloscJSON_V2 | BloscJSON_V3:
211-
if self.typesize is None or self.shuffle is None:
212-
raise ValueError("typesize and blocksize need to be set for encoding.")
213-
if zarr_format == 2:
214-
return {
215-
"id": "blosc",
126+
if self.typesize is None:
127+
raise ValueError("`typesize` needs to be set for serialization.")
128+
if self.shuffle is None:
129+
raise ValueError("`shuffle` needs to be set for serialization.")
130+
return {
131+
"name": "blosc",
132+
"configuration": {
133+
"typesize": self.typesize,
134+
"cname": self.cname.value,
216135
"clevel": self.clevel,
217-
"cname": self.cname,
218-
"shuffle": BLOSC_SHUFFLE.index(self.shuffle),
136+
"shuffle": self.shuffle.value,
219137
"blocksize": self.blocksize,
220-
}
221-
elif zarr_format == 3:
222-
return {
223-
"name": "blosc",
224-
"configuration": {
225-
"clevel": self.clevel,
226-
"cname": self.cname,
227-
"shuffle": self.shuffle,
228-
"typesize": self.typesize,
229-
"blocksize": self.blocksize,
230-
},
231-
}
232-
raise ValueError(
233-
f"Unsupported Zarr format {zarr_format}. Expected 2 or 3."
234-
) # pragma: no cover
138+
},
139+
}
235140

236141
def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
237142
item_size = 1
@@ -241,18 +146,26 @@ def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
241146
if new_codec.typesize is None:
242147
new_codec = replace(new_codec, typesize=item_size)
243148
if new_codec.shuffle is None:
244-
new_codec = replace(new_codec, shuffle="bitshuffle" if item_size == 1 else "shuffle")
149+
new_codec = replace(
150+
new_codec,
151+
shuffle=(BloscShuffle.bitshuffle if item_size == 1 else BloscShuffle.shuffle),
152+
)
245153

246154
return new_codec
247155

248156
@cached_property
249157
def _blosc_codec(self) -> Blosc:
250158
if self.shuffle is None:
251159
raise ValueError("`shuffle` needs to be set for decoding and encoding.")
160+
map_shuffle_str_to_int = {
161+
BloscShuffle.noshuffle: 0,
162+
BloscShuffle.shuffle: 1,
163+
BloscShuffle.bitshuffle: 2,
164+
}
252165
config_dict = {
253-
"cname": self.cname,
166+
"cname": self.cname.name,
254167
"clevel": self.clevel,
255-
"shuffle": BLOSC_SHUFFLE.index(self.shuffle),
168+
"shuffle": map_shuffle_str_to_int[self.shuffle],
256169
"blocksize": self.blocksize,
257170
}
258171
# See https://github.com/zarr-developers/numcodecs/pull/713
@@ -265,8 +178,6 @@ async def _decode_single(
265178
chunk_bytes: Buffer,
266179
chunk_spec: ArraySpec,
267180
) -> Buffer:
268-
from zarr.core.buffer.cpu import as_numpy_array_wrapper
269-
270181
return await asyncio.to_thread(
271182
as_numpy_array_wrapper, self._blosc_codec.decode, chunk_bytes, chunk_spec.prototype
272183
)
@@ -287,6 +198,3 @@ async def _encode_single(
287198

288199
def compute_encoded_size(self, _input_byte_length: int, _chunk_spec: ArraySpec) -> int:
289200
raise NotImplementedError
290-
291-
292-
register_codec("blosc", BloscCodec)

0 commit comments

Comments
 (0)