Skip to content

Commit 8566a8f

Browse files
committed
make cli tests pass
1 parent cbf71a6 commit 8566a8f

File tree

4 files changed

+75
-161
lines changed

4 files changed

+75
-161
lines changed

src/zarr/codecs/blosc.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -232,7 +232,7 @@ def _from_json_v2(cls, data: CodecJSON) -> Self:
232232
clevel=data["clevel"],
233233
shuffle=BLOSC_SHUFFLE[data["shuffle"]],
234234
blocksize=data["blocksize"],
235-
typesize=data.get("typesize", 1),
235+
typesize=data.get("typesize"),
236236
)
237237
msg = (
238238
"Invalid Zarr V2 JSON representation of the blosc codec. "

src/zarr/core/array.py

Lines changed: 40 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,44 @@ def parse_array_metadata(data: Any) -> ArrayMetadata:
197197
raise TypeError # pragma: no cover
198198

199199

200+
def v2_to_v3_codecs(metadata: ArrayV2Metadata) -> tuple[Codec | NumcodecWrapper, ...]:
    """
    Convert the filters and compressor from Zarr v2 to a Zarr-V3-compatible sequence of codecs.

    The sequence is assembled as ``filters + (compressor,)``. A ``BytesCodec`` is inserted
    when no array-bytes codec is present, and a ``TransposeCodec`` is prepended when the
    array is declared with ``order == "F"``.

    Parameters
    ----------
    metadata : ArrayV2Metadata
        The Zarr v2 array metadata. This function reads its ``filters``, ``compressor``,
        ``dtype``, ``order`` and ``ndim`` attributes.

    Returns
    -------
    tuple[Codec | NumcodecWrapper, ...]
        The codecs, in the order they should be handed to a codec pipeline.
    """
    codecs: tuple[Codec | NumcodecWrapper, ...] = ()
    if metadata.filters is not None:
        codecs += metadata.filters
    if metadata.compressor is not None:
        codecs += (metadata.compressor,)

    has_array_bytes_codec = any(isinstance(codec, ArrayBytesCodec) for codec in codecs)
    if not has_array_bytes_codec and not isinstance(metadata.dtype, HasObjectCodec):
        # The role filled by the ArrayBytesCodec was implicit in zarr v2. So a valid zarr v2-style
        # chain of filters + compressor might not contain a codec identifiable as an array-bytes
        # codec. In such a case, we insert a bytes codec. It is configured with the endianness of
        # the data type when the data type declares one, and with ``endian=None`` otherwise.
        # We skip this insertion if the data type is an instance of HasObjectCodec, because
        # in zarr v2 these data types required a special codec that functioned like an
        # array-bytes codec.

        # Find the position just after the last array-array codec, if any. The bytes codec
        # is inserted there; with no array-array codec present it goes to the front.
        abc_idx = 0
        for idx, codec in enumerate(codecs):
            if isinstance(codec, ArrayArrayCodec):
                abc_idx = idx + 1

        out_endianness = (
            metadata.dtype.endianness if isinstance(metadata.dtype, HasEndianness) else None
        )
        codecs = codecs[:abc_idx] + (BytesCodec(endian=out_endianness),) + codecs[abc_idx:]

    if metadata.order == "F":
        # Zarr V2 supports declaring the order of an array in metadata. Using the zarr v3 codec
        # framework, we express F-ordered arrays by adding a transpose codec (with the axes
        # reversed) to the front of the list of codecs.
        codecs = (TransposeCodec(order=tuple(reversed(range(metadata.ndim)))),) + codecs

    return codecs
236+
237+
200238
def create_codec_pipeline(metadata: ArrayMetadata, *, store: Store | None = None) -> CodecPipeline:
201239
if store is not None:
202240
try:
@@ -208,38 +246,8 @@ def create_codec_pipeline(metadata: ArrayMetadata, *, store: Store | None = None
208246

209247
if isinstance(metadata, ArrayV3Metadata):
210248
return get_pipeline_class().from_codecs(metadata.codecs)
211-
elif isinstance(metadata, ArrayV2Metadata):
212-
_codecs: tuple[Codec | NumcodecWrapper, ...] = ()
213-
if metadata.filters is not None:
214-
_codecs += metadata.filters
215-
if metadata.compressor is not None:
216-
_codecs += (metadata.compressor,)
217-
if not any(isinstance(codec, ArrayBytesCodec) for codec in _codecs) and not isinstance(
218-
metadata.dtype, HasObjectCodec
219-
):
220-
# The role filled by the ArrayBytesCodec was implicit in zarr v2. So a valid zarr v2-style
221-
# chain of filters + compressor might not contain a codec identifiable as an array-bytes codec.
222-
# In such a case, we will insert a bytes codec that applies no endian transformation.
223-
# We skip this insertion if the data type is an instance of HasObjectCodec, because
224-
# in zarr v2 these data types required a special codec that functioned like an array bytes codec.
225-
226-
# find the last array-array codec, if any
227-
abc_idx = 0
228-
for idx, codec in enumerate(_codecs):
229-
if isinstance(codec, ArrayArrayCodec):
230-
abc_idx = idx + 1
231-
if isinstance(metadata.dtype, HasEndianness):
232-
out_endianness = metadata.dtype.endianness
233-
else:
234-
out_endianness = None
235-
_codecs = _codecs[:abc_idx] + (BytesCodec(endian=out_endianness),) + _codecs[abc_idx:]
236-
if metadata.order == "F":
237-
# Zarr V2 supports declaring the order of an array in metadata. Using the zarr v3 codec
238-
# framework, we express C or F ordered arrays by adding a transpose codec to the front
239-
# of the list of codecs.
240-
_codecs = (TransposeCodec(order=tuple(reversed(range(metadata.ndim)))),) + _codecs
241-
# We ignore this type check failure because we don't want to change the type signature
242-
# of the from_codecs method yet.
249+
else:
250+
_codecs = v2_to_v3_codecs(metadata)
243251
return get_pipeline_class().from_codecs(_codecs) # type: ignore[arg-type]
244252
raise TypeError # pragma: no cover
245253

src/zarr/metadata/migrate_v3.py

Lines changed: 10 additions & 93 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,14 @@
11
import asyncio
22
import logging
3-
from typing import Literal, cast
4-
5-
import numcodecs.abc
3+
from typing import TYPE_CHECKING, Literal, cast
64

75
import zarr
86
from zarr import Array, Group
9-
from zarr.abc.codec import ArrayArrayCodec, BytesBytesCodec, Codec
107
from zarr.abc.store import Store
11-
from zarr.codecs.blosc import BloscCodec, BloscShuffle
12-
from zarr.codecs.bytes import BytesCodec
13-
from zarr.codecs.gzip import GzipCodec
14-
from zarr.codecs.transpose import TransposeCodec
15-
from zarr.codecs.zstd import ZstdCodec
8+
from zarr.core.array import v2_to_v3_codecs
169
from zarr.core.buffer.core import default_buffer_prototype
1710
from zarr.core.chunk_key_encodings import V2ChunkKeyEncoding
11+
from zarr.core.codec_pipeline import codecs_from_list
1812
from zarr.core.common import (
1913
ZARR_JSON,
2014
ZARRAY_JSON,
@@ -23,15 +17,15 @@
2317
ZMETADATA_V2_JSON,
2418
ZarrFormat,
2519
)
26-
from zarr.core.dtype.common import HasEndianness
27-
from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType
2820
from zarr.core.group import GroupMetadata
2921
from zarr.core.metadata.v2 import ArrayV2Metadata
3022
from zarr.core.metadata.v3 import ArrayV3Metadata
3123
from zarr.core.sync import sync
32-
from zarr.registry import get_codec_class
3324
from zarr.storage import StorePath
3425

26+
if TYPE_CHECKING:
27+
from zarr.abc.codec import Codec
28+
3529
_logger = logging.getLogger(__name__)
3630

3731

@@ -186,27 +180,10 @@ async def _metadata_exists(zarr_format: ZarrFormat, store_path: StorePath) -> bo
186180
def _convert_array_metadata(metadata_v2: ArrayV2Metadata) -> ArrayV3Metadata:
187181
chunk_key_encoding = V2ChunkKeyEncoding(separator=metadata_v2.dimension_separator)
188182

189-
codecs: list[Codec] = []
190-
191-
# array-array codecs
192-
if metadata_v2.order == "F":
193-
# F is equivalent to order: n-1, ... 1, 0
194-
codecs.append(TransposeCodec(order=list(range(len(metadata_v2.shape) - 1, -1, -1))))
195-
196-
if metadata_v2.filters is not None:
197-
codecs.extend(_convert_filters(metadata_v2.filters))
198-
199-
# array-bytes codecs
200-
if not isinstance(metadata_v2.dtype, HasEndianness):
201-
codecs.append(BytesCodec(endian=None))
202-
else:
203-
codecs.append(BytesCodec(endian=metadata_v2.dtype.endianness))
204-
205-
# bytes-bytes codecs
206-
if metadata_v2.compressor is not None:
207-
bytes_bytes_codec = _convert_compressor(metadata_v2.compressor, metadata_v2.dtype)
208-
codecs.append(bytes_bytes_codec)
209-
183+
codecs: tuple[Codec, ...] = ()
184+
# We first generate a sequence of V3 codecs, then we ensure that this sequence is valid
185+
aa, ab, bb = codecs_from_list(v2_to_v3_codecs(metadata_v2))
186+
codecs = (*aa, ab, *bb)
210187
return ArrayV3Metadata(
211188
shape=metadata_v2.shape,
212189
data_type=metadata_v2.dtype,
@@ -220,66 +197,6 @@ def _convert_array_metadata(metadata_v2: ArrayV2Metadata) -> ArrayV3Metadata:
220197
)
221198

222199

223-
def _convert_filters(filters: tuple[numcodecs.abc.Codec, ...]) -> list[ArrayArrayCodec]:
224-
filters_codecs = [_find_numcodecs_zarr3(filter) for filter in filters]
225-
for codec in filters_codecs:
226-
if not isinstance(codec, ArrayArrayCodec):
227-
raise TypeError(f"Filter {type(codec)} is not an ArrayArrayCodec")
228-
229-
return cast(list[ArrayArrayCodec], filters_codecs)
230-
231-
232-
def _convert_compressor(
233-
compressor: numcodecs.abc.Codec, dtype: ZDType[TBaseDType, TBaseScalar]
234-
) -> BytesBytesCodec:
235-
match compressor.codec_id:
236-
case "blosc":
237-
return BloscCodec(
238-
typesize=dtype.to_native_dtype().itemsize,
239-
cname=compressor.cname,
240-
clevel=compressor.clevel,
241-
shuffle=BloscShuffle.from_int(compressor.shuffle),
242-
blocksize=compressor.blocksize,
243-
)
244-
245-
case "zstd":
246-
return ZstdCodec(
247-
level=compressor.level,
248-
checksum=compressor.checksum,
249-
)
250-
251-
case "gzip":
252-
return GzipCodec(level=compressor.level)
253-
254-
case _:
255-
# If possible, find matching zarr.codecs.numcodecs codec
256-
compressor_codec = _find_numcodecs_zarr3(compressor)
257-
258-
if not isinstance(compressor_codec, BytesBytesCodec):
259-
raise TypeError(f"Compressor {type(compressor_codec)} is not a BytesBytesCodec")
260-
261-
return compressor_codec
262-
263-
264-
def _find_numcodecs_zarr3(numcodecs_codec: numcodecs.abc.Codec) -> Codec:
265-
"""Find matching zarr.codecs.numcodecs codec (if it exists)"""
266-
267-
numcodec_name = f"numcodecs.{numcodecs_codec.codec_id}"
268-
numcodec_dict = {
269-
"name": numcodec_name,
270-
"configuration": numcodecs_codec.get_config(),
271-
}
272-
273-
try:
274-
codec_v3 = get_codec_class(numcodec_name)
275-
except KeyError as exc:
276-
raise ValueError(
277-
f"Couldn't find corresponding zarr.codecs.numcodecs codec for {numcodecs_codec.codec_id}"
278-
) from exc
279-
280-
return codec_v3.from_dict(numcodec_dict)
281-
282-
283200
async def _save_v3_metadata(
284201
metadata_v3: ArrayV3Metadata | GroupMetadata, output_path: StorePath, dry_run: bool = False
285202
) -> None:

tests/test_cli/test_migrate_v3.py

Lines changed: 24 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,9 @@
3232

3333
runner = typer_testing.CliRunner()
3434

35-
NUMCODECS_USER_WARNING = "Numcodecs codecs are not in the Zarr version 3 specification and may not be supported by other zarr implementations."
35+
UNSTABLE_SPEC_WARNING = (
36+
"Data saved with this codec may not be supported by other Zarr implementations. "
37+
)
3638

3739

3840
def test_migrate_array(local_store: LocalStore) -> None:
@@ -316,8 +318,8 @@ def test_migrate_compressor(
316318
assert np.all(zarr_array[:] == 1)
317319

318320

319-
@pytest.mark.filterwarnings(f"ignore:{NUMCODECS_USER_WARNING}:UserWarning")
320-
def test_migrate_numcodecs_compressor(local_store: LocalStore) -> None:
321+
@pytest.mark.filterwarnings(f"ignore:.*{UNSTABLE_SPEC_WARNING}.*")
322+
def test_migrate_lzma_compressor(local_store: LocalStore) -> None:
321323
"""Test migration of a numcodecs compressor without a zarr.codecs equivalent."""
322324

323325
lzma_settings = {
@@ -360,7 +362,7 @@ def test_migrate_numcodecs_compressor(local_store: LocalStore) -> None:
360362
assert np.all(zarr_array[:] == 1)
361363

362364

363-
@pytest.mark.filterwarnings(f"ignore:{NUMCODECS_USER_WARNING}:UserWarning")
365+
@pytest.mark.filterwarnings(f"ignore:.*{UNSTABLE_SPEC_WARNING}.*")
364366
def test_migrate_filter(local_store: LocalStore) -> None:
365367
filter_v2 = numcodecs.Delta(dtype="<u2", astype="<u2")
366368
filter_v3 = Delta(dtype="<u2", astype="<u2")
@@ -504,11 +506,8 @@ def test_migrate_unknown_codec(local_store: LocalStore) -> None:
504506

505507
result = runner.invoke(cli.app, ["migrate", "v3", str(local_store.root)])
506508
assert result.exit_code == 1
507-
assert isinstance(result.exception, ValueError)
508-
assert (
509-
str(result.exception)
510-
== "Couldn't find corresponding zarr.codecs.numcodecs codec for categorize"
511-
)
509+
assert isinstance(result.exception, KeyError)
510+
assert str(result.exception) == "'categorize'"
512511

513512

514513
def test_migrate_incorrect_filter(local_store: LocalStore) -> None:
@@ -524,39 +523,29 @@ def test_migrate_incorrect_filter(local_store: LocalStore) -> None:
524523
fill_value=0,
525524
)
526525

527-
with pytest.warns(UserWarning, match=NUMCODECS_USER_WARNING):
528-
result = runner.invoke(cli.app, ["migrate", "v3", str(local_store.root)])
526+
result = runner.invoke(cli.app, ["migrate", "v3", str(local_store.root)])
529527

530-
assert result.exit_code == 1
531-
assert isinstance(result.exception, TypeError)
532-
assert (
533-
str(result.exception)
534-
== "Filter <class 'zarr.codecs.numcodecs._codecs.Zstd'> is not an ArrayArrayCodec"
535-
)
528+
assert result.exit_code == 0
536529

537530

538-
def test_migrate_incorrect_compressor(local_store: LocalStore) -> None:
539-
"""Attempting to convert a compressor (which is the wrong type of codec) should always fail"""
531+
def test_migrate_delta_compressor(local_store: LocalStore) -> None:
532+
"""Attempting to convert a numcodecs compressor should succeed"""
540533

541-
zarr.create_array(
542-
store=local_store,
543-
shape=(10, 10),
544-
chunks=(10, 10),
545-
dtype="uint16",
546-
compressors=numcodecs.Delta(dtype="<u2", astype="<u2"),
547-
zarr_format=2,
548-
fill_value=0,
549-
)
534+
with pytest.warns(UserWarning, match=UNSTABLE_SPEC_WARNING):
535+
zarr.create_array(
536+
store=local_store,
537+
shape=(10, 10),
538+
chunks=(10, 10),
539+
dtype="uint16",
540+
compressors=numcodecs.Delta(dtype="<u2", astype="<u2"),
541+
zarr_format=2,
542+
fill_value=0,
543+
)
550544

551-
with pytest.warns(UserWarning, match=NUMCODECS_USER_WARNING):
545+
with pytest.warns(UserWarning, match=UNSTABLE_SPEC_WARNING):
552546
result = runner.invoke(cli.app, ["migrate", "v3", str(local_store.root)])
553547

554-
assert result.exit_code == 1
555-
assert isinstance(result.exception, TypeError)
556-
assert (
557-
str(result.exception)
558-
== "Compressor <class 'zarr.codecs.numcodecs._codecs.Delta'> is not a BytesBytesCodec"
559-
)
548+
assert result.exit_code == 0
560549

561550

562551
@pytest.mark.parametrize("zarr_format", [2, 3])

0 commit comments

Comments
 (0)