Skip to content

Commit 75b2197

Browse files
committed
adds array_bytes_codec kwarg
1 parent b564ae6 commit 75b2197

File tree

6 files changed

+99
-25
lines changed

6 files changed

+99
-25
lines changed

src/zarr/core/array.py

Lines changed: 38 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,7 @@
9797
from zarr.errors import MetadataValidationError
9898
from zarr.registry import (
9999
_parse_array_array_codec,
100+
_parse_array_bytes_codec,
100101
_parse_bytes_bytes_codec,
101102
_resolve_codec,
102103
get_pipeline_class,
@@ -385,6 +386,7 @@ async def _create(
385386
) -> AsyncArray[ArrayV3Metadata] | AsyncArray[ArrayV2Metadata]: ...
386387

387388
@classmethod
389+
# @deprecated("Use `zarr.api.asynchronous.create_array` instead.")
388390
async def _create(
389391
cls,
390392
store: StoreLike,
@@ -417,6 +419,7 @@ async def _create(
417419
config: ArrayConfig | ArrayConfigParams | None = None,
418420
) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]:
419421
"""
422+
Deprecated in favor of `zarr.api.asynchronous.create_array`.
420423
Method to create a new asynchronous array instance.
421424
422425
Parameters
@@ -677,10 +680,10 @@ async def _create_v2(
677680
dimension_separator = "."
678681

679682
dtype = parse_dtype(dtype, zarr_format=2)
680-
if not filters:
681-
filters = _default_filters(dtype)
682-
if not compressor:
683-
compressor = _default_compressor(dtype)
683+
# if not filters:
684+
# filters = _default_filters(dtype)
685+
# if not compressor:
686+
# compressor = _default_compressor(dtype)
684687

685688
# inject VLenUTF8 for str dtype if not already present
686689
if np.issubdtype(dtype, np.str_):
@@ -1530,6 +1533,7 @@ class Array:
15301533
_async_array: AsyncArray[ArrayV3Metadata] | AsyncArray[ArrayV2Metadata]
15311534

15321535
@classmethod
1536+
# @deprecated("Use `zarr.create_array` instead.")
15331537
@_deprecate_positional_args
15341538
def _create(
15351539
cls,
@@ -1561,7 +1565,8 @@ def _create(
15611565
overwrite: bool = False,
15621566
config: ArrayConfig | ArrayConfigParams | None = None,
15631567
) -> Array:
1564-
"""Creates a new Array instance from an initialized store.
1568+
"""Deprecated in favor of `zarr.create_array`.
1569+
Creates a new Array instance from an initialized store.
15651570
15661571
Parameters
15671572
----------
@@ -3504,6 +3509,7 @@ def _get_default_codecs(
35043509
| numcodecs.abc.Codec
35053510
| Literal["auto"]
35063511
)
3512+
ArrayBytesCodecParam: TypeAlias = dict[str, JSON] | ArrayBytesCodec | Literal["auto"]
35073513

35083514

35093515
class ShardsConfigParam(TypedDict):
@@ -3524,6 +3530,7 @@ async def create_array(
35243530
shards: ShardsParam | None = None,
35253531
filters: FiltersParam = "auto",
35263532
compressors: CompressorsParam = "auto",
3533+
array_bytes_codec: ArrayBytesCodecParam | None = "auto",
35273534
fill_value: Any | None = 0,
35283535
order: MemoryOrder | None = None,
35293536
zarr_format: ZarrFormat | None = 3,
@@ -3580,6 +3587,10 @@ async def create_array(
35803587
For Zarr v2, a "compressor" can be any numcodecs codec. Only a single compressor may be provided for Zarr v2.
35813588
If no ``compressors`` are provided, a default compressor will be used.
35823589
These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`.
3590+
array_bytes_codec : dict[str, JSON] | ArrayBytesCodec, optional
3591+
Array-to-bytes codec to use for encoding the array data.
3592+
Zarr v3 only. Zarr v2 arrays use implicit array-to-bytes conversion.
3593+
If no ``array_bytes_codec`` is provided, the `zarr.codecs.BytesCodec` codec will be used.
35833594
fill_value : Any, optional
35843595
Fill value for the array.
35853596
order : {"C", "F"}, optional
@@ -3680,7 +3691,10 @@ async def create_array(
36803691
)
36813692
else:
36823693
array_array, array_bytes, bytes_bytes = _parse_chunk_encoding_v3(
3683-
compressors=compressors, filters=filters, dtype=dtype_parsed
3694+
compressors=compressors,
3695+
filters=filters,
3696+
array_bytes_codec=array_bytes_codec,
3697+
dtype=dtype_parsed,
36843698
)
36853699
sub_codecs = cast(tuple[Codec, ...], (*array_array, array_bytes, *bytes_bytes))
36863700
codecs_out: tuple[Codec, ...]
@@ -3825,7 +3839,11 @@ def _parse_chunk_encoding_v2(
38253839
if compressor == "auto":
38263840
_compressor = default_compressor
38273841
else:
3828-
if isinstance(compressor, Iterable) and not isinstance(compressor, dict):
3842+
if (
3843+
isinstance(compressor, Iterable)
3844+
and not isinstance(compressor, dict)
3845+
and len(compressor) > 1
3846+
):
38293847
msg = f"For Zarr v2 arrays, the `compressor` must be a single codec. Got an iterable with type {type(compressor)} instead."
38303848
raise TypeError(msg)
38313849
_compressor = parse_compressor(compressor)
@@ -3846,8 +3864,9 @@ def _parse_chunk_encoding_v2(
38463864

38473865
def _parse_chunk_encoding_v3(
38483866
*,
3849-
compressors: CompressorsParam,
3850-
filters: FiltersParam,
3867+
compressors: CompressorsParam | None,
3868+
filters: FiltersParam | None,
3869+
array_bytes_codec: ArrayBytesCodecParam | None,
38513870
dtype: np.dtype[Any],
38523871
) -> tuple[tuple[ArrayArrayCodec, ...], ArrayBytesCodec, tuple[BytesBytesCodec, ...]]:
38533872
"""
@@ -3864,6 +3883,8 @@ def _parse_chunk_encoding_v3(
38643883
else:
38653884
if isinstance(compressors, dict | Codec):
38663885
maybe_bytes_bytes = (compressors,)
3886+
elif compressors is None:
3887+
maybe_bytes_bytes = ()
38673888
else:
38683889
maybe_bytes_bytes = cast(Iterable[Codec | dict[str, JSON]], compressors)
38693890

@@ -3874,8 +3895,15 @@ def _parse_chunk_encoding_v3(
38743895
else:
38753896
if isinstance(filters, dict | Codec):
38763897
maybe_array_array = (filters,)
3898+
elif filters is None:
3899+
maybe_array_array = ()
38773900
else:
38783901
maybe_array_array = cast(Iterable[Codec | dict[str, JSON]], filters)
38793902
out_array_array = tuple(_parse_array_array_codec(c) for c in maybe_array_array)
38803903

3881-
return out_array_array, default_array_bytes, out_bytes_bytes
3904+
if array_bytes_codec == "auto":
3905+
out_array_bytes = default_array_bytes
3906+
else:
3907+
out_array_bytes = _parse_array_bytes_codec(array_bytes_codec)
3908+
3909+
return out_array_array, out_array_bytes, out_bytes_bytes

src/zarr/core/metadata/v2.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -232,6 +232,8 @@ def parse_filters(data: object) -> tuple[numcodecs.abc.Codec, ...] | None:
232232
if data is None:
233233
return data
234234
if isinstance(data, Iterable):
235+
if len(data) == 0:
236+
return None
235237
for idx, val in enumerate(data):
236238
if isinstance(val, numcodecs.abc.Codec):
237239
out.append(val)
@@ -249,6 +251,11 @@ def parse_compressor(data: object) -> numcodecs.abc.Codec | None:
249251
"""
250252
Parse a potential compressor.
251253
"""
254+
if isinstance(data, Iterable) and not isinstance(data, dict):
255+
if len(data) == 0:
256+
data = None
257+
else:
258+
data = data[0]
252259
if data is None or isinstance(data, numcodecs.abc.Codec):
253260
return data
254261
if isinstance(data, dict):

src/zarr/registry.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,7 @@ def _parse_bytes_bytes_codec(data: dict[str, JSON] | Codec) -> BytesBytesCodec:
186186
return result
187187

188188

189-
def _parse_array_bytes_codec(data: dict[str, JSON] | ArrayBytesCodec) -> ArrayBytesCodec:
189+
def _parse_array_bytes_codec(data: dict[str, JSON] | Codec) -> ArrayBytesCodec:
190190
"""
191191
Normalize the input to a ``ArrayBytesCodec`` instance.
192192
If the input is already a ``ArrayBytesCodec``, it is returned as is. If the input is a dict, it

tests/test_array.py

Lines changed: 43 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,14 @@
1212

1313
import zarr.api.asynchronous
1414
from zarr import Array, AsyncArray, Group
15-
from zarr.codecs import BytesCodec, VLenBytesCodec, ZstdCodec
16-
from zarr.codecs.gzip import GzipCodec
17-
from zarr.codecs.transpose import TransposeCodec
15+
from zarr.codecs import (
16+
BytesCodec,
17+
GzipCodec,
18+
TransposeCodec,
19+
VLenBytesCodec,
20+
VLenUTF8Codec,
21+
ZstdCodec,
22+
)
1823
from zarr.core._info import ArrayInfo
1924
from zarr.core.array import (
2025
CompressorsParam,
@@ -975,12 +980,45 @@ async def test_create_array_v3_compressors(
975980
compressors=compressors,
976981
)
977982
_, _, bb_codecs_expected = _parse_chunk_encoding_v3(
978-
filters=(), compressors=compressors, dtype=np.dtype(dtype)
983+
filters=(), compressors=compressors, array_bytes_codec="auto", dtype=np.dtype(dtype)
979984
)
980985
# TODO: find a better way to get the compressors from the array.
981986
assert arr.codec_pipeline.bytes_bytes_codecs == bb_codecs_expected # type: ignore[attr-defined]
982987

983988

989+
@pytest.mark.parametrize("store", ["memory"], indirect=True)
@pytest.mark.parametrize("dtype", ["uint8", "float32", "str"])
async def test_create_array_no_filters_compressors(store: MemoryStore, dtype: str) -> None:
    """
    Test that the default ``filters`` and ``compressors`` are removed when ``create_array`` is invoked.

    Empty tuples are passed for both ``filters`` and ``compressors``:

    - For Zarr v2, the resulting metadata is expected to hold ``None`` for both
      ``filters`` and ``compressor``.
    - For Zarr v3, the resulting codec list is expected to contain only the
      array-to-bytes codec: ``VLenUTF8Codec`` for the ``"str"`` dtype and
      ``BytesCodec`` for the numeric dtypes.
    """

    # v2
    arr = await create_array(
        store=store,
        dtype=dtype,
        shape=(10,),
        zarr_format=2,
        compressors=(),
        filters=(),
    )
    # Identity comparison: ``None`` is a singleton, and ``==`` could be
    # satisfied by an object with a permissive ``__eq__``.
    assert arr.metadata.filters is None  # type: ignore[union-attr]
    assert arr.metadata.compressor is None  # type: ignore[union-attr]

    # v3
    arr = await create_array(
        store=store,
        dtype=dtype,
        shape=(10,),
        compressors=(),
        filters=(),
    )
    if dtype == "str":
        assert arr.metadata.codecs == [VLenUTF8Codec()]  # type: ignore[union-attr]
    else:
        assert arr.metadata.codecs == [BytesCodec()]  # type: ignore[union-attr]
1020+
1021+
9841022
@pytest.mark.parametrize("store", ["memory"], indirect=True)
9851023
@pytest.mark.parametrize(
9861024
"filters",
@@ -1027,7 +1065,7 @@ async def test_create_array_v3_filters(store: MemoryStore, filters: FiltersParam
10271065
filters=filters,
10281066
)
10291067
aa_codecs_expected, _, _ = _parse_chunk_encoding_v3(
1030-
filters=filters, compressors=(), dtype=np.dtype(dtype)
1068+
filters=filters, compressors=(), array_bytes_codec="auto", dtype=np.dtype(dtype)
10311069
)
10321070
# TODO: find a better way to get the filters from the array.
10331071
assert arr.codec_pipeline.array_array_codecs == aa_codecs_expected # type: ignore[attr-defined]

tests/test_codecs/test_endian.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import pytest
55

66
from zarr import AsyncArray
7+
import zarr
78
from zarr.abc.store import Store
89
from zarr.codecs import BytesCodec
910
from zarr.storage.common import StorePath
@@ -17,14 +18,14 @@ async def test_endian(store: Store, endian: Literal["big", "little"]) -> None:
1718
data = np.arange(0, 256, dtype="uint16").reshape((16, 16))
1819
path = "endian"
1920
spath = StorePath(store, path)
20-
a = await AsyncArray._create(
21+
a = await zarr.api.asynchronous.create_array(
2122
spath,
2223
shape=data.shape,
23-
chunk_shape=(16, 16),
24+
chunks=(16, 16),
2425
dtype=data.dtype,
2526
fill_value=0,
26-
chunk_key_encoding=("v2", "."),
27-
codecs=[BytesCodec(endian=endian)],
27+
chunk_key_encoding={"name": "v2", "separator": "."},
28+
array_bytes_codec=BytesCodec(endian=endian),
2829
)
2930

3031
await _AsyncArrayProxy(a)[:, :].set(data)
@@ -43,14 +44,14 @@ async def test_endian_write(
4344
data = np.arange(0, 256, dtype=dtype_input_endian).reshape((16, 16))
4445
path = "endian"
4546
spath = StorePath(store, path)
46-
a = await AsyncArray._create(
47+
a = await zarr.api.asynchronous.create_array(
4748
spath,
4849
shape=data.shape,
49-
chunk_shape=(16, 16),
50+
chunks=(16, 16),
5051
dtype="uint16",
5152
fill_value=0,
52-
chunk_key_encoding=("v2", "."),
53-
codecs=[BytesCodec(endian=dtype_store_endian)],
53+
chunk_key_encoding={"name": "v2", "separator": "."},
54+
array_bytes_codec=BytesCodec(endian=dtype_store_endian),
5455
)
5556

5657
await _AsyncArrayProxy(a)[:, :].set(data)

tests/test_codecs/test_sharding.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -295,7 +295,7 @@ async def test_delete_empty_shards(store: Store) -> None:
295295
spath,
296296
shape=(16, 16),
297297
chunks=(8, 8),
298-
shards=(16, 8),
298+
shards=(8, 16),
299299
dtype="uint16",
300300
fill_value=1,
301301
)

0 commit comments

Comments
 (0)