Skip to content

Commit 3f14d2a

Browse files
committed
adds filters, serializer, compressors properties to Array
1 parent 617e2cd commit 3f14d2a

File tree

7 files changed

+146
-37
lines changed

7 files changed

+146
-37
lines changed

src/zarr/api/synchronous.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -802,7 +802,7 @@ def create_array(
802802
Use ``None`` to omit default filters.
803803
compressors : Iterable[Codec], optional
804804
List of compressors to apply to the array. Compressors are applied in order, and after any
805-
filters are applied (if any are specified).
805+
filters are applied (if any are specified) and the data is serialized into bytes.
806806
807807
For Zarr v3, a "compressor" is a codec that takes a bytestream, and
808808
returns another bytestream. Multiple compressors may be provided for Zarr v3.

src/zarr/core/_info.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ class ArrayInfo:
8787
_store_type: str
8888
_compressor: numcodecs.abc.Codec | None = None
8989
_filters: tuple[numcodecs.abc.Codec, ...] | None = None
90-
_codecs: list[Codec] | None = None
90+
_codecs: tuple[Codec, ...] | None = None
9191
_count_bytes: int | None = None
9292
_count_bytes_stored: int | None = None
9393
_count_chunks_initialized: int | None = None

src/zarr/core/array.py

Lines changed: 88 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
from warnings import warn
2121

2222
import numcodecs
23+
import numcodecs.abc
2324
import numpy as np
2425
import numpy.typing as npt
2526
from typing_extensions import deprecated
@@ -911,6 +912,57 @@ def size(self) -> int:
911912
"""
912913
return np.prod(self.metadata.shape).item()
913914

915+
@property
916+
def filters(self) -> tuple[numcodecs.abc.Codec, ...] | tuple[ArrayArrayCodec, ...]:
917+
"""
918+
Filters that are applied to each chunk of the array, in order, before serializing that
919+
chunk to bytes.
920+
"""
921+
if self.metadata.zarr_format == 2:
922+
filters = self.metadata.filters
923+
if filters is None:
924+
return ()
925+
return filters
926+
927+
return tuple(codec for codec in self.metadata.codecs if isinstance(codec, ArrayArrayCodec))
928+
929+
@property
930+
def serializer(self) -> ArrayBytesCodec | None:
931+
"""
932+
Array-to-bytes codec to use for serializing the chunks into bytes.
933+
"""
934+
if self.metadata.zarr_format == 2:
935+
return None
936+
937+
return next(codec for codec in self.metadata.codecs if isinstance(codec, ArrayBytesCodec))
938+
939+
@property
940+
@deprecated("Use AsyncArray.compressors instead.")
941+
def compressor(self) -> numcodecs.abc.Codec | None:
942+
"""
943+
Compressor that is applied to each chunk of the array.
944+
945+
.. deprecated:: 3.0.0
946+
`array.compressor` is deprecated and will be removed in a future release.
947+
Use `array.compressors` instead.
948+
"""
949+
if self.metadata.zarr_format == 2:
950+
return self.metadata.compressor
951+
raise TypeError("`compressor` is not available for Zarr format 3 arrays.")
952+
953+
@property
954+
def compressors(self) -> tuple[numcodecs.abc.Codec, ...] | tuple[BytesBytesCodec, ...]:
955+
"""
956+
Compressors that are applied to each chunk of the array. Compressors are applied in order, and after any
957+
filters are applied (if any are specified) and the data is serialized into bytes.
958+
"""
959+
if self.metadata.zarr_format == 2:
960+
if self.metadata.compressor is not None:
961+
return (self.metadata.compressor,)
962+
return ()
963+
964+
return tuple(codec for codec in self.metadata.codecs if isinstance(codec, BytesBytesCodec))
965+
914966
@property
915967
def dtype(self) -> np.dtype[Any]:
916968
"""Returns the data type of the array.
@@ -1967,6 +2019,41 @@ def read_only(self) -> bool:
19672019
def fill_value(self) -> Any:
19682020
return self.metadata.fill_value
19692021

2022+
@property
2023+
def filters(self) -> tuple[numcodecs.abc.Codec, ...] | tuple[ArrayArrayCodec, ...]:
2024+
"""
2025+
Filters that are applied to each chunk of the array, in order, before serializing that
2026+
chunk to bytes.
2027+
"""
2028+
return self._async_array.filters
2029+
2030+
@property
2031+
def serializer(self) -> None | ArrayBytesCodec:
2032+
"""
2033+
Array-to-bytes codec to use for serializing the chunks into bytes.
2034+
"""
2035+
return self._async_array.serializer
2036+
2037+
@property
2038+
@deprecated("Use Array.compressors instead.")
2039+
def compressor(self) -> numcodecs.abc.Codec | None:
2040+
"""
2041+
Compressor that is applied to each chunk of the array.
2042+
2043+
.. deprecated:: 3.0.0
2044+
`array.compressor` is deprecated and will be removed in a future release.
2045+
Use `array.compressors` instead.
2046+
"""
2047+
return self._async_array.compressor
2048+
2049+
@property
2050+
def compressors(self) -> tuple[numcodecs.abc.Codec, ...] | tuple[BytesBytesCodec, ...]:
2051+
"""
2052+
Compressors that are applied to each chunk of the array. Compressors are applied in order, and after any
2053+
filters are applied (if any are specified) and the data is serialized into bytes.
2054+
"""
2055+
return self._async_array.compressors
2056+
19702057
@property
19712058
def cdata_shape(self) -> ChunkCoords:
19722059
"""
@@ -3710,7 +3797,7 @@ async def create_array(
37103797
Use ``None`` to omit default filters.
37113798
compressors : Iterable[Codec], optional
37123799
List of compressors to apply to the array. Compressors are applied in order, and after any
3713-
filters are applied (if any are specified).
3800+
filters are applied (if any are specified) and the data is serialized into bytes.
37143801
37153802
For Zarr v3, a "compressor" is a codec that takes a bytestream, and
37163803
returns another bytestream. Multiple compressors may be provided for Zarr v3.

src/zarr/core/group.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1065,7 +1065,7 @@ async def create_array(
10651065
Use ``None`` to omit default filters.
10661066
compressors : Iterable[Codec], optional
10671067
List of compressors to apply to the array. Compressors are applied in order, and after any
1068-
filters are applied (if any are specified).
1068+
filters are applied (if any are specified) and the data is serialized into bytes.
10691069
10701070
For Zarr v3, a "compressor" is a codec that takes a bytestream, and
10711071
returns another bytestream. Multiple compressors may be provided for Zarr v3.
@@ -2321,7 +2321,7 @@ def create_array(
23212321
Use ``None`` to omit default filters.
23222322
compressors : Iterable[Codec], optional
23232323
List of compressors to apply to the array. Compressors are applied in order, and after any
2324-
filters are applied (if any are specified).
2324+
filters are applied (if any are specified) and the data is serialized into bytes.
23252325
23262326
For Zarr v3, a "compressor" is a codec that takes a bytestream, and
23272327
returns another bytestream. Multiple compressors may be provided for Zarr v3.
@@ -2710,7 +2710,7 @@ def array(
27102710
Use ``None`` to omit default filters.
27112711
compressors : Iterable[Codec], optional
27122712
List of compressors to apply to the array. Compressors are applied in order, and after any
2713-
filters are applied (if any are specified).
2713+
filters are applied (if any are specified) and the data is serialized into bytes.
27142714
27152715
For Zarr v3, a "compressor" is a codec that takes a bytestream, and
27162716
returns another bytestream. Multiple compressors may be provided for Zarr v3.

src/zarr/core/metadata/v3.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -254,7 +254,7 @@ def __init__(
254254
config=ArrayConfig.from_dict({}), # TODO: config is not needed here.
255255
prototype=default_buffer_prototype(), # TODO: prototype is not needed here.
256256
)
257-
codecs_parsed = [c.evolve_from_array_spec(array_spec) for c in codecs_parsed_partial]
257+
codecs_parsed = tuple(c.evolve_from_array_spec(array_spec) for c in codecs_parsed_partial)
258258
validate_codecs(codecs_parsed_partial, data_type_parsed)
259259

260260
object.__setattr__(self, "shape", shape_parsed)

tests/test_array.py

Lines changed: 48 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -510,9 +510,9 @@ def test_info_v3(self, chunks: tuple[int, int], shards: tuple[int, int] | None)
510510
_order="C",
511511
_read_only=False,
512512
_store_type="MemoryStore",
513-
_codecs=[BytesCodec(), ZstdCodec()]
513+
_codecs=(BytesCodec(), ZstdCodec())
514514
if shards is None
515-
else [ShardingCodec(chunk_shape=chunks, codecs=[BytesCodec(), ZstdCodec()])],
515+
else (ShardingCodec(chunk_shape=chunks, codecs=[BytesCodec(), ZstdCodec()]),),
516516
_count_bytes=512,
517517
)
518518
assert result == expected
@@ -536,7 +536,7 @@ def test_info_complete(self, chunks: tuple[int, int], shards: tuple[int, int] |
536536
_order="C",
537537
_read_only=False,
538538
_store_type="MemoryStore",
539-
_codecs=[BytesCodec()] if shards is None else [ShardingCodec(chunk_shape=chunks)],
539+
_codecs=(BytesCodec(),) if shards is None else (ShardingCodec(chunk_shape=chunks),),
540540
_count_bytes=512,
541541
_count_chunks_initialized=0,
542542
_count_bytes_stored=373 if shards is None else 578, # the metadata?
@@ -596,9 +596,9 @@ async def test_info_v3_async(
596596
_order="C",
597597
_read_only=False,
598598
_store_type="MemoryStore",
599-
_codecs=[BytesCodec(), ZstdCodec()]
599+
_codecs=(BytesCodec(), ZstdCodec())
600600
if shards is None
601-
else [ShardingCodec(chunk_shape=chunks, codecs=[BytesCodec(), ZstdCodec()])],
601+
else (ShardingCodec(chunk_shape=chunks, codecs=[BytesCodec(), ZstdCodec()]),),
602602
_count_bytes=512,
603603
)
604604
assert result == expected
@@ -624,7 +624,7 @@ async def test_info_complete_async(
624624
_order="C",
625625
_read_only=False,
626626
_store_type="MemoryStore",
627-
_codecs=[BytesCodec()] if shards is None else [ShardingCodec(chunk_shape=chunks)],
627+
_codecs=(BytesCodec(),) if shards is None else (ShardingCodec(chunk_shape=chunks),),
628628
_count_bytes=512,
629629
_count_chunks_initialized=0,
630630
_count_bytes_stored=373 if shards is None else 578, # the metadata?
@@ -839,7 +839,8 @@ def test_array_create_metadata_order_v2(
839839
arr = zarr.create_array(store=store, shape=(2, 2), order=order, zarr_format=2, dtype="i4")
840840

841841
expected = order or zarr.config.get("array.order")
842-
assert arr.metadata.order == expected # type: ignore[union-attr]
842+
assert arr.metadata.zarr_format == 2 # guard for mypy
843+
assert arr.metadata.order == expected
843844

844845

845846
@pytest.mark.parametrize("order_config", ["C", "F", None])
@@ -1048,10 +1049,15 @@ async def test_create_array_no_filters_compressors(
10481049
compressors=empty_value,
10491050
filters=empty_value,
10501051
)
1052+
# Test metadata explicitly
1053+
assert arr.metadata.zarr_format == 2 # guard for mypy
10511054
# The v2 metadata stores None and () separately
1052-
assert arr.metadata.filters == empty_value # type: ignore[union-attr]
1055+
assert arr.metadata.filters == empty_value
10531056
# The v2 metadata does not allow tuple for compressor, therefore it is turned into None
1054-
assert arr.metadata.compressor is None # type: ignore[union-attr]
1057+
assert arr.metadata.compressor is None
1058+
1059+
assert arr.filters == ()
1060+
assert arr.compressors == ()
10551061

10561062
# v3
10571063
arr = await create_array(
@@ -1061,10 +1067,13 @@ async def test_create_array_no_filters_compressors(
10611067
compressors=empty_value,
10621068
filters=empty_value,
10631069
)
1070+
assert arr.metadata.zarr_format == 3 # guard for mypy
10641071
if dtype == "str":
1065-
assert arr.metadata.codecs == [VLenUTF8Codec()] # type: ignore[union-attr]
1072+
assert arr.metadata.codecs == (VLenUTF8Codec(),)
1073+
assert arr.serializer == VLenUTF8Codec()
10661074
else:
1067-
assert arr.metadata.codecs == [BytesCodec()] # type: ignore[union-attr]
1075+
assert arr.metadata.codecs == (BytesCodec(),)
1076+
assert arr.serializer == BytesCodec()
10681077

10691078

10701079
@pytest.mark.parametrize("store", ["memory"], indirect=True)
@@ -1130,12 +1139,11 @@ async def test_create_array_v3_chunk_encoding(
11301139
filters=filters,
11311140
compressors=compressors,
11321141
)
1133-
aa_codecs_expected, _, bb_codecs_expected = _parse_chunk_encoding_v3(
1142+
filters_expected, _, compressors_expected = _parse_chunk_encoding_v3(
11341143
filters=filters, compressors=compressors, serializer="auto", dtype=np.dtype(dtype)
11351144
)
1136-
# TODO: find a better way to get the filters / compressors from the array.
1137-
assert arr.codec_pipeline.array_array_codecs == aa_codecs_expected # type: ignore[attr-defined]
1138-
assert arr.codec_pipeline.bytes_bytes_codecs == bb_codecs_expected # type: ignore[attr-defined]
1145+
assert arr.filters == filters_expected
1146+
assert arr.compressors == compressors_expected
11391147

11401148

11411149
@pytest.mark.parametrize("store", ["memory"], indirect=True)
@@ -1167,9 +1175,16 @@ async def test_create_array_v2_chunk_encoding(
11671175
filters_expected, compressor_expected = _parse_chunk_encoding_v2(
11681176
filters=filters, compressor=compressors, dtype=np.dtype(dtype)
11691177
)
1170-
# TODO: find a better way to get the filters/compressor from the array.
1171-
assert arr.metadata.compressor == compressor_expected # type: ignore[union-attr]
1172-
assert arr.metadata.filters == filters_expected # type: ignore[union-attr]
1178+
assert arr.metadata.zarr_format == 2 # guard for mypy
1179+
assert arr.metadata.compressor == compressor_expected
1180+
assert arr.metadata.filters == filters_expected
1181+
1182+
# Normalize for property getters
1183+
compressor_expected = () if compressor_expected is None else (compressor_expected,)
1184+
filters_expected = () if filters_expected is None else filters_expected
1185+
1186+
assert arr.compressors == compressor_expected
1187+
assert arr.filters == filters_expected
11731188

11741189

11751190
@pytest.mark.parametrize("store", ["memory"], indirect=True)
@@ -1185,12 +1200,12 @@ async def test_create_array_v3_default_filters_compressors(store: MemoryStore, d
11851200
shape=(10,),
11861201
zarr_format=3,
11871202
)
1188-
expected_aa, expected_ab, expected_bb = _get_default_chunk_encoding_v3(np_dtype=np.dtype(dtype))
1189-
# TODO: define the codec pipeline class such that these fields are required, which will obviate the
1190-
# type ignore statements
1191-
assert arr.codec_pipeline.array_array_codecs == expected_aa # type: ignore[attr-defined]
1192-
assert arr.codec_pipeline.bytes_bytes_codecs == expected_bb # type: ignore[attr-defined]
1193-
assert arr.codec_pipeline.array_bytes_codec == expected_ab # type: ignore[attr-defined]
1203+
expected_filters, expected_serializer, expected_compressors = _get_default_chunk_encoding_v3(
1204+
np_dtype=np.dtype(dtype)
1205+
)
1206+
assert arr.filters == expected_filters
1207+
assert arr.serializer == expected_serializer
1208+
assert arr.compressors == expected_compressors
11941209

11951210

11961211
@pytest.mark.parametrize("store", ["memory"], indirect=True)
@@ -1209,8 +1224,15 @@ async def test_create_array_v2_default_filters_compressors(store: MemoryStore, d
12091224
expected_filters, expected_compressors = _get_default_chunk_encoding_v2(
12101225
np_dtype=np.dtype(dtype)
12111226
)
1212-
assert arr.metadata.filters == expected_filters # type: ignore[union-attr]
1213-
assert arr.metadata.compressor == expected_compressors # type: ignore[union-attr]
1227+
assert arr.metadata.zarr_format == 2 # guard for mypy
1228+
assert arr.metadata.filters == expected_filters
1229+
assert arr.metadata.compressor == expected_compressors
1230+
1231+
# Normalize for property getters
1232+
expected_filters = () if expected_filters is None else expected_filters
1233+
expected_compressors = () if expected_compressors is None else (expected_compressors,)
1234+
assert arr.filters == expected_filters
1235+
assert arr.compressors == expected_compressors
12141236

12151237

12161238
@pytest.mark.parametrize("store", ["memory"], indirect=True)

tests/test_info.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ def test_array_info(zarr_format: ZarrFormat) -> None:
5959
_order="C",
6060
_read_only=True,
6161
_store_type="MemoryStore",
62-
_codecs=[BytesCodec()],
62+
_codecs=(BytesCodec(),),
6363
)
6464
result = repr(info)
6565
assert result == textwrap.dedent(f"""\
@@ -71,7 +71,7 @@ def test_array_info(zarr_format: ZarrFormat) -> None:
7171
Order : C
7272
Read-only : True
7373
Store type : MemoryStore
74-
Codecs : [{{'endian': <Endian.little: 'little'>}}]""")
74+
Codecs : ({{'endian': <Endian.little: 'little'>}},)""")
7575

7676

7777
@pytest.mark.parametrize("zarr_format", ZARR_FORMATS)
@@ -95,7 +95,7 @@ def test_array_info_complete(
9595
_order="C",
9696
_read_only=True,
9797
_store_type="MemoryStore",
98-
_codecs=[BytesCodec()],
98+
_codecs=(BytesCodec(),),
9999
_count_bytes=count_bytes,
100100
_count_bytes_stored=count_bytes_stored,
101101
_count_chunks_initialized=count_chunks_initialized,
@@ -110,7 +110,7 @@ def test_array_info_complete(
110110
Order : C
111111
Read-only : True
112112
Store type : MemoryStore
113-
Codecs : [{{'endian': <Endian.little: 'little'>}}]
113+
Codecs : ({{'endian': <Endian.little: 'little'>}},)
114114
No. bytes : {count_bytes} ({count_bytes_formatted})
115115
No. bytes stored : {count_bytes_stored_formatted}
116116
Storage ratio : {storage_ratio_formatted}

0 commit comments

Comments
 (0)