
Commit 9138528

Adds filters, compressors and serializer props to Array (#2652)
* adds filters, serializer, compressors properties to Array
* adapt Array.info
* fixes doctests
* ugly numcodecs class names
* always show filters and compressors in Array.info
* format
1 parent 5c6267e commit 9138528

12 files changed: +265 −104 lines changed

docs/user-guide/arrays.rst

Lines changed: 25 additions & 16 deletions
@@ -168,8 +168,8 @@ argument accepted by all array creation functions. For example::
 >>> data = np.arange(100000000, dtype='int32').reshape(10000, 10000)
 >>> z = zarr.create_array(store='data/example-5.zarr', shape=data.shape, dtype=data.dtype, chunks=(1000, 1000), compressors=compressors)
 >>> z[:] = data
->>> z.metadata.codecs
-[BytesCodec(endian=<Endian.little: 'little'>), BloscCodec(typesize=4, cname=<BloscCname.zstd: 'zstd'>, clevel=3, shuffle=<BloscShuffle.bitshuffle: 'bitshuffle'>, blocksize=0)]
+>>> z.compressors
+(BloscCodec(typesize=4, cname=<BloscCname.zstd: 'zstd'>, clevel=3, shuffle=<BloscShuffle.bitshuffle: 'bitshuffle'>, blocksize=0),)

 This array above will use Blosc as the primary compressor, using the Zstandard
 algorithm (compression level 3) internally within Blosc, and with the

@@ -188,7 +188,9 @@ which can be used to print useful diagnostics, e.g.::
 Order : C
 Read-only : False
 Store type : LocalStore
-Codecs : [{'endian': <Endian.little: 'little'>}, {'typesize': 4, 'cname': <BloscCname.zstd: 'zstd'>, 'clevel': 3, 'shuffle': <BloscShuffle.bitshuffle: 'bitshuffle'>, 'blocksize': 0}]
+Filters : ()
+Serializer : BytesCodec(endian=<Endian.little: 'little'>)
+Compressors : (BloscCodec(typesize=4, cname=<BloscCname.zstd: 'zstd'>, clevel=3, shuffle=<BloscShuffle.bitshuffle: 'bitshuffle'>, blocksize=0),)
 No. bytes : 400000000 (381.5M)

 The :func:`zarr.Array.info_complete` method inspects the underlying store and

@@ -203,7 +205,9 @@ prints additional diagnostics, e.g.::
 Order : C
 Read-only : False
 Store type : LocalStore
-Codecs : [{'endian': <Endian.little: 'little'>}, {'typesize': 4, 'cname': <BloscCname.zstd: 'zstd'>, 'clevel': 3, 'shuffle': <BloscShuffle.bitshuffle: 'bitshuffle'>, 'blocksize': 0}]
+Filters : ()
+Serializer : BytesCodec(endian=<Endian.little: 'little'>)
+Compressors : (BloscCodec(typesize=4, cname=<BloscCname.zstd: 'zstd'>, clevel=3, shuffle=<BloscShuffle.bitshuffle: 'bitshuffle'>, blocksize=0),)
 No. bytes : 400000000 (381.5M)
 No. bytes stored : 9696302
 Storage ratio : 41.3

@@ -223,8 +227,8 @@ here is an array using Gzip compression, level 1::
 >>> data = np.arange(100000000, dtype='int32').reshape(10000, 10000)
 >>> z = zarr.create_array(store='data/example-6.zarr', shape=data.shape, dtype=data.dtype, chunks=(1000, 1000), compressors=zarr.codecs.GzipCodec(level=1))
 >>> z[:] = data
->>> z.metadata.codecs
-[BytesCodec(endian=<Endian.little: 'little'>), GzipCodec(level=1)]
+>>> z.compressors
+(GzipCodec(level=1),)

 Here is an example using LZMA from NumCodecs_ with a custom filter pipeline including LZMA's
 built-in delta filter::

@@ -236,23 +240,24 @@ built-in delta filter::
 >>> compressors = LZMA(filters=lzma_filters)
 >>> data = np.arange(100000000, dtype='int32').reshape(10000, 10000)
 >>> z = zarr.create_array(store='data/example-7.zarr', shape=data.shape, dtype=data.dtype, chunks=(1000, 1000), compressors=compressors)
->>> z.metadata.codecs
-[BytesCodec(endian=<Endian.little: 'little'>), _make_bytes_bytes_codec.<locals>._Codec(codec_name='numcodecs.lzma', codec_config={'id': 'lzma', 'filters': [{'id': 3, 'dist': 4}, {'id': 33, 'preset': 1}]})]
+>>> z.compressors
+(_make_bytes_bytes_codec.<locals>._Codec(codec_name='numcodecs.lzma', codec_config={'id': 'lzma', 'filters': [{'id': 3, 'dist': 4}, {'id': 33, 'preset': 1}]}),)

 The default compressor can be changed by setting the value of the using Zarr's
 :ref:`user-guide-config`, e.g.::

 >>> with zarr.config.set({'array.v2_default_compressor.numeric': {'id': 'blosc'}}):
 ... z = zarr.create_array(store={}, shape=(100000000,), chunks=(1000000,), dtype='int32', zarr_format=2)
->>> z.metadata.filters
->>> z.metadata.compressor
-Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)
+>>> z.filters
+()
+>>> z.compressors
+(Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0),)

 To disable compression, set ``compressors=None`` when creating an array, e.g.::

 >>> z = zarr.create_array(store='data/example-8.zarr', shape=(100000000,), chunks=(1000000,), dtype='int32', compressors=None)
->>> z.metadata.codecs
-[BytesCodec(endian=<Endian.little: 'little'>)]
+>>> z.compressors
+()

 .. _user-guide-filters:

@@ -287,7 +292,9 @@ Here is an example using a delta filter with the Blosc compressor::
 Order : C
 Read-only : False
 Store type : LocalStore
-Codecs : [{'codec_name': 'numcodecs.delta', 'codec_config': {'id': 'delta', 'dtype': 'int32'}}, {'endian': <Endian.little: 'little'>}, {'typesize': 4, 'cname': <BloscCname.zstd: 'zstd'>, 'clevel': 1, 'shuffle': <BloscShuffle.shuffle: 'shuffle'>, 'blocksize': 0}]
+Filters : (_make_array_array_codec.<locals>._Codec(codec_name='numcodecs.delta', codec_config={'id': 'delta', 'dtype': 'int32'}),)
+Serializer : BytesCodec(endian=<Endian.little: 'little'>)
+Compressors : (BloscCodec(typesize=4, cname=<BloscCname.zstd: 'zstd'>, clevel=1, shuffle=<BloscShuffle.shuffle: 'shuffle'>, blocksize=0),)
 No. bytes : 400000000 (381.5M)

 For more information about available filter codecs, see the `Numcodecs

@@ -600,11 +607,13 @@ Sharded arrays can be created by providing the ``shards`` parameter to :func:`za
 Order : C
 Read-only : False
 Store type : LocalStore
-Codecs : [{'chunk_shape': (100, 100), 'codecs': ({'endian': <Endian.little: 'little'>}, {'level': 0, 'checksum': False}), 'index_codecs': ({'endian': <Endian.little: 'little'>}, {}), 'index_location': <ShardingCodecIndexLocation.end: 'end'>}]
+Filters : ()
+Serializer : BytesCodec(endian=<Endian.little: 'little'>)
+Compressors : (ZstdCodec(level=0, checksum=False),)
 No. bytes : 100000000 (95.4M)
 No. bytes stored : 3981060
 Storage ratio : 25.1
-Chunks Initialized : 100
+Shards Initialized : 100

 In this example a shard shape of (1000, 1000) and a chunk shape of (100, 100) is used.
 This means that 10*10 chunks are stored in each shard, and there are 10*10 shards in total.
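For readers skimming the diff, here is a minimal sketch of the new read-only properties documented above; the store path, shape, and codec choice are illustrative and not taken from the commit:

    import zarr
    from zarr.codecs import BloscCodec

    # Hypothetical array, mirroring the updated doctests above.
    z = zarr.create_array(
        store="data/demo.zarr",          # illustrative path
        shape=(1000, 1000),
        chunks=(100, 100),
        dtype="int32",
        compressors=BloscCodec(cname="zstd", clevel=3),
    )

    print(z.filters)      # () -- array-to-array codecs, none configured here
    print(z.serializer)   # BytesCodec(...) -- the array-to-bytes codec
    print(z.compressors)  # (BloscCodec(...),) -- bytes-to-bytes codecs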

docs/user-guide/consolidated_metadata.rst

Lines changed: 6 additions & 6 deletions
@@ -52,8 +52,8 @@ that can be used.:
 chunk_key_encoding=DefaultChunkKeyEncoding(name='default',
 separator='/'),
 fill_value=np.float64(0.0),
-codecs=[BytesCodec(endian=<Endian.little: 'little'>),
-ZstdCodec(level=0, checksum=False)],
+codecs=(BytesCodec(endian=<Endian.little: 'little'>),
+ZstdCodec(level=0, checksum=False)),
 attributes={},
 dimension_names=None,
 zarr_format=3,

@@ -65,8 +65,8 @@ that can be used.:
 chunk_key_encoding=DefaultChunkKeyEncoding(name='default',
 separator='/'),
 fill_value=np.float64(0.0),
-codecs=[BytesCodec(endian=<Endian.little: 'little'>),
-ZstdCodec(level=0, checksum=False)],
+codecs=(BytesCodec(endian=<Endian.little: 'little'>),
+ZstdCodec(level=0, checksum=False)),
 attributes={},
 dimension_names=None,
 zarr_format=3,

@@ -78,8 +78,8 @@ that can be used.:
 chunk_key_encoding=DefaultChunkKeyEncoding(name='default',
 separator='/'),
 fill_value=np.float64(0.0),
-codecs=[BytesCodec(endian=<Endian.little: 'little'>),
-ZstdCodec(level=0, checksum=False)],
+codecs=(BytesCodec(endian=<Endian.little: 'little'>),
+ZstdCodec(level=0, checksum=False)),
 attributes={},
 dimension_names=None,
 zarr_format=3,

docs/user-guide/groups.rst

Lines changed: 6 additions & 2 deletions
@@ -109,7 +109,9 @@ property. E.g.::
 Order : C
 Read-only : False
 Store type : MemoryStore
-Codecs : [{'endian': <Endian.little: 'little'>}, {'level': 0, 'checksum': False}]
+Filters : ()
+Serializer : BytesCodec(endian=<Endian.little: 'little'>)
+Compressors : (ZstdCodec(level=0, checksum=False),)
 No. bytes : 8000000 (7.6M)
 No. bytes stored : 1432
 Storage ratio : 5586.6

@@ -123,7 +125,9 @@ property. E.g.::
 Order : C
 Read-only : False
 Store type : MemoryStore
-Codecs : [{'endian': <Endian.little: 'little'>}, {'level': 0, 'checksum': False}]
+Filters : ()
+Serializer : BytesCodec(endian=<Endian.little: 'little'>)
+Compressors : (ZstdCodec(level=0, checksum=False),)
 No. bytes : 4000000 (3.8M)

 Groups also have the :func:`zarr.Group.tree` method, e.g.::
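A short sketch of the workflow the updated groups page describes, assuming an in-memory store and illustrative names:

    import zarr

    # Illustrative group with one member array; names and shapes are made up.
    root = zarr.group(store={})
    root.create_array("foo", shape=(1000, 1000), chunks=(100, 100), dtype="float64")

    # The member's info now reports Filters / Serializer / Compressors
    # instead of a single Codecs line.
    print(root["foo"].info)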

docs/user-guide/performance.rst

Lines changed: 9 additions & 3 deletions
@@ -98,7 +98,9 @@ To use sharding, you need to specify the ``shards`` parameter when creating the
 Order : C
 Read-only : False
 Store type : MemoryStore
-Codecs : [{'chunk_shape': (100, 100, 100), 'codecs': ({'endian': <Endian.little: 'little'>}, {'level': 0, 'checksum': False}), 'index_codecs': ({'endian': <Endian.little: 'little'>}, {}), 'index_location': <ShardingCodecIndexLocation.end: 'end'>}]
+Filters : ()
+Serializer : BytesCodec(endian=<Endian.little: 'little'>)
+Compressors : (ZstdCodec(level=0, checksum=False),)
 No. bytes : 100000000000 (93.1G)

 .. _user-guide-chunks-order:

@@ -125,7 +127,9 @@ ratios, depending on the correlation structure within the data. E.g.::
 Order : C
 Read-only : False
 Store type : MemoryStore
-Codecs : [{'endian': <Endian.little: 'little'>}, {'level': 0, 'checksum': False}]
+Filters : ()
+Serializer : BytesCodec(endian=<Endian.little: 'little'>)
+Compressors : (ZstdCodec(level=0, checksum=False),)
 No. bytes : 400000000 (381.5M)
 No. bytes stored : 342588717
 Storage ratio : 1.2

@@ -142,7 +146,9 @@ ratios, depending on the correlation structure within the data. E.g.::
 Order : F
 Read-only : False
 Store type : MemoryStore
-Codecs : [{'endian': <Endian.little: 'little'>}, {'level': 0, 'checksum': False}]
+Filters : ()
+Serializer : BytesCodec(endian=<Endian.little: 'little'>)
+Compressors : (ZstdCodec(level=0, checksum=False),)
 No. bytes : 400000000 (381.5M)
 No. bytes stored : 342588717
 Storage ratio : 1.2
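The sharding hunk above pairs naturally with a small example. This sketch uses the ``shards`` parameter named in the docs; the store and sizes are illustrative:

    import zarr

    # Each (1000, 1000) shard holds 10*10 chunks of (100, 100); sizes are illustrative.
    z = zarr.create_array(
        store={},
        shape=(10000, 10000),
        shards=(1000, 1000),
        chunks=(100, 100),
        dtype="uint8",
    )

    # .info shows the Filters / Serializer / Compressors pipeline;
    # .info_complete() additionally reports "Shards Initialized" once data is written.
    print(z.info)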

src/zarr/api/synchronous.py

Lines changed: 1 addition & 1 deletion
@@ -802,7 +802,7 @@ def create_array(
         Use ``None`` to omit default filters.
     compressors : Iterable[Codec], optional
         List of compressors to apply to the array. Compressors are applied in order, and after any
-        filters are applied (if any are specified).
+        filters are applied (if any are specified) and the data is serialized into bytes.
 
         For Zarr format 3, a "compressor" is a codec that takes a bytestream, and
         returns another bytestream. Multiple compressors my be provided for Zarr format 3.
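The amended docstring describes the Zarr format 3 pipeline order: filters (array to array), then the serializer (array to bytes), then compressors (bytes to bytes). A hedged sketch of that ordering, with illustrative codec choices and assuming ``create_array`` accepts the ``serializer`` keyword as in the current API:

    import zarr
    from zarr.codecs import BloscCodec, BytesCodec

    z = zarr.create_array(
        store={},
        shape=(100,),
        chunks=(100,),
        dtype="int32",
        filters=None,                        # array -> array codecs (none here)
        serializer=BytesCodec(),             # array -> bytes
        compressors=[BloscCodec(clevel=3)],  # bytes -> bytes, applied last
    )
    print(z.filters, z.serializer, z.compressors)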

src/zarr/core/_info.py

Lines changed: 16 additions & 12 deletions
@@ -5,7 +5,7 @@
 import numcodecs.abc
 import numpy as np

-from zarr.abc.codec import Codec
+from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec
 from zarr.core.common import ZarrFormat
 from zarr.core.metadata.v3 import DataType

@@ -85,9 +85,9 @@ class ArrayInfo:
     _order: Literal["C", "F"]
     _read_only: bool
     _store_type: str
-    _compressor: numcodecs.abc.Codec | None = None
-    _filters: tuple[numcodecs.abc.Codec, ...] | None = None
-    _codecs: list[Codec] | None = None
+    _filters: tuple[numcodecs.abc.Codec, ...] | tuple[ArrayArrayCodec, ...] = ()
+    _serializer: ArrayBytesCodec | None = None
+    _compressors: tuple[numcodecs.abc.Codec, ...] | tuple[BytesBytesCodec, ...] = ()
     _count_bytes: int | None = None
     _count_bytes_stored: int | None = None
     _count_chunks_initialized: int | None = None

@@ -109,18 +109,19 @@ def __repr__(self) -> str:
         Read-only : {_read_only}
         Store type : {_store_type}""")

-        kwargs = dataclasses.asdict(self)
+        # We can't use dataclasses.asdict, because we only want a shallow dict
+        kwargs = {field.name: getattr(self, field.name) for field in dataclasses.fields(self)}
+
         if self._chunk_shape is None:
             # for non-regular chunk grids
             kwargs["chunk_shape"] = "<variable>"
-        if self._compressor is not None:
-            template += "\nCompressor : {_compressor}"

-        if self._filters is not None:
-            template += "\nFilters : {_filters}"
+        template += "\nFilters : {_filters}"
+
+        if self._serializer is not None:
+            template += "\nSerializer : {_serializer}"

-        if self._codecs is not None:
-            template += "\nCodecs : {_codecs}"
+        template += "\nCompressors : {_compressors}"

         if self._count_bytes is not None:
             template += "\nNo. bytes : {_count_bytes}"

@@ -139,5 +140,8 @@ def __repr__(self) -> str:
             kwargs["_storage_ratio"] = f"{self._count_bytes / self._count_bytes_stored:.1f}"

         if self._count_chunks_initialized is not None:
-            template += "\nChunks Initialized : {_count_chunks_initialized}"
+            if self._shard_shape is not None:
+                template += "\nShards Initialized : {_count_chunks_initialized}"
+            else:
+                template += "\nChunks Initialized : {_count_chunks_initialized}"
         return template.format(**kwargs)
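The switch away from ``dataclasses.asdict`` matters because ``asdict`` recurses into nested dataclasses, so codec instances would be flattened into plain dicts before formatting. A standalone illustration with hypothetical classes standing in for a codec and for ArrayInfo:

    import dataclasses

    @dataclasses.dataclass
    class Inner:          # stand-in for a codec dataclass
        level: int = 0

    @dataclasses.dataclass
    class Outer:          # stand-in for ArrayInfo
        codec: Inner = dataclasses.field(default_factory=Inner)

    o = Outer()

    # asdict recurses, so the nested dataclass loses its repr:
    print(dataclasses.asdict(o))   # {'codec': {'level': 0}}

    # the shallow dict used in the new __repr__ keeps the objects intact:
    shallow = {f.name: getattr(o, f.name) for f in dataclasses.fields(o)}
    print(shallow)                 # {'codec': Inner(level=0)}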
