Skip to content

Commit d2b68a2

Browse files
committed
test filters; fix packbits filter
1 parent b12e7b7 commit d2b68a2

File tree

7 files changed

+297
-76
lines changed

7 files changed

+297
-76
lines changed

docs/api/filters.rst

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@ Filters (``zarr.filters``)
22
==========================
33
.. module:: zarr.filters
44

5-
TODO
6-
75
.. autoclass:: DeltaFilter
8-
.. autoclass:: ScaleOffsetFilter
6+
.. autoclass:: FixedScaleOffsetFilter
97
.. autoclass:: QuantizeFilter
8+
.. autoclass:: PackBitsFilter
9+
.. autoclass:: CategoryFilter

docs/tutorial.rst

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -247,26 +247,27 @@ values may increase compression ratio. Some compressors provide built-in
247247
filters that apply transformations to the data prior to compression. For
248248
example, the Blosc compressor has highly optimized built-in implementations of
249249
byte- and bit-shuffle filters, and the LZMA compressor has a built-in
250-
implementation of a delta filter. However, to provide some additional
250+
implementation of a delta filter. However, to provide additional
251251
flexibility for implementing and using filters in combination with different
252252
compressors, Zarr also provides a mechanism for configuring filters outside of
253253
the primary compressor.
254254

255255
Here is an example using the Zarr delta filter with the Blosc compressor:
256256

257-
>>> filters = [zarr.DeltaFilter(dec_dtype='i4', enc_dtype='i4')]
257+
>>> filters = [zarr.DeltaFilter(dtype='i4', astype='u1')]
258258
>>> z = zarr.array(np.arange(100000000, dtype='i4').reshape(10000, 10000),
259259
... chunks=(1000, 1000), filters=filters, compression='blosc',
260-
... compression_opts=dict(cname='lz4', clevel=5, shuffle=0))
260+
... compression_opts=dict(cname='zstd', clevel=1, shuffle=0))
261261
>>> z
262262
zarr.core.Array((10000, 10000), int32, chunks=(1000, 1000), order=C)
263-
compression: blosc; compression_opts: {'clevel': 5, 'cname': 'lz4', 'shuffle': 0}
264-
nbytes: 381.5M; nbytes_stored: 3.4M; ratio: 112.8; initialized: 100/100
263+
compression: blosc; compression_opts: {'clevel': 1, 'cname': 'zstd', 'shuffle': 0}
264+
nbytes: 381.5M; nbytes_stored: 34.8K; ratio: 11221.5; initialized: 100/100
265265
filters: delta
266266
store: builtins.dict
267267

268-
Zarr comes with implementations of delta, scale-offset and quantize filters.
269-
For more information see the :mod:`zarr.filters` API docs.
268+
Zarr comes with implementations of delta, scale-offset, quantize, packbits and
269+
category filters. It is also relatively straightforward to implement custom
270+
filters. For more information see the :mod:`zarr.filters` API docs.
270271

271272
Parallel computing and synchronization
272273
--------------------------------------

zarr/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
from zarr.sync import ThreadSynchronizer, ProcessSynchronizer
1515
from zarr.version import version as __version__
1616
from zarr.filters import DeltaFilter, FixedScaleOffsetFilter, \
17-
QuantizeFilter, PackBitsFilter
17+
QuantizeFilter, PackBitsFilter, CategoryFilter
1818

1919

2020
try:

zarr/filters.py

Lines changed: 114 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
from zarr.meta import encode_dtype, decode_dtype
1010
from zarr.compressors import registry as compressor_registry
11+
from zarr.compat import text_type, binary_type
1112

1213

1314
filter_registry = dict()
@@ -31,6 +32,12 @@ class DeltaFilter(object):
3132
astype : dtype, optional
3233
Data type to use for encoded data.
3334
35+
Notes
36+
-----
37+
If `astype` is an integer data type, please ensure that it is
38+
sufficiently large to store encoded values. No checks are made and data
39+
may become corrupted due to integer overflow if `astype` is too small.
40+
3441
Examples
3542
--------
3643
>>> import zarr
@@ -86,7 +93,7 @@ def get_filter_config(self):
8693
def from_filter_config(cls, config):
8794
dtype = decode_dtype(config['dtype'])
8895
astype = decode_dtype(config['astype'])
89-
return cls(dtype=dtype, asdtype=astype)
96+
return cls(dtype=dtype, astype=astype)
9097

9198

9299
filter_registry[DeltaFilter.filter_name] = DeltaFilter
@@ -109,6 +116,12 @@ class FixedScaleOffsetFilter(object):
109116
astype : dtype, optional
110117
Data type to use for encoded data.
111118
119+
Notes
120+
-----
121+
If `astype` is an integer data type, please ensure that it is
122+
sufficiently large to store encoded values. No checks are made and data
123+
may become corrupted due to integer overflow if `astype` is too small.
124+
112125
Examples
113126
--------
114127
>>> import zarr
@@ -248,6 +261,8 @@ def __init__(self, digits, dtype, astype=None):
248261
self.astype = self.dtype
249262
else:
250263
self.astype = np.dtype(astype)
264+
if self.dtype.kind != 'f' or self.astype.kind != 'f':
265+
raise ValueError('only floating point data types are supported')
251266

252267
def encode(self, buf):
253268
# interpret buffer as 1D array
@@ -324,11 +339,17 @@ def encode(self, buf):
324339
arr = _ndarray_from_buffer(buf, bool)
325340
# determine size of packed data
326341
n = arr.size
327-
n_bytes_packed = (n // 8) + 1
328-
n_bits_padded = n % 8
342+
n_bytes_packed = (n // 8)
343+
n_bits_leftover = n % 8
344+
if n_bits_leftover > 0:
345+
n_bytes_packed += 1
329346
# setup output
330347
enc = np.empty(n_bytes_packed + 1, dtype='u1')
331348
# remember how many bits were padded
349+
if n_bits_leftover:
350+
n_bits_padded = 8 - n_bits_leftover
351+
else:
352+
n_bits_padded = 0
332353
enc[0] = n_bits_padded
333354
# apply encoding
334355
enc[1:] = np.packbits(arr)
@@ -342,7 +363,8 @@ def decode(self, buf):
342363
# apply decoding
343364
dec = np.unpackbits(enc[1:])
344365
# remove padded bits
345-
dec = dec[:-n_bits_padded]
366+
if n_bits_padded:
367+
dec = dec[:-n_bits_padded]
346368
# view as boolean array
347369
dec = dec.view(bool)
348370
return dec
@@ -360,6 +382,94 @@ def from_filter_config(cls, config):
360382
filter_registry[PackBitsFilter.filter_name] = PackBitsFilter
361383

362384

385+
def _ensure_bytes(l):
    """Return `l` as a bytes object.

    Parameters
    ----------
    l : bytes or text
        Value to coerce. Bytes are returned unchanged; text is encoded
        using the ASCII codec.

    Returns
    -------
    bytes
        The value as a bytes object.

    Raises
    ------
    ValueError
        If `l` is neither a bytes nor a text object.

    """
    if isinstance(l, binary_type):
        return l
    if isinstance(l, text_type):
        return l.encode('ascii')
    # wrap the value in a 1-tuple so that a tuple argument is formatted as a
    # single value instead of being unpacked as several format arguments
    # (which would raise TypeError rather than the intended ValueError)
    raise ValueError('expected bytes, found %r' % (l,))
392+
393+
394+
class CategoryFilter(object):
    """Filter encoding categorical string data as integers.

    Each label is encoded as its 1-based position in `labels`. The code 0 is
    reserved for values that are not present in `labels`; such values decode
    to the empty string, so a round trip is lossy for unexpected values.

    Parameters
    ----------
    labels : sequence of strings
        Category labels. Text labels are converted to bytes using ASCII.
    dtype : dtype
        Data type to use for decoded data. Must be a byte string type.
    astype : dtype, optional
        Data type to use for encoded data.

    Raises
    ------
    ValueError
        If `dtype` is not a byte string data type, or if `astype` is an
        integer data type too small to hold one code per label.

    Examples
    --------
    >>> import zarr
    >>> import numpy as np
    >>> x = np.array([b'male', b'female', b'female', b'male', b'unexpected'])
    >>> x
    array([b'male', b'female', b'female', b'male', b'unexpected'],
          dtype='|S10')
    >>> f = zarr.CategoryFilter(labels=[b'female', b'male'], dtype=x.dtype)
    >>> y = f.encode(x)
    >>> y
    array([2, 1, 1, 2, 0], dtype=uint8)
    >>> z = f.decode(y)
    >>> z
    array([b'male', b'female', b'female', b'male', b''],
          dtype='|S10')

    """

    filter_name = 'category'

    def __init__(self, labels, dtype, astype='u1'):
        self.labels = [_ensure_bytes(label) for label in labels]
        self.dtype = np.dtype(dtype)
        if self.dtype.kind != 'S':
            raise ValueError('only string data types are supported')
        self.astype = np.dtype(astype)
        # codes run from 1 to len(labels) because 0 is reserved, so the
        # largest code must fit in an integer astype; fail early here rather
        # than overflowing later inside encode()
        if self.astype.kind in 'iu':
            max_code = np.iinfo(self.astype).max
            if len(self.labels) > max_code:
                raise ValueError(
                    'astype %s can encode at most %s labels, found %s'
                    % (self.astype, max_code, len(self.labels))
                )

    def encode(self, buf):
        """Encode string values in `buf` as integer category codes."""
        # view input as ndarray
        arr = _ndarray_from_buffer(buf, self.dtype)
        # setup output array; zeros mark values not specified in labels
        enc = np.zeros_like(arr, dtype=self.astype)
        # apply encoding, reserving 0 for values not specified in labels
        for code, label in enumerate(self.labels, start=1):
            enc[arr == label] = code
        return enc

    def decode(self, buf):
        """Decode integer category codes in `buf` back to string values."""
        # view encoded data as ndarray
        enc = _ndarray_from_buffer(buf, self.astype)
        # setup output; code 0 (unknown value) is left as the empty string
        dec = np.zeros_like(enc, dtype=self.dtype)
        # apply decoding
        for code, label in enumerate(self.labels, start=1):
            dec[enc == code] = label
        return dec

    def get_filter_config(self):
        """Return a dict describing this filter's configuration."""
        config = dict()
        config['name'] = self.filter_name
        # labels are held as bytes; store them as text in the configuration
        config['labels'] = [text_type(label, 'ascii') for label in self.labels]
        config['dtype'] = encode_dtype(self.dtype)
        config['astype'] = encode_dtype(self.astype)
        return config

    @classmethod
    def from_filter_config(cls, config):
        """Create a filter from a dict produced by `get_filter_config`."""
        dtype = decode_dtype(config['dtype'])
        astype = decode_dtype(config['astype'])
        labels = config['labels']
        return cls(labels=labels, dtype=dtype, astype=astype)
468+
469+
470+
filter_registry[CategoryFilter.filter_name] = CategoryFilter
471+
472+
363473
# add in compressors as filters
364474
for cls in compressor_registry.values():
365475
if hasattr(cls, 'filter_name'):

zarr/tests/test_core.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
from zarr.errors import ReadOnlyError
2020
from zarr.compat import PY2
2121
from zarr.util import buffer_size
22-
from zarr.filters import DeltaFilter, ScaleOffsetFilter, QuantizeFilter
22+
from zarr.filters import DeltaFilter, FixedScaleOffsetFilter
2323

2424

2525
compression_configs = [
@@ -654,9 +654,8 @@ def create_array(store=None, read_only=False, chunk_store=None, **kwargs):
654654
chunk_store = store
655655
dtype = kwargs.get('dtype', None)
656656
filters = [
657-
DeltaFilter(astype=dtype, dtype=dtype),
658-
ScaleOffsetFilter(enc_dtype=dtype, dec_dtype=dtype, scale=1,
659-
offset=0),
657+
DeltaFilter(dtype=dtype),
658+
FixedScaleOffsetFilter(dtype=dtype, scale=1, offset=0),
660659
]
661660
kwargs.setdefault('filters', filters)
662661
init_array(store, chunk_store=chunk_store, **kwargs)
@@ -671,8 +670,8 @@ def test_repr(self):
671670
# flake8: noqa
672671
expect = """zarr.core.Array((100,), float32, chunks=(10,), order=C)
673672
compression: zlib; compression_opts: 1
674-
nbytes: 400; nbytes_stored: 514; ratio: 0.8; initialized: 0/10
675-
filters: delta, scaleoffset
673+
nbytes: 400; nbytes_stored: 505; ratio: 0.8; initialized: 0/10
674+
filters: delta, fixedscaleoffset
676675
store: builtins.dict
677676
chunk_store: builtins.dict
678677
"""

0 commit comments

Comments
 (0)