test filters; fix packbits filter

alimanfoo · alimanfoo · commit d2b68a2901f4 · 2016-08-31T20:30:52.000+01:00
diff --git a/docs/api/filters.rst b/docs/api/filters.rst
@@ -2,8 +2,8 @@ Filters (``zarr.filters``)
 ==========================
 .. module:: zarr.filters
 
-TODO
-
 .. autoclass:: DeltaFilter
-.. autoclass:: ScaleOffsetFilter
+.. autoclass:: FixedScaleOffsetFilter
 .. autoclass:: QuantizeFilter
+.. autoclass:: PackBitsFilter
+.. autoclass:: CategoryFilter
diff --git a/docs/tutorial.rst b/docs/tutorial.rst
@@ -247,26 +247,27 @@ values may increase compression ratio. Some compressors provide built-in
 filters that apply transformations to the data prior to compression. For
 example, the Blosc compressor has highly optimized built-in implementations of
 byte- and bit-shuffle filters, and the LZMA compressor has a built-in
-implementation of a delta filter. However, to provide some additional
+implementation of a delta filter. However, to provide additional
 flexibility for implementing and using filters in combination with different
 compressors, Zarr also provides a mechanism for configuring filters outside of
 the primary compressor.
 
 Here is an example using the Zarr delta filter with the Blosc compressor:
 
-    >>> filters = [zarr.DeltaFilter(dec_dtype='i4', enc_dtype='i4')]
+    >>> filters = [zarr.DeltaFilter(dtype='i4', astype='u1')]
     >>> z = zarr.array(np.arange(100000000, dtype='i4').reshape(10000, 10000),
     ...                chunks=(1000, 1000), filters=filters, compression='blosc',
-    ...                compression_opts=dict(cname='lz4', clevel=5, shuffle=0))
+    ...                compression_opts=dict(cname='zstd', clevel=1, shuffle=0))
     >>> z
     zarr.core.Array((10000, 10000), int32, chunks=(1000, 1000), order=C)
-      compression: blosc; compression_opts: {'clevel': 5, 'cname': 'lz4', 'shuffle': 0}
-      nbytes: 381.5M; nbytes_stored: 3.4M; ratio: 112.8; initialized: 100/100
+      compression: blosc; compression_opts: {'clevel': 1, 'cname': 'zstd', 'shuffle': 0}
+      nbytes: 381.5M; nbytes_stored: 34.8K; ratio: 11221.5; initialized: 100/100
       filters: delta
       store: builtins.dict
 
-Zarr comes with implementations of delta, scale-offset and quantize filters.
-For more information see the :mod:`zarr.filters` API docs.
+Zarr comes with implementations of delta, scale-offset, quantize, packbits and
+category filters. It is also relatively straightforward to implement custom
+filters. For more information see the :mod:`zarr.filters` API docs.
 
 Parallel computing and synchronization
 --------------------------------------
diff --git a/zarr/__init__.py b/zarr/__init__.py
@@ -14,7 +14,7 @@
 from zarr.sync import ThreadSynchronizer, ProcessSynchronizer
 from zarr.version import version as __version__
 from zarr.filters import DeltaFilter, FixedScaleOffsetFilter, \
-    QuantizeFilter, PackBitsFilter
+    QuantizeFilter, PackBitsFilter, CategoryFilter
 
 
 try:
diff --git a/zarr/filters.py b/zarr/filters.py
@@ -8,6 +8,7 @@
 
 from zarr.meta import encode_dtype, decode_dtype
 from zarr.compressors import registry as compressor_registry
+from zarr.compat import text_type, binary_type
 
 
 filter_registry = dict()
@@ -31,6 +32,12 @@ class DeltaFilter(object):
     astype : dtype, optional
         Data type to use for encoded data.
 
+    Notes
+    -----
+    If `astype` is an integer data type, please ensure that it is
+    sufficiently large to store encoded values. No checks are made and data
+    may become corrupted due to integer overflow if `astype` is too small.
+
     Examples
     --------
     >>> import zarr
@@ -86,7 +93,7 @@ def get_filter_config(self):
     def from_filter_config(cls, config):
         dtype = decode_dtype(config['dtype'])
         astype = decode_dtype(config['astype'])
-        return cls(dtype=dtype, asdtype=astype)
+        return cls(dtype=dtype, astype=astype)
 
 
 filter_registry[DeltaFilter.filter_name] = DeltaFilter
@@ -109,6 +116,12 @@ class FixedScaleOffsetFilter(object):
     astype : dtype, optional
         Data type to use for encoded data.
 
+    Notes
+    -----
+    If `astype` is an integer data type, please ensure that it is
+    sufficiently large to store encoded values. No checks are made and data
+    may become corrupted due to integer overflow if `astype` is too small.
+
     Examples
     --------
     >>> import zarr
@@ -248,6 +261,8 @@ def __init__(self, digits, dtype, astype=None):
             self.astype = self.dtype
         else:
             self.astype = np.dtype(astype)
+        if self.dtype.kind != 'f' or self.astype.kind != 'f':
+            raise ValueError('only floating point data types are supported')
 
     def encode(self, buf):
         # interpret buffer as 1D array
@@ -324,11 +339,17 @@ def encode(self, buf):
         arr = _ndarray_from_buffer(buf, bool)
         # determine size of packed data
         n = arr.size
-        n_bytes_packed = (n // 8) + 1
-        n_bits_padded = n % 8
+        n_bytes_packed = (n // 8)
+        n_bits_leftover = n % 8
+        if n_bits_leftover > 0:
+            n_bytes_packed += 1
         # setup output
         enc = np.empty(n_bytes_packed + 1, dtype='u1')
         # remember how many bits were padded
+        if n_bits_leftover:
+            n_bits_padded = 8 - n_bits_leftover
+        else:
+            n_bits_padded = 0
         enc[0] = n_bits_padded
         # apply encoding
         enc[1:] = np.packbits(arr)
@@ -342,7 +363,8 @@ def decode(self, buf):
         # apply decoding
         dec = np.unpackbits(enc[1:])
         # remove padded bits
-        dec = dec[:-n_bits_padded]
+        if n_bits_padded:
+            dec = dec[:-n_bits_padded]
         # view as boolean array
         dec = dec.view(bool)
         return dec
@@ -360,6 +382,94 @@ def from_filter_config(cls, config):
 filter_registry[PackBitsFilter.filter_name] = PackBitsFilter
 
 
+def _ensure_bytes(l):
+    """Return `l` as bytes, ASCII-encoding text; raise ValueError otherwise."""
+    if isinstance(l, binary_type):
+        return l
+    if isinstance(l, text_type):
+        return l.encode('ascii')
+    raise ValueError('expected bytes, found %r' % (l,))  # tuple-safe format
+
+
+class CategoryFilter(object):
+    """Filter encoding categorical string data as integers.
+
+    Parameters
+    ----------
+    labels : sequence of strings
+        Category labels; the i-th label (0-based) is encoded as i + 1.
+    dtype : dtype
+        Data type to use for decoded data; only 'S' kinds are accepted.
+    astype : dtype, optional
+        Data type to use for encoded data. Defaults to 'u1'.
+
+    Examples
+    --------
+    >>> import zarr
+    >>> import numpy as np
+    >>> x = np.array([b'male', b'female', b'female', b'male', b'unexpected'])
+    >>> x
+    array([b'male', b'female', b'female', b'male', b'unexpected'],
+          dtype='|S10')
+    >>> f = zarr.CategoryFilter(labels=[b'female', b'male'], dtype=x.dtype)
+    >>> y = f.encode(x)
+    >>> y
+    array([2, 1, 1, 2, 0], dtype=uint8)
+    >>> z = f.decode(y)
+    >>> z
+    array([b'male', b'female', b'female', b'male', b''],
+          dtype='|S10')
+
+    """
+
+    filter_name = 'category'
+
+    def __init__(self, labels, dtype, astype='u1'):
+        self.labels = [_ensure_bytes(l) for l in labels]  # normalise to bytes
+        self.dtype = np.dtype(dtype)
+        if self.dtype.kind != 'S':
+            raise ValueError('only string data types are supported')
+        self.astype = np.dtype(astype)
+
+    def encode(self, buf):
+        # view input as ndarray
+        arr = _ndarray_from_buffer(buf, self.dtype)
+        # setup output array
+        enc = np.zeros_like(arr, dtype=self.astype)
+        # apply encoding, reserving 0 for values not specified in labels
+        for i, l in enumerate(self.labels):
+            enc[arr == l] = i + 1
+        return enc
+
+    def decode(self, buf):
+        # view encoded data as ndarray
+        enc = _ndarray_from_buffer(buf, self.astype)
+        # setup output; codes matching no label stay as the empty string
+        dec = np.zeros_like(enc, dtype=self.dtype)
+        # apply decoding; label i was stored as code i + 1
+        for i, l in enumerate(self.labels):
+            dec[enc == (i + 1)] = l
+        return dec
+
+    def get_filter_config(self):
+        config = dict()
+        config['name'] = self.filter_name
+        config['labels'] = [text_type(l, 'ascii') for l in self.labels]
+        config['dtype'] = encode_dtype(self.dtype)
+        config['astype'] = encode_dtype(self.astype)
+        return config
+
+    @classmethod
+    def from_filter_config(cls, config):
+        dtype = decode_dtype(config['dtype'])
+        astype = decode_dtype(config['astype'])
+        labels = config['labels']
+        return cls(labels=labels, dtype=dtype, astype=astype)
+
+
+filter_registry[CategoryFilter.filter_name] = CategoryFilter
+
+
 # add in compressors as filters
 for cls in compressor_registry.values():
     if hasattr(cls, 'filter_name'):
diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py
@@ -19,7 +19,7 @@
 from zarr.errors import ReadOnlyError
 from zarr.compat import PY2
 from zarr.util import buffer_size
-from zarr.filters import DeltaFilter, ScaleOffsetFilter, QuantizeFilter
+from zarr.filters import DeltaFilter, FixedScaleOffsetFilter
 
 
 compression_configs = [
@@ -654,9 +654,8 @@ def create_array(store=None, read_only=False, chunk_store=None, **kwargs):
             chunk_store = store
         dtype = kwargs.get('dtype', None)
         filters = [
-            DeltaFilter(astype=dtype, dtype=dtype),
-            ScaleOffsetFilter(enc_dtype=dtype, dec_dtype=dtype, scale=1,
-                              offset=0),
+            DeltaFilter(dtype=dtype),
+            FixedScaleOffsetFilter(dtype=dtype, scale=1, offset=0),
         ]
         kwargs.setdefault('filters', filters)
         init_array(store, chunk_store=chunk_store, **kwargs)
@@ -671,8 +670,8 @@ def test_repr(self):
             # flake8: noqa
             expect = """zarr.core.Array((100,), float32, chunks=(10,), order=C)
   compression: zlib; compression_opts: 1
-  nbytes: 400; nbytes_stored: 514; ratio: 0.8; initialized: 0/10
-  filters: delta, scaleoffset
+  nbytes: 400; nbytes_stored: 505; ratio: 0.8; initialized: 0/10
+  filters: delta, fixedscaleoffset
   store: builtins.dict
   chunk_store: builtins.dict
 """
diff --git a/zarr/tests/test_filters.py b/zarr/tests/test_filters.py
diff --git a/zarr/tests/test_meta.py b/zarr/tests/test_meta.py