add categorize

alimanfoo · alimanfoo · commit 8ba41ad6a3f9 · 2016-09-16T16:08:14.000+01:00
diff --git a/docs/categorize.rst b/docs/categorize.rst
@@ -0,0 +1,5 @@
+Categorize
+==========
+.. module:: numcodecs.categorize
+
+.. autoclass:: Categorize
diff --git a/docs/index.rst b/docs/index.rst
@@ -30,6 +30,7 @@ Contents
     delta
     fixedscaleoffset
     packbits
+    categorize
     release
 
 Acknowledgments
diff --git a/numcodecs/__init__.py b/numcodecs/__init__.py
@@ -31,3 +31,6 @@
 
 from numcodecs.packbits import PackBits
 register_codec(PackBits)
+
+from numcodecs.categorize import Categorize
+register_codec(Categorize)
diff --git a/numcodecs/categorize.py b/numcodecs/categorize.py
@@ -0,0 +1,114 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, print_function, division
+
+
+from numcodecs.abc import Codec
+from numcodecs.compat import ndarray_from_buffer, buffer_copy, ensure_text, \
+    ensure_bytes
+
+
+import numpy as np
+
+
+class Categorize(Codec):
+    """Filter encoding categorical string data as integers.
+
+    Each value is encoded as the 1-based position of its label; the code
+    0 is reserved for values matching no label, which decode to the zero
+    value of `dtype` (an empty string for string dtypes).
+
+    Parameters
+    ----------
+    labels : sequence of strings
+        Category labels, encoded as the codes 1 to ``len(labels)``.
+    dtype : dtype
+        Data type to use for decoded data.
+    astype : dtype, optional
+        Data type to use for encoded data.
+
+    Examples
+    --------
+    >>> import numcodecs as codecs
+    >>> import numpy as np
+    >>> x = np.array([b'male', b'female', b'female', b'male', b'unexpected'])
+    >>> x
+    array([b'male', b'female', b'female', b'male', b'unexpected'],
+          dtype='|S10')
+    >>> f = codecs.Categorize(labels=[b'female', b'male'], dtype=x.dtype)
+    >>> y = f.encode(x)
+    >>> y
+    array([2, 1, 1, 2, 0], dtype=uint8)
+    >>> z = f.decode(y)
+    >>> z
+    array([b'male', b'female', b'female', b'male', b''],
+          dtype='|S10')
+
+    """
+
+    codec_id = 'categorize'
+
+    def __init__(self, labels, dtype, astype='u1'):
+        self.dtype = np.dtype(dtype)
+        # always keep the labels as a list, typed to match the dtype kind
+        if self.dtype.kind == 'S':
+            self.labels = [ensure_bytes(l) for l in labels]
+        elif self.dtype.kind == 'U':
+            self.labels = [ensure_text(l) for l in labels]
+        else:
+            self.labels = list(labels)
+        self.astype = np.dtype(astype)
+
+    def encode(self, buf):
+        """Return an `astype` array of label codes for the values in `buf`."""
+        arr = ndarray_from_buffer(buf, self.dtype)
+        # 0 stays in place for any value not found in labels
+        enc = np.zeros_like(arr, dtype=self.astype)
+        for i, l in enumerate(self.labels):
+            enc[arr == l] = i + 1
+        return enc
+
+    def decode(self, buf, out=None):
+        """Return the label values for the codes in `buf`, writing them
+        directly into `out` when it is an ndarray."""
+        enc = ndarray_from_buffer(buf, self.astype)
+
+        if isinstance(out, np.ndarray):
+            # optimization, decode directly to output
+            # NOTE(review): reshape may copy a non-contiguous `out` - confirm
+            dec = out.reshape(-1, order='A')
+            # clear stale content so code 0 decodes to the zero value here too
+            dec[...] = np.zeros((), dtype=dec.dtype)
+            copy_needed = False
+        else:
+            dec = np.zeros_like(enc, dtype=self.dtype)
+            copy_needed = True
+
+        for i, l in enumerate(self.labels):
+            dec[enc == (i + 1)] = l
+
+        if copy_needed:
+            dec = buffer_copy(dec, out)
+
+        return dec
+
+    def get_config(self):
+        """Return the codec configuration as a dict."""
+        # bytes labels are converted to text for the config
+        if self.dtype.kind == 'S':
+            labels = [ensure_text(l) for l in self.labels]
+        else:
+            labels = self.labels
+        return dict(
+            id=self.codec_id,
+            labels=labels,
+            dtype=self.dtype.str,
+            astype=self.astype.str
+        )
+
+    def __repr__(self):
+        # make sure labels part is not too long
+        labels = repr(self.labels[:3])
+        if len(self.labels) > 3:
+            labels = labels[:-1] + ', ...]'
+        return '%s(dtype=%r, astype=%r, labels=%s)' % \
+            (type(self).__name__, self.dtype.str, self.astype.str, labels)
diff --git a/numcodecs/compat.py b/numcodecs/compat.py
@@ -89,3 +89,17 @@ def ndarray_from_buffer(buf, dtype):
     else:
         arr = np.frombuffer(buf, dtype=dtype)
     return arr
+
+
+def ensure_bytes(l, encoding='utf-8'):
+    """Return `l` unchanged if it is already bytes, otherwise encode it."""
+    if not isinstance(l, binary_type):
+        return l.encode(encoding=encoding)
+    return l
+
+
+def ensure_text(l, encoding='utf-8'):
+    """Return `l` unchanged if it is already text, otherwise decode it."""
+    if not isinstance(l, text_type):
+        return text_type(l, encoding=encoding)
+    return l
diff --git a/numcodecs/tests/test_categorize.py b/numcodecs/tests/test_categorize.py
@@ -0,0 +1,118 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, print_function, division
+
+
+import numpy as np
+from numpy.testing import assert_array_equal
+from nose.tools import eq_ as eq
+
+
+from numcodecs.categorize import Categorize
+from numcodecs.tests.common import check_encode_decode, check_config
+from numcodecs.compat import PY2
+
+
+labels = [b'foo', b'bar', b'baz', b'quux']
+labels_u = [u'ƒöõ', u'ßàř', u'ßāẑ', u'ƪùüx']
+labels_num = [1000000, 2000000, 3000000]
+
+
+def _random_arrays(choices):
+    """Return arrays of random draws from `choices`.
+
+    The arrays cover 1, 2 and 3 dimensions plus a Fortran-order reshape.
+    """
+    return [
+        np.random.choice(choices, size=1000),
+        np.random.choice(choices, size=(100, 10)),
+        np.random.choice(choices, size=(10, 10, 10)),
+        np.random.choice(choices, size=1000).reshape(100, 10, order='F'),
+    ]
+
+
+arrays = _random_arrays(labels)
+arrays_u = _random_arrays(labels_u)
+arrays_num = _random_arrays(labels_num)
+
+
+def test_encode_decode():
+    """Run the common encode/decode check for each kind of label dtype."""
+
+    cases = (
+        # string dtype
+        (labels, arrays),
+        # unicode dtype
+        (labels_u, arrays_u),
+        # other dtype
+        (labels_num, arrays_num),
+    )
+
+    for case_labels, case_arrays in cases:
+        for arr in case_arrays:
+            codec = Categorize(case_labels, dtype=arr.dtype)
+            check_encode_decode(arr, codec)
+
+
+def test_encode():
+    """Values missing from the labels encode to 0 and decode to b''."""
+    data = np.array([b'foo', b'bar', b'foo', b'baz', b'quux'])
+    # leave out the last label so that b'quux' is an unexpected value
+    codec = Categorize(labels=labels[:-1], dtype=data.dtype, astype='u1')
+    # codes are 1-based label positions, 0 marks the unexpected value
+    encoded = codec.encode(data)
+    want = np.array([1, 2, 1, 3, 0], dtype='u1')
+    assert_array_equal(want, encoded)
+    eq(want.dtype, encoded.dtype)
+
+    # the unexpected value decodes to an empty string
+    decoded = codec.decode(encoded)
+    want = data.copy()
+    want[data == b'quux'] = b''
+    assert_array_equal(want, decoded)
+    eq(data.dtype, decoded.dtype)
+
+
+def test_encode_unicode():
+    """Unicode values missing from the labels encode to 0, decode to u''."""
+    data = np.array([u'ƒöõ', u'ßàř', u'ƒöõ', u'ßāẑ', u'ƪùüx'])
+    # leave out the last label so that u'ƪùüx' is an unexpected value
+    codec = Categorize(labels=labels_u[:-1], dtype=data.dtype, astype='u1')
+    # codes are 1-based label positions, 0 marks the unexpected value
+    encoded = codec.encode(data)
+    want = np.array([1, 2, 1, 3, 0], dtype='u1')
+    assert_array_equal(want, encoded)
+    eq(want.dtype, encoded.dtype)
+
+    # the unexpected value decodes to an empty string
+    decoded = codec.decode(encoded)
+    want = data.copy()
+    want[data == u'ƪùüx'] = u''
+    assert_array_equal(want, decoded)
+    eq(data.dtype, decoded.dtype)
+
+
+def test_config():
+    # check the config of both a bytes and a unicode codec
+    for case_labels, case_dtype in ((labels, 'S4'), (labels_u, 'U4')):
+        codec = Categorize(labels=case_labels, dtype=case_dtype)
+        check_config(codec)
+
+
+def test_repr():
+    """Check repr() of the codec, which shows at most three labels."""
+
+    # the expected strings use the Python 3 spelling of bytes/text literals
+    if PY2:
+        return
+
+    # bytes labels
+    codec = Categorize(labels=labels, dtype='|S5', astype='|u1')
+    expect = ("Categorize(dtype='|S5', astype='|u1', "
+              "labels=[b'foo', b'bar', b'baz', ...])")
+    eq(expect, repr(codec))
+
+    # unicode labels
+    codec = Categorize(labels=labels_u, dtype='<U5', astype='|u1')
+    expect = ("Categorize(dtype='<U5', astype='|u1', "
+              "labels=['ƒöõ', 'ßàř', 'ßāẑ', ...])")
+    eq(expect, repr(codec))