zarr-developers · jakirkham · May 18, 2022 · Dec 17, 2021 · Dec 17, 2021 · Dec 17, 2021
diff --git a/numcodecs/bitround.py b/numcodecs/bitround.py
@@ -0,0 +1,82 @@
+import numpy as np
+
+
+from .abc import Codec
+from .compat import ensure_ndarray, ndarray_copy
+
+# Explicit mantissa (fraction) bits of each supported float dtype, by name.
+max_bits = dict(float16=10, float32=23, float64=52)
+# Signed integer type of matching width used to view a float dtype's bits.
+types = dict(
+    float16=np.int16,
+    float32=np.int32,
+    float64=np.int64,
+)
+# Reverse direction of ``types``: integer dtype name -> float type of the
+# same width.
+inverse = dict(
+    int16=np.float16,
+    int32=np.float32,
+    int64=np.float64,
+)
+
+
+class BitRound(Codec):
+    """Floating-point bit rounding codec
+
+    Drops a specified number of bits from the floating point mantissa,
+    leaving an array more amenable to compression. The number of bits to keep
+    should be determined by an information analysis of the data to be
+    compressed. See https://github.com/zarr-developers/numcodecs/issues/298
+    for discussion, and https://www.nature.com/articles/s43588-021-00156-2
+    for the original implementation in Julia.
+
+    Parameters
+    ----------
+    keepbits: int
+        The number of mantissa bits to keep (zero or positive). The upper
+        limit depends on the input dtype: 10 / 23 / 52 for float16 / float32 /
+        float64. At that limit ``encode`` returns its input unrounded.
+    """
+
+    codec_id = 'bitround'
+
+    def __init__(self, keepbits: int):
+        if keepbits < 0:
+            raise ValueError("keepbits must be zero or positive")
+        self.keepbits = keepbits
+
+    def encode(self, buf):
+        """Round ``buf`` to ``keepbits`` mantissa bits; return same-width ints.
+
+        Rounds to nearest (ties to even) on a copy, leaving ``buf`` untouched.
+        Raises TypeError unless ``buf`` is a 16-64 bit float array, and
+        ValueError if ``keepbits`` exceeds the dtype's mantissa width.
+        """
+        a = ensure_ndarray(buf)
+        # Check the dtype first; the table lookups below only know float names.
+        if a.dtype.kind != "f" or a.dtype.itemsize > 8:
+            raise TypeError("Only float arrays (16-64bit) can be bit-rounded")
+        bits = max_bits[a.dtype.name]
+        if self.keepbits == bits:
+            return a
+        if self.keepbits > bits:
+            raise ValueError("Keepbits too large for given dtype")
+        int_dtype = np.dtype(types[a.dtype.name]).newbyteorder(a.dtype.byteorder)
+        b = a.view(int_dtype).copy()  # copy: never modify the caller's data
+        maskbits = bits - self.keepbits  # per dtype; not float32's 23 for all
+        # Arithmetic shifts of -1 (all ones) zero only the dropped low bits.
+        mask = (np.array(-1, dtype=int_dtype) >> maskbits) << maskbits
+        half_quantum1 = (1 << (maskbits - 1)) - 1
+        # Add half a quantum, plus the lowest kept bit so that ties go to even.
+        b += ((b >> maskbits) & 1) + half_quantum1
+        b &= mask
+        return b
+
+    def decode(self, buf, out=None):
+        """Remake floats of the same itemsize from the ints made by ``encode``."""
+        data = ensure_ndarray(buf)  # first, so plain buffers gain a ``.dtype``
+        if data.dtype.kind != "f":  # float input needs no reinterpretation
+            float_dtype = np.dtype(inverse[data.dtype.name])
+            data = data.view(float_dtype.newbyteorder(data.dtype.byteorder))
+        return ndarray_copy(data, out)
diff --git a/numcodecs/tests/test_bitround.py b/numcodecs/tests/test_bitround.py
@@ -0,0 +1,80 @@
+import numpy as np
+
+import pytest
+
+from numcodecs.bitround import BitRound
+
+# adapted from https://github.com/milankl/BitInformation.jl/blob/main/test/round_nearest.jl
+
+
+@pytest.fixture(params=[np.float32])  # TODO: add other dtypes
+def dtype(request):
+    """Parametrised fixture: each test runs once per dtype listed in ``params``."""
+    return request.param
+
+
+# Explicit mantissa (fraction) width, in bits, of each dtype under test.
+MBITS = {np.float32: np.finfo(np.float32).nmant}
+
+
+def round(data, keepbits):
+    """Round-trip ``data`` through ``BitRound(keepbits)`` encode then decode."""
+    rounder = BitRound(keepbits=keepbits)
+    scratch = data.copy()  # copy first so encode() can never alter the input
+    return rounder.decode(rounder.encode(scratch))
+
+
+def test_round_zero_to_zero(dtype):
+    """Zeros stay exactly zero for every keepbits below the mantissa width."""
+    zeros = np.zeros((3, 2), dtype=dtype)
+    # The Julia original sweeps keepbits over -5..49; BitRound raises
+    # ValueError outside 0..MBITS, so only valid values are looped over here.
+    for keep in range(MBITS[dtype]):
+        rounded = round(zeros, keep)
+        np.testing.assert_equal(zeros, rounded)
+
+
+def test_round_one_to_one(dtype):
+    """Rounding 1.0 must return exactly 1.0 for every keepbits below the maximum."""
+    ones = np.ones((3, 2), dtype=dtype)
+    for keep in range(MBITS[dtype]):
+        np.testing.assert_equal(ones, round(ones, keep))
+
+
+def test_round_minus_one_to_minus_one(dtype):
+    """Rounding -1.0 must return exactly -1.0 for every keepbits below the maximum."""
+    minus_ones = np.full((3, 2), -1, dtype=dtype)
+    for keep in range(MBITS[dtype]):
+        np.testing.assert_equal(minus_ones, round(minus_ones, keep))
+
+
+def test_no_rounding(dtype):
+    """Keeping every mantissa bit must round-trip random data unchanged."""
+    sample = np.random.random_sample((300, 200)).astype(dtype)
+    restored = round(sample, MBITS[dtype])
+    np.testing.assert_equal(sample, restored)
+
+
+APPROX_KEEPBITS = {np.float32: 11}  # keepbits passed to round() in test_approx_equal
+
+
+def test_approx_equal(dtype):
+    """Rounded data stays within Julia's default ``isapprox`` tolerance."""
+    a = np.random.random_sample((300, 200)).astype(dtype)
+    ar = round(a, APPROX_KEEPBITS[dtype])
+    # Mimic julia behavior - https://docs.julialang.org/en/v1/base/math/#Base.isapprox
+    # rtol = sqrt(eps), with eps taken from the dtype under test (not float32).
+    rtol = np.sqrt(np.finfo(dtype).eps)
+    # Keeping k bits errs by <= 2**-(k + 1): float32 passes at 11, not at 10.
+    np.testing.assert_allclose(a, ar, rtol=rtol)
+
+
+def test_idempotence(dtype):
+    """Rounding already-rounded data again must leave it unchanged."""
+    a = np.random.random_sample((300, 200)).astype(dtype)
+    for k in range(20):
+        ar = round(a, k)
+        np.testing.assert_equal(ar, round(ar, k))  # re-round the rounded array
+
+
+# TODO: implement tie_to_even and round_to_nearest