From cdb77b2d943de460b69378ac0b3bb57fc48bafc7 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Fri, 17 Dec 2021 09:58:47 -0500 Subject: [PATCH 01/24] added new codec --- numcodecs/bitround.py | 28 ++++++++++++++++++++++++++++ numcodecs/tests/test_bitround.py | 20 ++++++++++++++++++++ 2 files changed, 48 insertions(+) create mode 100644 numcodecs/bitround.py create mode 100644 numcodecs/tests/test_bitround.py diff --git a/numcodecs/bitround.py b/numcodecs/bitround.py new file mode 100644 index 00000000..36c2cd9e --- /dev/null +++ b/numcodecs/bitround.py @@ -0,0 +1,28 @@ +import numpy as np + + +from .abc import Codec +from .compat import ensure_ndarray, ndarray_copy + + +class BitRound(Codec): + codec_id = 'bitround' + + def __init__(self, keepbits: int): + self.keepbits = keepbits + + def encode(self, buf): + a = ensure_ndarray(buf).view() + assert a.dtype == np.float32 + b = a.view(dtype=np.int32) + maskbits = 23 - self.keepbits + mask = (0xFFFFFFFF >> maskbits) << maskbits + half_quantum1 = (1 << (maskbits - 1)) - 1 + b += ((b >> maskbits) & 1) + half_quantum1 + b &= mask + return b + + def decode(self, buf, out=None): + data = ensure_ndarray(buf).view(np.float32) + out = ndarray_copy(data, out) + return out diff --git a/numcodecs/tests/test_bitround.py b/numcodecs/tests/test_bitround.py new file mode 100644 index 00000000..f829a7c7 --- /dev/null +++ b/numcodecs/tests/test_bitround.py @@ -0,0 +1,20 @@ +import numpy as np + +import pytest + +from numcodecs.bitround import BitRound +from numcodecs.tests.common import check_encode_decode, check_config, \ + check_repr, check_backwards_compatibility + +# adapted from https://github.com/milankl/BitInformation.jl/blob/main/test/round_nearest.jl + + +@pytest.fixture(params=[np.float32]) +def dtype(request): + return request.param + + +def test_smoke(dtype): + data = np.zeros((3, 2), dtype=dtype) + codec = BitRound(keepbits=10) + codec.decode(codec.encode(data)) From c0b6347464a96a683eb6ff9e11821387ba1daebc Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Fri, 17 Dec 2021 11:29:10 -0500 Subject: [PATCH 02/24] wrote some tests --- numcodecs/bitround.py | 3 ++ numcodecs/tests/test_bitround.py | 70 +++++++++++++++++++++++++++++--- 2 files changed, 67 insertions(+), 6 deletions(-) diff --git a/numcodecs/bitround.py b/numcodecs/bitround.py index 36c2cd9e..9540f746 100644 --- a/numcodecs/bitround.py +++ b/numcodecs/bitround.py @@ -12,6 +12,9 @@ def __init__(self, keepbits: int): self.keepbits = keepbits def encode(self, buf): + # TODO: figure out if we need to make a copy + # Currently this appears to be overwriting the input buffer + # Is that the right behavior? a = ensure_ndarray(buf).view() assert a.dtype == np.float32 b = a.view(dtype=np.int32) diff --git a/numcodecs/tests/test_bitround.py b/numcodecs/tests/test_bitround.py index f829a7c7..b896295e 100644 --- a/numcodecs/tests/test_bitround.py +++ b/numcodecs/tests/test_bitround.py @@ -3,18 +3,76 @@ import pytest from numcodecs.bitround import BitRound -from numcodecs.tests.common import check_encode_decode, check_config, \ - check_repr, check_backwards_compatibility # adapted from https://github.com/milankl/BitInformation.jl/blob/main/test/round_nearest.jl +# TODO: add other dtypes @pytest.fixture(params=[np.float32]) def dtype(request): return request.param -def test_smoke(dtype): - data = np.zeros((3, 2), dtype=dtype) - codec = BitRound(keepbits=10) - codec.decode(codec.encode(data)) +# number of mantissa bits for each dtype +MBITS = {np.float32: 23} + + +def round(data, keepbits): + codec = BitRound(keepbits=keepbits) + data = data.copy() # otherwise overwrites the input + encoded = codec.encode(data) + return codec.decode(encoded) + + +def test_round_zero_to_zero(dtype): + a = np.zeros((3, 2), dtype=dtype) + # Don't understand Milan's original test: + # How is it possible to have negative keepbits? + # for k in range(-5, 50): + for k in range(0, MBITS[dtype]): + ar = round(a, k) + np.testing.assert_equal(a, ar) + + +def test_round_one_to_one(dtype): + a = np.ones((3, 2), dtype=dtype) + for k in range(0, MBITS[dtype]): + ar = round(a, k) + np.testing.assert_equal(a, ar) + + +def test_round_minus_one_to_minus_one(dtype): + a = -np.ones((3, 2), dtype=dtype) + for k in range(0, MBITS[dtype]): + ar = round(a, k) + np.testing.assert_equal(a, ar) + + +# This triggers a 'negative shift count' error in the codec +def test_no_rounding(dtype): + a = np.random.random_sample((300, 200)).astype(dtype) + keepbits = MBITS[dtype] + ar = round(a, keepbits) + np.testing.assert_equal(a, ar) + + +APPROX_KEEPBITS = {np.float32: 10} + + +# This does not pass at the default tolerance of allclose +# How is it different from Julia's ≈ operator? +def test_approx_equal(dtype): + a = np.random.random_sample((300, 200)).astype(dtype) + ar = round(a, APPROX_KEEPBITS[dtype]) + np.testing.assert_allclose(a, ar) + + +def test_idempotence(dtype): + a = np.random.random_sample((300, 200)).astype(dtype) + for k in range(20): + ar = round(a, k) + ar2 = round(a, k) + np.testing.assert_equal(ar, ar2) + + +# TODO: implement tie_to_even and round_to_nearest From 9e0c943ee8879b5eeb6a44bb5126c9804845e790 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Fri, 17 Dec 2021 12:19:25 -0500 Subject: [PATCH 03/24] use julia-style approximation --- numcodecs/tests/test_bitround.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/numcodecs/tests/test_bitround.py b/numcodecs/tests/test_bitround.py index b896295e..4a8f9603 100644 --- a/numcodecs/tests/test_bitround.py +++ b/numcodecs/tests/test_bitround.py @@ -59,12 +59,15 @@ def test_no_rounding(dtype): APPROX_KEEPBITS = {np.float32: 10} -# This does not pass at the default tolerance of allclose -# How is it different from Julia's ≈ operator? def test_approx_equal(dtype): a = np.random.random_sample((300, 200)).astype(dtype) ar = round(a, APPROX_KEEPBITS[dtype]) - np.testing.assert_allclose(a, ar) + # Mimic julia behavior - https://docs.julialang.org/en/v1/base/math/#Base.isapprox + rtol = np.sqrt(np.finfo(np.float32).eps) + # This gets us much closer but still failing for ~6% of the array + # It does pass if we add 1 to keepbits (11 instead of 10) + # Is there an off-by-one issue here? + np.testing.assert_allclose(a, ar, rtol=rtol) def test_idempotence(dtype): From 06a27d7994b688581725e83d4307e6f50134a1a4 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Tue, 4 Jan 2022 10:27:42 +0100 Subject: [PATCH 04/24] add range limits to bitround --- numcodecs/bitround.py | 4 ++++ numcodecs/tests/test_bitround.py | 3 +-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/numcodecs/bitround.py b/numcodecs/bitround.py index 9540f746..762171bf 100644 --- a/numcodecs/bitround.py +++ b/numcodecs/bitround.py @@ -9,9 +9,13 @@ class BitRound(Codec): codec_id = 'bitround' def __init__(self, keepbits: int): + if (keepbits < 0) or (keepbits > 23): + raise ValueError("keepbits must be between 0 and 23") self.keepbits = keepbits def encode(self, buf): + if self.keepbits==23: + return buf # TODO: figure out if we need to make a copy # Currently this appears to be overwriting the input buffer # Is that the right behavior? diff --git a/numcodecs/tests/test_bitround.py b/numcodecs/tests/test_bitround.py index 4a8f9603..d9e5cc1a 100644 --- a/numcodecs/tests/test_bitround.py +++ b/numcodecs/tests/test_bitround.py @@ -48,7 +48,6 @@ def test_round_minus_one_to_minus_one(dtype): np.testing.assert_equal(a, ar) -# This triggers a 'negative shift count' error in the codec def test_no_rounding(dtype): a = np.random.random_sample((300, 200)).astype(dtype) keepbits = MBITS[dtype] @@ -56,7 +55,7 @@ def test_no_rounding(dtype): np.testing.assert_equal(a, ar) -APPROX_KEEPBITS = {np.float32: 10} +APPROX_KEEPBITS = {np.float32: 11} def test_approx_equal(dtype): From 7c7dc7cc83db1ae5c9fd93ece863acedbbc8156f Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Tue, 4 Jan 2022 10:40:41 +0100 Subject: [PATCH 05/24] fix PEP error --- numcodecs/bitround.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numcodecs/bitround.py b/numcodecs/bitround.py index 762171bf..1bc04682 100644 --- a/numcodecs/bitround.py +++ b/numcodecs/bitround.py @@ -14,7 +14,7 @@ def __init__(self, keepbits: int): self.keepbits = keepbits def encode(self, buf): - if self.keepbits==23: + if self.keepbits == 23: return buf # TODO: figure out if we need to make a copy # Currently this appears to be overwriting the input buffer From 69263a5ba55da546eee7865d82a5dcdfd8055c1a Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Thu, 7 Apr 2022 14:34:31 -0400 Subject: [PATCH 06/24] Add docs and allow for multple precisions --- numcodecs/bitround.py | 60 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 48 insertions(+), 12 deletions(-) diff --git a/numcodecs/bitround.py b/numcodecs/bitround.py index 1bc04682..24d84ee6 100644 --- a/numcodecs/bitround.py +++ b/numcodecs/bitround.py @@ -4,32 +4,68 @@ from .abc import Codec from .compat import ensure_ndarray, ndarray_copy +max_bits = { + "float16": 10, + "float32": 23, + "float64": 52, +} +types = { + "float16": np.int16, + "float32": np.int32, + "float64": np.int64, +} +inverse = { + "int16": np.float16, + "int32": np.float32, + "int64": np.float64 +} + class BitRound(Codec): + """Real information content algorithm + + Drops data in the least significant part of the floating point mantissa, + leaving an array uch more amenable to compression. See + https://github.com/zarr-developers/numcodecs/issues/298 for discussion + and the original implementation in Julia referred to at + https://www.nature.com/articles/s43588-021-00156-2 + """ + codec_id = 'bitround' def __init__(self, keepbits: int): - if (keepbits < 0) or (keepbits > 23): - raise ValueError("keepbits must be between 0 and 23") + if keepbits < 0: + raise ValueError("keepbits must be zero or positive") self.keepbits = keepbits def encode(self, buf): - if self.keepbits == 23: - return buf - # TODO: figure out if we need to make a copy - # Currently this appears to be overwriting the input buffer - # Is that the right behavior? - a = ensure_ndarray(buf).view() - assert a.dtype == np.float32 - b = a.view(dtype=np.int32) + """Create int array by rounding floating data + + The itemsize will be preserved, but the output should be much more + compressible. + """ + a = ensure_ndarray(buf) + bits = max_bits[str(a.dtype)] + all_set = np.frombuffer(b"\xff" * a.dtype.itemsize, dtype=types[str(a.dtype)]) + if self.keepbits == bits: + return a + if self.keepbits > bits: + raise ValueError("Keepbits too large for given dtype") + if not a.dtype.kind == "f" or a.dtype.itemsize > 8: + raise TypeError("Only float arrays (16-64bit) can be bit-rounded") + b = a.view(types[str(a.dtype)]) maskbits = 23 - self.keepbits - mask = (0xFFFFFFFF >> maskbits) << maskbits + mask = (all_set >> maskbits) << maskbits half_quantum1 = (1 << (maskbits - 1)) - 1 b += ((b >> maskbits) & 1) + half_quantum1 b &= mask return b def decode(self, buf, out=None): - data = ensure_ndarray(buf).view(np.float32) + """Remake floats from ints + + As with ``encode``, preserves itemsize. + """ + data = ensure_ndarray(buf).view(inverse[str(buf.dtype)]) out = ndarray_copy(data, out) return out From 6df3b696fe0479866eba75ce30aed80278ffa707 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Thu, 7 Apr 2022 14:55:16 -0400 Subject: [PATCH 07/24] fix --- numcodecs/bitround.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/numcodecs/bitround.py b/numcodecs/bitround.py index 24d84ee6..8805722c 100644 --- a/numcodecs/bitround.py +++ b/numcodecs/bitround.py @@ -66,6 +66,7 @@ def decode(self, buf, out=None): As with ``encode``, preserves itemsize. """ - data = ensure_ndarray(buf).view(inverse[str(buf.dtype)]) + dt = buf.dtype if buf.dtype.kind == "f" else inverse[str(buf.dtype)] + data = ensure_ndarray(buf).view(dt) out = ndarray_copy(data, out) return out From b5abbbb0d42a10fae8b1165be66e3457a98e522e Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Thu, 7 Apr 2022 14:55:51 -0400 Subject: [PATCH 08/24] Update numcodecs/bitround.py Co-authored-by: Ryan Abernathey --- numcodecs/bitround.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/numcodecs/bitround.py b/numcodecs/bitround.py index 8805722c..d5ee174d 100644 --- a/numcodecs/bitround.py +++ b/numcodecs/bitround.py @@ -25,7 +25,8 @@ class BitRound(Codec): """Real information content algorithm Drops data in the least significant part of the floating point mantissa, - leaving an array uch more amenable to compression. See + leaving an array more amenable to compression. The number of bits to keep should + be determined by a information analysis of the data to be compressed. See https://github.com/zarr-developers/numcodecs/issues/298 for discussion and the original implementation in Julia referred to at https://www.nature.com/articles/s43588-021-00156-2 From 7d9846ac42a5e7c28bdd6b613e6a1b64980d811e Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Thu, 7 Apr 2022 15:17:30 -0400 Subject: [PATCH 09/24] Update numcodecs/bitround.py Co-authored-by: Ryan Abernathey --- numcodecs/bitround.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numcodecs/bitround.py b/numcodecs/bitround.py index d5ee174d..1e8a989c 100644 --- a/numcodecs/bitround.py +++ b/numcodecs/bitround.py @@ -24,7 +24,7 @@ class BitRound(Codec): """Real information content algorithm - Drops data in the least significant part of the floating point mantissa, + Drops a specified number of bits from the floating point mantissa, leaving an array more amenable to compression. The number of bits to keep should be determined by a information analysis of the data to be compressed. See https://github.com/zarr-developers/numcodecs/issues/298 for discussion From 76e9f6f4524e9c1bd5dc84b7ab54882b28180438 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Thu, 7 Apr 2022 15:17:40 -0400 Subject: [PATCH 10/24] Update numcodecs/bitround.py Co-authored-by: Ryan Abernathey --- numcodecs/bitround.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numcodecs/bitround.py b/numcodecs/bitround.py index 1e8a989c..4917412c 100644 --- a/numcodecs/bitround.py +++ b/numcodecs/bitround.py @@ -22,7 +22,7 @@ class BitRound(Codec): - """Real information content algorithm + """Floating-point bit rounding codec Drops a specified number of bits from the floating point mantissa, leaving an array more amenable to compression. The number of bits to keep should From 2f6207e757f1c1674e0b327ce6e3e67ba050b13e Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Thu, 7 Apr 2022 15:17:47 -0400 Subject: [PATCH 11/24] Update numcodecs/bitround.py Co-authored-by: Ryan Abernathey --- numcodecs/bitround.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numcodecs/bitround.py b/numcodecs/bitround.py index 4917412c..69794e1a 100644 --- a/numcodecs/bitround.py +++ b/numcodecs/bitround.py @@ -40,7 +40,7 @@ def __init__(self, keepbits: int): self.keepbits = keepbits def encode(self, buf): - """Create int array by rounding floating data + """Create int array by rounding floating-point data The itemsize will be preserved, but the output should be much more compressible. From 594202a79aa84523c3665031ad27b09376b37a25 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Fri, 8 Apr 2022 16:37:06 -0400 Subject: [PATCH 12/24] document keepbits --- numcodecs/bitround.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/numcodecs/bitround.py b/numcodecs/bitround.py index 69794e1a..fb46d508 100644 --- a/numcodecs/bitround.py +++ b/numcodecs/bitround.py @@ -30,6 +30,15 @@ class BitRound(Codec): https://github.com/zarr-developers/numcodecs/issues/298 for discussion and the original implementation in Julia referred to at https://www.nature.com/articles/s43588-021-00156-2 + + Parameters + ---------- + + keepbits: int + The number of bits of the mantissa to keep. The range allowed + depends on the dtype input data. If keepbits is + equal to the maximum allowed for the data type, this is equivalent + to no transform. """ codec_id = 'bitround' From c027936a4e7fa49cfefd573f9f2661428434d82b Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Fri, 8 Apr 2022 16:46:17 -0400 Subject: [PATCH 13/24] Update numcodecs/bitround.py --- numcodecs/bitround.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numcodecs/bitround.py b/numcodecs/bitround.py index fb46d508..8855b43c 100644 --- a/numcodecs/bitround.py +++ b/numcodecs/bitround.py @@ -26,7 +26,7 @@ class BitRound(Codec): Drops a specified number of bits from the floating point mantissa, leaving an array more amenable to compression. The number of bits to keep should - be determined by a information analysis of the data to be compressed. See + be determined by an information analysis of the data to be compressed. See https://github.com/zarr-developers/numcodecs/issues/298 for discussion and the original implementation in Julia referred to at https://www.nature.com/articles/s43588-021-00156-2 From 8b1fcfa9e14e3f39e37027cec23896da9934333e Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Mon, 11 Apr 2022 10:23:48 -0400 Subject: [PATCH 14/24] Test float64 --- numcodecs/bitround.py | 2 +- numcodecs/tests/test_bitround.py | 20 +++++++------------- 2 files changed, 8 insertions(+), 14 deletions(-) diff --git a/numcodecs/bitround.py b/numcodecs/bitround.py index 8855b43c..fadb6175 100644 --- a/numcodecs/bitround.py +++ b/numcodecs/bitround.py @@ -64,7 +64,7 @@ def encode(self, buf): if not a.dtype.kind == "f" or a.dtype.itemsize > 8: raise TypeError("Only float arrays (16-64bit) can be bit-rounded") b = a.view(types[str(a.dtype)]) - maskbits = 23 - self.keepbits + maskbits = bits - self.keepbits mask = (all_set >> maskbits) << maskbits half_quantum1 = (1 << (maskbits - 1)) - 1 b += ((b >> maskbits) & 1) + half_quantum1 diff --git a/numcodecs/tests/test_bitround.py b/numcodecs/tests/test_bitround.py index d9e5cc1a..2005a095 100644 --- a/numcodecs/tests/test_bitround.py +++ b/numcodecs/tests/test_bitround.py @@ -2,20 +2,17 @@ import pytest -from numcodecs.bitround import BitRound +from numcodecs.bitround import BitRound, max_bits # adapted from https://github.com/milankl/BitInformation.jl/blob/main/test/round_nearest.jl # TODO: add other dtypes -@pytest.fixture(params=[np.float32]) +@pytest.fixture(params=["float32", "float64"]) def dtype(request): return request.param -# number of mantissa bits for each dtype -MBITS = {np.float32: 23} - def round(data, keepbits): codec = BitRound(keepbits=keepbits) @@ -29,33 +26,33 @@ def test_round_zero_to_zero(dtype): # Don't understand Milan's original test: # How is it possible to have negative keepbits? # for k in range(-5, 50): - for k in range(0, MBITS[dtype]): + for k in range(0, max_bits[dtype]): ar = round(a, k) np.testing.assert_equal(a, ar) def test_round_one_to_one(dtype): a = np.ones((3, 2), dtype=dtype) - for k in range(0, MBITS[dtype]): + for k in range(0, max_bits[dtype]): ar = round(a, k) np.testing.assert_equal(a, ar) def test_round_minus_one_to_minus_one(dtype): a = -np.ones((3, 2), dtype=dtype) - for k in range(0, MBITS[dtype]): + for k in range(0, max_bits[dtype]): ar = round(a, k) np.testing.assert_equal(a, ar) def test_no_rounding(dtype): a = np.random.random_sample((300, 200)).astype(dtype) - keepbits = MBITS[dtype] + keepbits = max_bits[dtype] ar = round(a, keepbits) np.testing.assert_equal(a, ar) -APPROX_KEEPBITS = {np.float32: 11} +APPROX_KEEPBITS = {"float32": 11, "float64": 18} def test_approx_equal(dtype): @@ -75,6 +72,3 @@ def test_idempotence(dtype): ar = round(a, k) ar2 = round(a, k) np.testing.assert_equal(ar, ar2) - - -# TODO: implement tie_to_even and round_to_nearest From 4b2520934f1ff8836e607b8f13f78c9fb18133c5 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Mon, 11 Apr 2022 10:27:13 -0400 Subject: [PATCH 15/24] blank line --- numcodecs/tests/test_bitround.py | 1 - 1 file changed, 1 deletion(-) diff --git a/numcodecs/tests/test_bitround.py b/numcodecs/tests/test_bitround.py index 2005a095..81168c86 100644 --- a/numcodecs/tests/test_bitround.py +++ b/numcodecs/tests/test_bitround.py @@ -13,7 +13,6 @@ def dtype(request): return request.param - def round(data, keepbits): codec = BitRound(keepbits=keepbits) data = data.copy() # otherwise overwrites the input From e5829cbe90aff7e47dfc174f83e8355e45512c00 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Wed, 13 Apr 2022 10:53:31 -0400 Subject: [PATCH 16/24] Split out bitround functions for direct call --- numcodecs/bitround.py | 46 +++++++++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 19 deletions(-) diff --git a/numcodecs/bitround.py b/numcodecs/bitround.py index fadb6175..e28c79d7 100644 --- a/numcodecs/bitround.py +++ b/numcodecs/bitround.py @@ -55,28 +55,36 @@ def encode(self, buf): compressible. """ a = ensure_ndarray(buf) - bits = max_bits[str(a.dtype)] - all_set = np.frombuffer(b"\xff" * a.dtype.itemsize, dtype=types[str(a.dtype)]) - if self.keepbits == bits: - return a - if self.keepbits > bits: - raise ValueError("Keepbits too large for given dtype") - if not a.dtype.kind == "f" or a.dtype.itemsize > 8: - raise TypeError("Only float arrays (16-64bit) can be bit-rounded") - b = a.view(types[str(a.dtype)]) - maskbits = bits - self.keepbits - mask = (all_set >> maskbits) << maskbits - half_quantum1 = (1 << (maskbits - 1)) - 1 - b += ((b >> maskbits) & 1) + half_quantum1 - b &= mask - return b + return _bitround(a, self.keepbits) def decode(self, buf, out=None): """Remake floats from ints As with ``encode``, preserves itemsize. """ - dt = buf.dtype if buf.dtype.kind == "f" else inverse[str(buf.dtype)] - data = ensure_ndarray(buf).view(dt) - out = ndarray_copy(data, out) - return out + buf = ensure_ndarray(buf) + return _unround(buf, out) + +def _bitround(a, keepbits): + bits = max_bits[str(a.dtype)] + all_set = np.frombuffer(b"\xff" * a.dtype.itemsize, dtype=types[str(a.dtype)]) + if keepbits == bits: + return a + if keepbits > bits: + raise ValueError("Keepbits too large for given dtype") + if not a.dtype.kind == "f" or a.dtype.itemsize > 8: + raise TypeError("Only float arrays (16-64bit) can be bit-rounded") + b = a.view(types[str(a.dtype)]) + maskbits = bits - keepbits + mask = (all_set >> maskbits) << maskbits + half_quantum1 = (1 << (maskbits - 1)) - 1 + b += ((b >> maskbits) & 1) + half_quantum1 + b &= mask + return b + + +def _unround(buf, out): + dt = buf.dtype if buf.dtype.kind == "f" else inverse[str(buf.dtype)] + data = buf.view(dt) + out = ndarray_copy(data, out) + return out From 374cf9ed0755be907b336d7b40809de3d7b351d0 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Wed, 13 Apr 2022 11:08:05 -0400 Subject: [PATCH 17/24] Add docs and complete test coverage --- docs/bitround.rst | 11 +++++++++++ docs/index.rst | 1 + docs/release.rst | 3 +++ numcodecs/bitround.py | 5 +++-- numcodecs/tests/test_bitround.py | 7 +++++++ 5 files changed, 25 insertions(+), 2 deletions(-) create mode 100644 docs/bitround.rst diff --git a/docs/bitround.rst b/docs/bitround.rst new file mode 100644 index 00000000..3c98b5a7 --- /dev/null +++ b/docs/bitround.rst @@ -0,0 +1,11 @@ +Bitround +======== +.. automodule:: numcodecs.bitround + +.. autoclass:: BitRound + + .. autoattribute:: codec_id + .. automethod:: encode + .. automethod:: decode + .. automethod:: get_config + .. automethod:: from_config diff --git a/docs/index.rst b/docs/index.rst index e8cff917..a0152e42 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -71,6 +71,7 @@ Contents delta fixedscaleoffset quantize + bitround packbits categorize checksum32 diff --git a/docs/release.rst b/docs/release.rst index 63f4bbc1..811d18c6 100644 --- a/docs/release.rst +++ b/docs/release.rst @@ -8,6 +8,9 @@ Unreleased .. _release_0.9.1: +* Add bitround codec + By :user:`Ryan Abernathy ` and :user:`Martin Durant `, :issue:`298`. + 0.9.1 ----- diff --git a/numcodecs/bitround.py b/numcodecs/bitround.py index e28c79d7..79337557 100644 --- a/numcodecs/bitround.py +++ b/numcodecs/bitround.py @@ -65,15 +65,16 @@ def decode(self, buf, out=None): buf = ensure_ndarray(buf) return _unround(buf, out) + def _bitround(a, keepbits): + if not a.dtype.kind == "f" or a.dtype.itemsize > 8: + raise TypeError("Only float arrays (16-64bit) can be bit-rounded") bits = max_bits[str(a.dtype)] all_set = np.frombuffer(b"\xff" * a.dtype.itemsize, dtype=types[str(a.dtype)]) if keepbits == bits: return a if keepbits > bits: raise ValueError("Keepbits too large for given dtype") - if not a.dtype.kind == "f" or a.dtype.itemsize > 8: - raise TypeError("Only float arrays (16-64bit) can be bit-rounded") b = a.view(types[str(a.dtype)]) maskbits = bits - keepbits mask = (all_set >> maskbits) << maskbits diff --git a/numcodecs/tests/test_bitround.py b/numcodecs/tests/test_bitround.py index 81168c86..1e5fa701 100644 --- a/numcodecs/tests/test_bitround.py +++ b/numcodecs/tests/test_bitround.py @@ -71,3 +71,10 @@ def test_idempotence(dtype): ar = round(a, k) ar2 = round(a, k) np.testing.assert_equal(ar, ar2) + + +def test_errors(): + with pytest.raises(ValueError): + BitRound(keepbits=99).encode(np.array([0], dtype="float32")) + with pytest.raises(TypeError): + BitRound(keepbits=10).encode(np.array([0])) From 9622fe318777d769d170f650eaa304dd5c6e5b3b Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Wed, 13 Apr 2022 11:12:03 -0400 Subject: [PATCH 18/24] merge mislaid doc label --- docs/release.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/release.rst b/docs/release.rst index 7fc1982d..de0daf55 100644 --- a/docs/release.rst +++ b/docs/release.rst @@ -36,11 +36,11 @@ Unreleased By :user:`Dimitri Papadopoulos Orfanos `, :issue:`295`, :issue:`294`, :issue:`293`, and :issue:`292`. -.. _release_0.9.1: - * Add bitround codec By :user:`Ryan Abernathy ` and :user:`Martin Durant `, :issue:`298`. +.. _release_0.9.1: + 0.9.1 ----- From e62de8644e4f68390d7ef39cb1e92832f3cf9972 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Wed, 13 Apr 2022 16:40:27 -0400 Subject: [PATCH 19/24] use ndarray_like --- numcodecs/bitround.py | 51 ++++++++++++++++++------------------------- 1 file changed, 21 insertions(+), 30 deletions(-) diff --git a/numcodecs/bitround.py b/numcodecs/bitround.py index 79337557..9a7d888f 100644 --- a/numcodecs/bitround.py +++ b/numcodecs/bitround.py @@ -2,7 +2,7 @@ from .abc import Codec -from .compat import ensure_ndarray, ndarray_copy +from .compat import ensure_ndarray_like, ndarray_copy max_bits = { "float16": 10, @@ -54,38 +54,29 @@ def encode(self, buf): The itemsize will be preserved, but the output should be much more compressible. """ - a = ensure_ndarray(buf) - return _bitround(a, self.keepbits) + a = ensure_ndarray_like(buf) + if not a.dtype.kind == "f" or a.dtype.itemsize > 8: + raise TypeError("Only float arrays (16-64bit) can be bit-rounded") + bits = max_bits[str(a.dtype)] + all_set = np.frombuffer(b"\xff" * a.dtype.itemsize, dtype=types[str(a.dtype)]) + if keepbits == bits: + return a + if keepbits > bits: + raise ValueError("Keepbits too large for given dtype") + b = a.view(types[str(a.dtype)]) + maskbits = bits - keepbits + mask = (all_set >> maskbits) << maskbits + half_quantum1 = (1 << (maskbits - 1)) - 1 + b += ((b >> maskbits) & 1) + half_quantum1 + b &= mask + return b def decode(self, buf, out=None): """Remake floats from ints As with ``encode``, preserves itemsize. """ - buf = ensure_ndarray(buf) - return _unround(buf, out) - - -def _bitround(a, keepbits): - if not a.dtype.kind == "f" or a.dtype.itemsize > 8: - raise TypeError("Only float arrays (16-64bit) can be bit-rounded") - bits = max_bits[str(a.dtype)] - all_set = np.frombuffer(b"\xff" * a.dtype.itemsize, dtype=types[str(a.dtype)]) - if keepbits == bits: - return a - if keepbits > bits: - raise ValueError("Keepbits too large for given dtype") - b = a.view(types[str(a.dtype)]) - maskbits = bits - keepbits - mask = (all_set >> maskbits) << maskbits - half_quantum1 = (1 << (maskbits - 1)) - 1 - b += ((b >> maskbits) & 1) + half_quantum1 - b &= mask - return b - - -def _unround(buf, out): - dt = buf.dtype if buf.dtype.kind == "f" else inverse[str(buf.dtype)] - data = buf.view(dt) - out = ndarray_copy(data, out) - return out + buf = ensure_ndarray_like(buf) + dt = buf.dtype if buf.dtype.kind == "f" else inverse[str(buf.dtype)] + data = buf.view(dt) + return ndarray_copy(data, out) From 166e15ce2c162116c47ffa0272ce49ba793c45b1 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Wed, 13 Apr 2022 16:50:04 -0400 Subject: [PATCH 20/24] put self --- numcodecs/bitround.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/numcodecs/bitround.py b/numcodecs/bitround.py index 9a7d888f..9ff21ef9 100644 --- a/numcodecs/bitround.py +++ b/numcodecs/bitround.py @@ -59,12 +59,12 @@ def encode(self, buf): raise TypeError("Only float arrays (16-64bit) can be bit-rounded") bits = max_bits[str(a.dtype)] all_set = np.frombuffer(b"\xff" * a.dtype.itemsize, dtype=types[str(a.dtype)]) - if keepbits == bits: + if self.keepbits == bits: return a - if keepbits > bits: + if self.keepbits > bits: raise ValueError("Keepbits too large for given dtype") b = a.view(types[str(a.dtype)]) - maskbits = bits - keepbits + maskbits = bits - self.keepbits mask = (all_set >> maskbits) << maskbits half_quantum1 = (1 << (maskbits - 1)) - 1 b += ((b >> maskbits) & 1) + half_quantum1 From 775d3688db1212f3eb18fc650b859c8aaf1957f2 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Fri, 15 Apr 2022 12:43:56 -0400 Subject: [PATCH 21/24] suggested changes --- numcodecs/bitround.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/numcodecs/bitround.py b/numcodecs/bitround.py index 9ff21ef9..37702065 100644 --- a/numcodecs/bitround.py +++ b/numcodecs/bitround.py @@ -4,6 +4,9 @@ from .abc import Codec from .compat import ensure_ndarray_like, ndarray_copy +# The size in bits of the mantissa/significand for the various floating types +# You cannot keep more bits of data than you have available +# https://en.wikipedia.org/wiki/IEEE_754 max_bits = { "float16": 10, "float32": 23, @@ -14,11 +17,6 @@ "float32": np.int32, "float64": np.int64, } -inverse = { - "int16": np.float16, - "int32": np.float32, - "int64": np.float64 -} class BitRound(Codec): @@ -58,12 +56,14 @@ def encode(self, buf): if not a.dtype.kind == "f" or a.dtype.itemsize > 8: raise TypeError("Only float arrays (16-64bit) can be bit-rounded") bits = max_bits[str(a.dtype)] - all_set = np.frombuffer(b"\xff" * a.dtype.itemsize, dtype=types[str(a.dtype)]) + # cast float to int type of same width (preserve endianness) + a_int_dtype = np.dtype(a.dtype.str.replace("f", "i")) + all_set = np.array(-1, dtype=a_int_dtype) if self.keepbits == bits: return a if self.keepbits > bits: raise ValueError("Keepbits too large for given dtype") - b = a.view(types[str(a.dtype)]) + b = a.view(a_int_dtype) maskbits = bits - self.keepbits mask = (all_set >> maskbits) << maskbits half_quantum1 = (1 << (maskbits - 1)) - 1 @@ -77,6 +77,7 @@ def decode(self, buf, out=None): As with ``encode``, preserves itemsize. """ buf = ensure_ndarray_like(buf) - dt = buf.dtype if buf.dtype.kind == "f" else inverse[str(buf.dtype)] + # Cast back from `int` to `float` type (noop if a `float`ing type buffer is provided) + dt = np.dtype(buf.dtype.str.replace("i", "f")) data = buf.view(dt) return ndarray_copy(data, out) From 7deff68de8fd6851a644ec62d59f3c8bdfc82232 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Fri, 22 Apr 2022 15:09:23 -0400 Subject: [PATCH 22/24] Update numcodecs/bitround.py Co-authored-by: jakirkham --- numcodecs/bitround.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/numcodecs/bitround.py b/numcodecs/bitround.py index 37702065..b1f104ed 100644 --- a/numcodecs/bitround.py +++ b/numcodecs/bitround.py @@ -12,11 +12,6 @@ "float32": 23, "float64": 52, } -types = { - "float16": np.int16, - "float32": np.int32, - "float64": np.int64, -} class BitRound(Codec): From 66b7b1aab167083761176e833c9070f998010bc9 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Wed, 18 May 2022 13:50:20 -0400 Subject: [PATCH 23/24] Update numcodecs/bitround.py Co-authored-by: Ryan Abernathey --- numcodecs/bitround.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numcodecs/bitround.py b/numcodecs/bitround.py index b1f104ed..e1797212 100644 --- a/numcodecs/bitround.py +++ b/numcodecs/bitround.py @@ -22,7 +22,7 @@ class BitRound(Codec): be determined by an information analysis of the data to be compressed. See https://github.com/zarr-developers/numcodecs/issues/298 for discussion and the original implementation in Julia referred to at - https://www.nature.com/articles/s43588-021-00156-2 + https://github.com/milankl/BitInformation.jl Parameters ---------- From 56d9511d2c12594b66ac8ef4c585a976371f9727 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Wed, 18 May 2022 13:50:36 -0400 Subject: [PATCH 24/24] Update numcodecs/bitround.py Co-authored-by: Ryan Abernathey --- numcodecs/bitround.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/numcodecs/bitround.py b/numcodecs/bitround.py index e1797212..767e5e43 100644 --- a/numcodecs/bitround.py +++ b/numcodecs/bitround.py @@ -19,7 +19,9 @@ class BitRound(Codec): Drops a specified number of bits from the floating point mantissa, leaving an array more amenable to compression. The number of bits to keep should - be determined by an information analysis of the data to be compressed. See + be determined by an information analysis of the data to be compressed. + The approach is based on the paper by Klöwer et al. 2021 + (https://www.nature.com/articles/s43588-021-00156-2). See https://github.com/zarr-developers/numcodecs/issues/298 for discussion and the original implementation in Julia referred to at https://github.com/milankl/BitInformation.jl