From 0c453405ec4265756f34a57196fa761e28d2dd4c Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 7 Apr 2025 16:51:09 +0200 Subject: [PATCH 01/15] (fix): use `typesize` on `Blosc` codec --- src/zarr/codecs/blosc.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/zarr/codecs/blosc.py b/src/zarr/codecs/blosc.py index 2fcc041a6b..9a999e10d7 100644 --- a/src/zarr/codecs/blosc.py +++ b/src/zarr/codecs/blosc.py @@ -8,6 +8,7 @@ import numcodecs from numcodecs.blosc import Blosc +from packaging.version import Version from zarr.abc.codec import BytesBytesCodec from zarr.core.buffer.cpu import as_numpy_array_wrapper @@ -163,6 +164,9 @@ def _blosc_codec(self) -> Blosc: "shuffle": map_shuffle_str_to_int[self.shuffle], "blocksize": self.blocksize, } + # See https://github.com/zarr-developers/numcodecs/pull/713 + if Version(numcodecs.__version__) >= Version("0.16.0"): + config_dict["typesize"] = self.typesize return Blosc.from_config(config_dict) async def _decode_single( From 3074bf2906cf78341810b3009b79edb8a38782d1 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 7 Apr 2025 17:42:36 +0200 Subject: [PATCH 02/15] (chore): relnote --- changes/2962.fix.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 changes/2962.fix.rst diff --git a/changes/2962.fix.rst b/changes/2962.fix.rst new file mode 100644 index 0000000000..200bb21b4f --- /dev/null +++ b/changes/2962.fix.rst @@ -0,0 +1 @@ +Internally use `typesize` constructor parameter for :class:`numcodecs.Blosc` to improve compression ratios back to the v2-package levels. \ No newline at end of file From 386f09f1d070408a8e92405a24c25d034828abe8 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 7 Apr 2025 20:48:23 +0200 Subject: [PATCH 03/15] (fix): intersphinx --- changes/2962.fix.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/changes/2962.fix.rst b/changes/2962.fix.rst index 200bb21b4f..83d24b72ce 100644 --- a/changes/2962.fix.rst +++ b/changes/2962.fix.rst @@ -1 +1 @@ -Internally use `typesize` constructor parameter for :class:`numcodecs.Blosc` to improve compression ratios back to the v2-package levels. \ No newline at end of file +Internally use `typesize` constructor parameter for :class:`numcodecs.blosc.Blosc` to improve compression ratios back to the v2-package levels. \ No newline at end of file From e7c5b00572d027004f1d1db13017c0b132ea2d34 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 7 Apr 2025 20:57:04 +0200 Subject: [PATCH 04/15] (fix): look at that compression ratio! --- docs/user-guide/arrays.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/user-guide/arrays.rst b/docs/user-guide/arrays.rst index a62b2ea0fa..e6d1bcdc54 100644 --- a/docs/user-guide/arrays.rst +++ b/docs/user-guide/arrays.rst @@ -209,8 +209,8 @@ prints additional diagnostics, e.g.:: Serializer : BytesCodec(endian=) Compressors : (BloscCodec(typesize=4, cname=, clevel=3, shuffle=, blocksize=0),) No. bytes : 400000000 (381.5M) - No. bytes stored : 9696520 - Storage ratio : 41.3 + No. bytes stored : 3558573 + Storage ratio : 112.4 Chunks Initialized : 100 .. note:: From 5e1a59378029c909825825ff2dd268e4671aa9b0 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 9 Apr 2025 16:10:10 +0200 Subject: [PATCH 05/15] (fix): add test --- tests/test_codecs/test_blosc.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/test_codecs/test_blosc.py b/tests/test_codecs/test_blosc.py index c1c5c92329..c2cdfff4d6 100644 --- a/tests/test_codecs/test_blosc.py +++ b/tests/test_codecs/test_blosc.py @@ -54,3 +54,11 @@ async def test_blosc_evolve(store: Store, dtype: str) -> None: assert blosc_configuration_json["shuffle"] == "bitshuffle" else: assert blosc_configuration_json["shuffle"] == "shuffle" + + +async def test_typesize() -> None: + a = np.arange(1000000) + codecs = [zarr.codecs.BytesCodec(), zarr.codecs.BloscCodec()] + z3 = zarr.array(a, chunks=(10000), codecs=codecs) + v3_size = len(await z3.store.get("c/0", prototype=default_buffer_prototype())) + assert v3_size == 402 From 9fe74b800004f72998ac46068e3af77b04fb212b Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 9 Apr 2025 16:23:09 +0200 Subject: [PATCH 06/15] (fix): min version --- tests/test_codecs/test_blosc.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_codecs/test_blosc.py b/tests/test_codecs/test_blosc.py index c2cdfff4d6..353f7de5cc 100644 --- a/tests/test_codecs/test_blosc.py +++ b/tests/test_codecs/test_blosc.py @@ -1,7 +1,9 @@ import json +import numcodecs import numpy as np import pytest +from packaging.version import Version import zarr from zarr.abc.store import Store @@ -61,4 +63,4 @@ async def test_typesize() -> None: codecs = [zarr.codecs.BytesCodec(), zarr.codecs.BloscCodec()] z3 = zarr.array(a, chunks=(10000), codecs=codecs) v3_size = len(await z3.store.get("c/0", prototype=default_buffer_prototype())) - assert v3_size == 402 + assert v3_size == 402 if Version(numcodecs.__version__) >= Version("0.16.0") else 10216 From 6f0feca9a35e6e6704f818386318d23d7d8b7d64 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 9 Apr 2025 16:45:11 +0200 Subject: [PATCH 07/15] (fix): parenthesis? --- tests/test_codecs/test_blosc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_codecs/test_blosc.py b/tests/test_codecs/test_blosc.py index 353f7de5cc..dc91aaa2dd 100644 --- a/tests/test_codecs/test_blosc.py +++ b/tests/test_codecs/test_blosc.py @@ -63,4 +63,4 @@ async def test_typesize() -> None: codecs = [zarr.codecs.BytesCodec(), zarr.codecs.BloscCodec()] z3 = zarr.array(a, chunks=(10000), codecs=codecs) v3_size = len(await z3.store.get("c/0", prototype=default_buffer_prototype())) - assert v3_size == 402 if Version(numcodecs.__version__) >= Version("0.16.0") else 10216 + assert v3_size == (402 if Version(numcodecs.__version__) >= Version("0.16.0") else 10216) From 25ece766197cc728741e91b253002d9d4179ab21 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 9 Apr 2025 16:47:27 +0200 Subject: [PATCH 08/15] (fix): try assertion error --- tests/test_codecs/test_blosc.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/test_codecs/test_blosc.py b/tests/test_codecs/test_blosc.py index dc91aaa2dd..0414385265 100644 --- a/tests/test_codecs/test_blosc.py +++ b/tests/test_codecs/test_blosc.py @@ -61,6 +61,8 @@ async def test_blosc_evolve(store: Store, dtype: str) -> None: async def test_typesize() -> None: a = np.arange(1000000) codecs = [zarr.codecs.BytesCodec(), zarr.codecs.BloscCodec()] - z3 = zarr.array(a, chunks=(10000), codecs=codecs) - v3_size = len(await z3.store.get("c/0", prototype=default_buffer_prototype())) - assert v3_size == (402 if Version(numcodecs.__version__) >= Version("0.16.0") else 10216) + z = zarr.array(a, chunks=(10000), codecs=codecs) + size = len(await z.store.get("c/0", prototype=default_buffer_prototype())) + assert size == (402 if Version(numcodecs.__version__) >= Version("0.16.0") else 10216), ( + "blosc size mismatch, found {size}" + ) From 7bf71b47e562dcc5d13dfd2526cba0fc2c48fe12 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 9 Apr 2025 17:46:51 +0200 Subject: [PATCH 09/15] (fix): windows size --- tests/test_codecs/test_blosc.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tests/test_codecs/test_blosc.py b/tests/test_codecs/test_blosc.py index 0414385265..b192bd7b71 100644 --- a/tests/test_codecs/test_blosc.py +++ b/tests/test_codecs/test_blosc.py @@ -1,4 +1,5 @@ import json +import platform import numcodecs import numpy as np @@ -63,6 +64,11 @@ async def test_typesize() -> None: codecs = [zarr.codecs.BytesCodec(), zarr.codecs.BloscCodec()] z = zarr.array(a, chunks=(10000), codecs=codecs) size = len(await z.store.get("c/0", prototype=default_buffer_prototype())) - assert size == (402 if Version(numcodecs.__version__) >= Version("0.16.0") else 10216), ( - "blosc size mismatch, found {size}" - ) + match Version(numcodecs.__version__) >= Version("0.16.0"), platform.system() == "Windows": + case True, True: + expected_size = 400 + case True, False: + expected_size = 402 + case False, _: + expected_size = 10216 + assert size == expected_size, f"blosc size mismatch, found {size} but expected {expected_size}" From 3c9c6cc07c8de9ba31987c89e9fc0224a0dcbaf1 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 6 May 2025 18:27:33 +0200 Subject: [PATCH 10/15] (fix): add bytes print --- tests/test_codecs/test_blosc.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/test_codecs/test_blosc.py b/tests/test_codecs/test_blosc.py index b192bd7b71..c299b46013 100644 --- a/tests/test_codecs/test_blosc.py +++ b/tests/test_codecs/test_blosc.py @@ -63,7 +63,8 @@ async def test_typesize() -> None: a = np.arange(1000000) codecs = [zarr.codecs.BytesCodec(), zarr.codecs.BloscCodec()] z = zarr.array(a, chunks=(10000), codecs=codecs) - size = len(await z.store.get("c/0", prototype=default_buffer_prototype())) + bytes = (await z.store.get("c/0", prototype=default_buffer_prototype())).to_bytes() + size = len(bytes) match Version(numcodecs.__version__) >= Version("0.16.0"), platform.system() == "Windows": case True, True: expected_size = 400 @@ -71,4 +72,6 @@ async def test_typesize() -> None: expected_size = 402 case False, _: expected_size = 10216 - assert size == expected_size, f"blosc size mismatch, found {size} but expected {expected_size}" + assert size == expected_size, ( + f"blosc size mismatch, found {size} but expected {expected_size}. First 10 bytes: {bytes[:10]} and last 10 bytes: {bytes[-10:]}" + ) From 45693bf002cd7f7f581d3e8258ec07ad55d6a3dc Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 8 May 2025 13:59:01 +0200 Subject: [PATCH 11/15] (fix): aghh windows latest is correct, error for non latest --- tests/test_codecs/test_blosc.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/test_codecs/test_blosc.py b/tests/test_codecs/test_blosc.py index c299b46013..6d32e974f3 100644 --- a/tests/test_codecs/test_blosc.py +++ b/tests/test_codecs/test_blosc.py @@ -66,12 +66,10 @@ async def test_typesize() -> None: bytes = (await z.store.get("c/0", prototype=default_buffer_prototype())).to_bytes() size = len(bytes) match Version(numcodecs.__version__) >= Version("0.16.0"), platform.system() == "Windows": - case True, True: - expected_size = 400 - case True, False: + case True, _: expected_size = 402 case False, _: expected_size = 10216 assert size == expected_size, ( - f"blosc size mismatch, found {size} but expected {expected_size}. First 10 bytes: {bytes[:10]} and last 10 bytes: {bytes[-10:]}" + f"blosc size mismatch, found {size} but expected {expected_size}. First 10 bytes: {bytes[:20]} and last 10 bytes: {bytes[-20:]}" ) From 0fcc9f0c786c4ea05bc398257c9c7f89a6dc658d Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 8 May 2025 14:42:01 +0200 Subject: [PATCH 12/15] (fix): conditions for sizes --- tests/test_codecs/test_blosc.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/tests/test_codecs/test_blosc.py b/tests/test_codecs/test_blosc.py index 6d32e974f3..c43f8cea4b 100644 --- a/tests/test_codecs/test_blosc.py +++ b/tests/test_codecs/test_blosc.py @@ -65,11 +65,21 @@ async def test_typesize() -> None: z = zarr.array(a, chunks=(10000), codecs=codecs) bytes = (await z.store.get("c/0", prototype=default_buffer_prototype())).to_bytes() size = len(bytes) - match Version(numcodecs.__version__) >= Version("0.16.0"), platform.system() == "Windows": - case True, _: + msg = f"Blosc size mismatch. First 10 bytes: {bytes[:20]} and last 10 bytes: {bytes[-20:]}" + match ( + Version(numcodecs.__version__) >= Version("0.16.0"), + platform.system() == "Windows", + Version(np.__version__) < Version("2.0.0"), + ): + case True, True, True: + # See https://github.com/zarr-developers/zarr-python/pull/2962 + # for why this condition is distinct. It's not clear + # if it's the python version of the numpy version with windows. + expected_size = 400 + assert size == 400, msg + case (True, True, False) | (True, False, _): expected_size = 402 - case False, _: + assert size == expected_size, msg + case False, _, _: expected_size = 10216 - assert size == expected_size, ( - f"blosc size mismatch, found {size} but expected {expected_size}. First 10 bytes: {bytes[:20]} and last 10 bytes: {bytes[-20:]}" - ) + assert size == expected_size, msg From fa7092fd75d839149f412973a8e13c88a52dd5d6 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 8 May 2025 15:40:34 +0200 Subject: [PATCH 13/15] (fix): try clearer data --- tests/test_codecs/test_blosc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_codecs/test_blosc.py b/tests/test_codecs/test_blosc.py index c43f8cea4b..17edd01537 100644 --- a/tests/test_codecs/test_blosc.py +++ b/tests/test_codecs/test_blosc.py @@ -60,7 +60,7 @@ async def test_blosc_evolve(store: Store, dtype: str) -> None: async def test_typesize() -> None: - a = np.arange(1000000) + a = np.arange(1000000, dtype=np.uint64) codecs = [zarr.codecs.BytesCodec(), zarr.codecs.BloscCodec()] z = zarr.array(a, chunks=(10000), codecs=codecs) bytes = (await z.store.get("c/0", prototype=default_buffer_prototype())).to_bytes() From 5cfa80fbeee653f32a92502e511764c416a79fdd Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 8 May 2025 15:59:45 +0200 Subject: [PATCH 14/15] (fix): awesome! --- tests/test_codecs/test_blosc.py | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/tests/test_codecs/test_blosc.py b/tests/test_codecs/test_blosc.py index 17edd01537..cebd074f72 100644 --- a/tests/test_codecs/test_blosc.py +++ b/tests/test_codecs/test_blosc.py @@ -1,5 +1,4 @@ import json -import platform import numcodecs import numpy as np @@ -66,20 +65,9 @@ async def test_typesize() -> None: bytes = (await z.store.get("c/0", prototype=default_buffer_prototype())).to_bytes() size = len(bytes) msg = f"Blosc size mismatch. First 10 bytes: {bytes[:20]} and last 10 bytes: {bytes[-20:]}" - match ( - Version(numcodecs.__version__) >= Version("0.16.0"), - platform.system() == "Windows", - Version(np.__version__) < Version("2.0.0"), - ): - case True, True, True: - # See https://github.com/zarr-developers/zarr-python/pull/2962 - # for why this condition is distinct. It's not clear - # if it's the python version of the numpy version with windows. - expected_size = 400 - assert size == 400, msg - case (True, True, False) | (True, False, _): - expected_size = 402 - assert size == expected_size, msg - case False, _, _: - expected_size = 10216 - assert size == expected_size, msg + if Version(numcodecs.__version__) >= Version("0.16.0"): + expected_size = 402 + assert size == expected_size, msg + else: + expected_size = 10216 + assert size == expected_size, msg From e5653f43928d33f51c25a32d850ee12d64114b8b Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 8 May 2025 16:03:48 +0200 Subject: [PATCH 15/15] (fix): pre-commit --- tests/test_codecs/test_blosc.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/test_codecs/test_blosc.py b/tests/test_codecs/test_blosc.py index cebd074f72..6e6e9df383 100644 --- a/tests/test_codecs/test_blosc.py +++ b/tests/test_codecs/test_blosc.py @@ -62,9 +62,11 @@ async def test_typesize() -> None: a = np.arange(1000000, dtype=np.uint64) codecs = [zarr.codecs.BytesCodec(), zarr.codecs.BloscCodec()] z = zarr.array(a, chunks=(10000), codecs=codecs) - bytes = (await z.store.get("c/0", prototype=default_buffer_prototype())).to_bytes() + data = await z.store.get("c/0", prototype=default_buffer_prototype()) + assert data is not None + bytes = data.to_bytes() size = len(bytes) - msg = f"Blosc size mismatch. First 10 bytes: {bytes[:20]} and last 10 bytes: {bytes[-20:]}" + msg = f"Blosc size mismatch. First 10 bytes: {bytes[:20]!r} and last 10 bytes: {bytes[-20:]!r}" if Version(numcodecs.__version__) >= Version("0.16.0"): expected_size = 402 assert size == expected_size, msg