From ce1957172815ddbe497209cb7a86cd1da33c418f Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 3 Mar 2025 14:13:54 +0100 Subject: [PATCH 1/7] (feat): `typesize` declared with constructor --- numcodecs/blosc.pyx | 12 ++++++++---- numcodecs/tests/test_blosc.py | 10 ++++++++++ 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/numcodecs/blosc.pyx b/numcodecs/blosc.pyx index 3caa3607..8eacb64a 100644 --- a/numcodecs/blosc.pyx +++ b/numcodecs/blosc.pyx @@ -235,7 +235,7 @@ def _err_bad_cname(cname): err_bad_cname = deprecated(_err_bad_cname) def compress(source, char* cname, int clevel, int shuffle=SHUFFLE, - int blocksize=AUTOBLOCKS): + int blocksize=AUTOBLOCKS, typesize=None): """Compress data. Parameters @@ -279,7 +279,10 @@ def compress(source, char* cname, int clevel, int shuffle=SHUFFLE, source_buffer = Buffer(source, PyBUF_ANY_CONTIGUOUS) source_ptr = source_buffer.ptr nbytes = source_buffer.nbytes - itemsize = source_buffer.itemsize + if typesize is not None: + itemsize = typesize + else: + itemsize = source_buffer.itemsize # determine shuffle if shuffle == AUTOSHUFFLE: @@ -566,7 +569,7 @@ class Blosc(Codec): AUTOSHUFFLE = AUTOSHUFFLE max_buffer_size = 2**31 - 1 - def __init__(self, cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=AUTOBLOCKS): + def __init__(self, cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=AUTOBLOCKS, typesize=None): self.cname = cname if isinstance(cname, str): self._cname_bytes = cname.encode('ascii') @@ -575,10 +578,11 @@ class Blosc(Codec): self.clevel = clevel self.shuffle = shuffle self.blocksize = blocksize + self.typesize = typesize def encode(self, buf): buf = ensure_contiguous_ndarray(buf, self.max_buffer_size) - return compress(buf, self._cname_bytes, self.clevel, self.shuffle, self.blocksize) + return compress(buf, self._cname_bytes, self.clevel, self.shuffle, self.blocksize, self.typesize) def decode(self, buf, out=None): buf = ensure_contiguous_ndarray(buf, self.max_buffer_size) diff --git 
a/numcodecs/tests/test_blosc.py b/numcodecs/tests/test_blosc.py index 0bc14010..3ec20b25 100644 --- a/numcodecs/tests/test_blosc.py +++ b/numcodecs/tests/test_blosc.py @@ -273,3 +273,13 @@ def test_max_buffer_size(): _skip_null(codec) assert codec.max_buffer_size == 2**31 - 1 check_max_buffer_size(codec) + +def test_typesize_explicit(): + arr = np.arange(100).astype("int64") + itemsize = arr.itemsize + codec_no_type_size = Blosc(shuffle=Blosc.SHUFFLE) + codec_itemsize = Blosc(shuffle=Blosc.SHUFFLE, typesize=itemsize) + encoded_without_itemsize = codec_no_type_size.encode(arr.tobytes()) + encoded_with_itemsize = codec_itemsize.encode(arr.tobytes()) + assert encoded_without_itemsize[3] == 1 # inferred from bytes i.e., 1 + assert encoded_with_itemsize[3] == itemsize # given as a constructor argument \ No newline at end of file From 1c10d3f092df04d0167abcc8a609843bd3473736 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 3 Mar 2025 14:18:27 +0100 Subject: [PATCH 2/7] (chore): add docstring --- numcodecs/blosc.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/numcodecs/blosc.pyx b/numcodecs/blosc.pyx index 8eacb64a..70491e6f 100644 --- a/numcodecs/blosc.pyx +++ b/numcodecs/blosc.pyx @@ -555,6 +555,8 @@ class Blosc(Codec): blocksize : int The requested size of the compressed blocks. If 0 (default), an automatic blocksize will be used. + typesize : int, optional + The size in bytes of uncompressed array elements. See Also -------- From bf6e4e5ad9e5534b1251838ca10d32af86b2a6ef Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 3 Mar 2025 14:21:16 +0100 Subject: [PATCH 3/7] (chore): relnote --- docs/release.rst | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/release.rst b/docs/release.rst index 5c8f83b9..3fcb36c5 100644 --- a/docs/release.rst +++ b/docs/release.rst @@ -63,7 +63,10 @@ Improvements Import errors caused by optional dependencies (ZFPY, MsgPack, CRC32C, and PCodec) are still silently caught. 
By :user:`David Stansby <dstansby>`, :issue:`550`. - +* Add ``typesize`` argument to ``Blosc`` so that buffers passed to ``encode`` + can use that information. Zarr v3 currently has its Blosc codec as bytes-to-bytes but does retain + the size information, so using it here allows for massive compression ratio gains. + By :user:`Ilan Gold <ilan-gold>`. 0.14.1 ------ From 04f775fe08262cdd56311a384c776303d3d78f77 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 3 Mar 2025 14:57:38 +0100 Subject: [PATCH 4/7] (chore): format --- numcodecs/tests/test_blosc.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/numcodecs/tests/test_blosc.py b/numcodecs/tests/test_blosc.py index 3ec20b25..aa58275f 100644 --- a/numcodecs/tests/test_blosc.py +++ b/numcodecs/tests/test_blosc.py @@ -274,6 +274,7 @@ def test_max_buffer_size(): assert codec.max_buffer_size == 2**31 - 1 check_max_buffer_size(codec) + def test_typesize_explicit(): arr = np.arange(100).astype("int64") itemsize = arr.itemsize @@ -281,5 +282,6 @@ def test_typesize_explicit(): codec_itemsize = Blosc(shuffle=Blosc.SHUFFLE, typesize=itemsize) encoded_without_itemsize = codec_no_type_size.encode(arr.tobytes()) encoded_with_itemsize = codec_itemsize.encode(arr.tobytes()) - assert encoded_without_itemsize[3] == 1 # inferred from bytes i.e., 1 - assert encoded_with_itemsize[3] == itemsize # given as a constructor argument \ No newline at end of file + # third byte encodes the `typesize` + assert encoded_without_itemsize[3] == 1 # inferred from bytes i.e., 1 + assert encoded_with_itemsize[3] == itemsize # given as a constructor argument From 22d7f00d16013bc242e8867356c2e19e55ea4c90 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 3 Mar 2025 15:50:01 +0100 Subject: [PATCH 5/7] (fix): add check for `typesize<1` --- numcodecs/blosc.pyx | 6 +++++- numcodecs/tests/test_blosc.py | 5 +++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/numcodecs/blosc.pyx b/numcodecs/blosc.pyx index 70491e6f..3082aad4
100644 --- a/numcodecs/blosc.pyx +++ b/numcodecs/blosc.pyx @@ -279,7 +279,9 @@ def compress(source, char* cname, int clevel, int shuffle=SHUFFLE, source_buffer = Buffer(source, PyBUF_ANY_CONTIGUOUS) source_ptr = source_buffer.ptr nbytes = source_buffer.nbytes - if typesize is not None: + if isinstance(typesize, int): + if typesize < 1: + raise ValueError(f"Cannot use typesize {typesize} less than 1.") itemsize = typesize else: itemsize = source_buffer.itemsize @@ -572,6 +574,8 @@ class Blosc(Codec): max_buffer_size = 2**31 - 1 def __init__(self, cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=AUTOBLOCKS, typesize=None): + if isinstance(typesize, int) and typesize < 1: + raise ValueError(f"Cannot use typesize {typesize} less than 1.") self.cname = cname if isinstance(cname, str): self._cname_bytes = cname.encode('ascii') diff --git a/numcodecs/tests/test_blosc.py b/numcodecs/tests/test_blosc.py index aa58275f..f1e66727 100644 --- a/numcodecs/tests/test_blosc.py +++ b/numcodecs/tests/test_blosc.py @@ -285,3 +285,8 @@ def test_typesize_explicit(): # third byte encodes the `typesize` assert encoded_without_itemsize[3] == 1 # inferred from bytes i.e., 1 assert encoded_with_itemsize[3] == itemsize # given as a constructor argument + + +def test_typesize_less_than_1(): + with pytest.raises(ValueError, match=r"Cannot use typesize"): + Blosc(shuffle=Blosc.SHUFFLE, typesize=0) From a421c5b47d4763860fa52383194f2d4f24a223df Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 3 Mar 2025 15:56:20 +0100 Subject: [PATCH 6/7] (chore): no cover for internal `ValueError` --- numcodecs/blosc.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numcodecs/blosc.pyx b/numcodecs/blosc.pyx index 3082aad4..c55d9004 100644 --- a/numcodecs/blosc.pyx +++ b/numcodecs/blosc.pyx @@ -280,7 +280,7 @@ def compress(source, char* cname, int clevel, int shuffle=SHUFFLE, source_ptr = source_buffer.ptr nbytes = source_buffer.nbytes if isinstance(typesize, int): - if typesize < 1: + if 
typesize < 1: # pragma: no cover raise ValueError(f"Cannot use typesize {typesize} less than 1.") itemsize = typesize else: From 7fe0dd85963886fcd84f1950989f6ab3ceba85d7 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 3 Mar 2025 15:57:58 +0100 Subject: [PATCH 7/7] (fix): test internal `compress` error --- numcodecs/blosc.pyx | 2 +- numcodecs/tests/test_blosc.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/numcodecs/blosc.pyx b/numcodecs/blosc.pyx index c55d9004..3082aad4 100644 --- a/numcodecs/blosc.pyx +++ b/numcodecs/blosc.pyx @@ -280,7 +280,7 @@ def compress(source, char* cname, int clevel, int shuffle=SHUFFLE, source_ptr = source_buffer.ptr nbytes = source_buffer.nbytes if isinstance(typesize, int): - if typesize < 1: # pragma: no cover + if typesize < 1: raise ValueError(f"Cannot use typesize {typesize} less than 1.") itemsize = typesize else: diff --git a/numcodecs/tests/test_blosc.py b/numcodecs/tests/test_blosc.py index f1e66727..46d3e3a4 100644 --- a/numcodecs/tests/test_blosc.py +++ b/numcodecs/tests/test_blosc.py @@ -290,3 +290,9 @@ def test_typesize_explicit(): def test_typesize_less_than_1(): with pytest.raises(ValueError, match=r"Cannot use typesize"): Blosc(shuffle=Blosc.SHUFFLE, typesize=0) + compressor = Blosc(shuffle=Blosc.SHUFFLE) + # not really something that should be done in practice, but good for testing. + compressor.typesize = 0 + arr = np.arange(100) + with pytest.raises(ValueError, match=r"Cannot use typesize"): + compressor.encode(arr.tobytes())