diff --git a/docs/release.rst b/docs/release.rst index 14fc9423..59ed6d28 100644 --- a/docs/release.rst +++ b/docs/release.rst @@ -28,6 +28,10 @@ Improvements ~~~~~~~~~~~~ * Raise a custom `UnknownCodecError` when trying to retrieve an unavailable codec. By :user:`Cas Wognum `. +* Add ``typesize`` argument to ``Blosc`` so that buffers that are passed to ``encode`` + use that information. zarr v3 currently has its Blosc codec as bytes-to-bytes but does retain + the size information, so using it here allows for massive compression ratio gains. + By :user:`Ilan Gold `. Fixes ~~~~~ diff --git a/numcodecs/blosc.pyx b/numcodecs/blosc.pyx index 3caa3607..3082aad4 100644 --- a/numcodecs/blosc.pyx +++ b/numcodecs/blosc.pyx @@ -235,7 +235,7 @@ def _err_bad_cname(cname): err_bad_cname = deprecated(_err_bad_cname) def compress(source, char* cname, int clevel, int shuffle=SHUFFLE, - int blocksize=AUTOBLOCKS): + int blocksize=AUTOBLOCKS, typesize=None): """Compress data. Parameters @@ -279,7 +279,12 @@ def compress(source, char* cname, int clevel, int shuffle=SHUFFLE, source_buffer = Buffer(source, PyBUF_ANY_CONTIGUOUS) source_ptr = source_buffer.ptr nbytes = source_buffer.nbytes - itemsize = source_buffer.itemsize + if isinstance(typesize, int): + if typesize < 1: + raise ValueError(f"Cannot use typesize {typesize} less than 1.") + itemsize = typesize + else: + itemsize = source_buffer.itemsize # determine shuffle if shuffle == AUTOSHUFFLE: @@ -552,6 +557,8 @@ class Blosc(Codec): blocksize : int The requested size of the compressed blocks. If 0 (default), an automatic blocksize will be used. + typesize : int, optional + The size in bytes of uncompressed array elements. 
See Also -------- @@ -566,7 +573,9 @@ class Blosc(Codec): AUTOSHUFFLE = AUTOSHUFFLE max_buffer_size = 2**31 - 1 - def __init__(self, cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=AUTOBLOCKS): + def __init__(self, cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=AUTOBLOCKS, typesize=None): + if isinstance(typesize, int) and typesize < 1: + raise ValueError(f"Cannot use typesize {typesize} less than 1.") self.cname = cname if isinstance(cname, str): self._cname_bytes = cname.encode('ascii') @@ -575,10 +584,11 @@ class Blosc(Codec): self.clevel = clevel self.shuffle = shuffle self.blocksize = blocksize + self.typesize = typesize def encode(self, buf): buf = ensure_contiguous_ndarray(buf, self.max_buffer_size) - return compress(buf, self._cname_bytes, self.clevel, self.shuffle, self.blocksize) + return compress(buf, self._cname_bytes, self.clevel, self.shuffle, self.blocksize, self.typesize) def decode(self, buf, out=None): buf = ensure_contiguous_ndarray(buf, self.max_buffer_size) diff --git a/numcodecs/tests/test_blosc.py b/numcodecs/tests/test_blosc.py index 0bc14010..46d3e3a4 100644 --- a/numcodecs/tests/test_blosc.py +++ b/numcodecs/tests/test_blosc.py @@ -273,3 +273,26 @@ def test_max_buffer_size(): _skip_null(codec) assert codec.max_buffer_size == 2**31 - 1 check_max_buffer_size(codec) + + +def test_typesize_explicit(): + arr = np.arange(100).astype("int64") + itemsize = arr.itemsize + codec_no_type_size = Blosc(shuffle=Blosc.SHUFFLE) + codec_itemsize = Blosc(shuffle=Blosc.SHUFFLE, typesize=itemsize) + encoded_without_itemsize = codec_no_type_size.encode(arr.tobytes()) + encoded_with_itemsize = codec_itemsize.encode(arr.tobytes()) + # third byte encodes the `typesize` + assert encoded_without_itemsize[3] == 1 # inferred from bytes i.e., 1 + assert encoded_with_itemsize[3] == itemsize # given as a constructor argument + + +def test_typesize_less_than_1(): + with pytest.raises(ValueError, match=r"Cannot use typesize"): + Blosc(shuffle=Blosc.SHUFFLE, 
typesize=0) + compressor = Blosc(shuffle=Blosc.SHUFFLE) + # not really something that should be done in practice, but good for testing. + compressor.typesize = 0 + arr = np.arange(100) + with pytest.raises(ValueError, match=r"Cannot use typesize"): + compressor.encode(arr.tobytes())