Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions docs/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@ Improvements
~~~~~~~~~~~~
* Raise a custom `UnknownCodecError` when trying to retrieve an unavailable codec.
By :user:`Cas Wognum <cwognum>`.
* Add a ``typesize`` argument to ``Blosc`` so that buffers passed to ``encode``
are compressed using that element size. zarr v3 currently has its Blosc codec as bytes-to-bytes but does retain
the size information, so using it here allows for massive compression ratio gains.
By :user:`Ilan Gold <ilan-gold>`.

Fixes
~~~~~
Expand Down
18 changes: 14 additions & 4 deletions numcodecs/blosc.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,7 @@ def _err_bad_cname(cname):
err_bad_cname = deprecated(_err_bad_cname)

def compress(source, char* cname, int clevel, int shuffle=SHUFFLE,
int blocksize=AUTOBLOCKS):
int blocksize=AUTOBLOCKS, typesize=None):
"""Compress data.

Parameters
Expand Down Expand Up @@ -279,7 +279,12 @@ def compress(source, char* cname, int clevel, int shuffle=SHUFFLE,
source_buffer = Buffer(source, PyBUF_ANY_CONTIGUOUS)
source_ptr = source_buffer.ptr
nbytes = source_buffer.nbytes
itemsize = source_buffer.itemsize
if isinstance(typesize, int):
if typesize < 1:
raise ValueError(f"Cannot use typesize {typesize} less than 1.")
itemsize = typesize
else:
itemsize = source_buffer.itemsize

# determine shuffle
if shuffle == AUTOSHUFFLE:
Expand Down Expand Up @@ -552,6 +557,8 @@ class Blosc(Codec):
blocksize : int
The requested size of the compressed blocks. If 0 (default), an automatic
blocksize will be used.
typesize : int, optional
The size in bytes of uncompressed array elements.

See Also
--------
Expand All @@ -566,7 +573,9 @@ class Blosc(Codec):
AUTOSHUFFLE = AUTOSHUFFLE
max_buffer_size = 2**31 - 1

def __init__(self, cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=AUTOBLOCKS):
def __init__(self, cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=AUTOBLOCKS, typesize=None):
if isinstance(typesize, int) and typesize < 1:
raise ValueError(f"Cannot use typesize {typesize} less than 1.")
self.cname = cname
if isinstance(cname, str):
self._cname_bytes = cname.encode('ascii')
Expand All @@ -575,10 +584,11 @@ class Blosc(Codec):
self.clevel = clevel
self.shuffle = shuffle
self.blocksize = blocksize
self.typesize = typesize

def encode(self, buf):
buf = ensure_contiguous_ndarray(buf, self.max_buffer_size)
return compress(buf, self._cname_bytes, self.clevel, self.shuffle, self.blocksize)
return compress(buf, self._cname_bytes, self.clevel, self.shuffle, self.blocksize, self.typesize)

def decode(self, buf, out=None):
buf = ensure_contiguous_ndarray(buf, self.max_buffer_size)
Expand Down
23 changes: 23 additions & 0 deletions numcodecs/tests/test_blosc.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,3 +273,26 @@ def test_max_buffer_size():
_skip_null(codec)
assert codec.max_buffer_size == 2**31 - 1
check_max_buffer_size(codec)


def test_typesize_explicit():
    """Check that an explicit ``typesize`` ends up in the encoded output.

    The same raw bytes are encoded twice: once with a default codec and once
    with a codec that was given the array's item size as ``typesize``.
    """
    values = np.arange(100).astype("int64")
    expected_typesize = values.itemsize

    default_codec = Blosc(shuffle=Blosc.SHUFFLE)
    explicit_codec = Blosc(shuffle=Blosc.SHUFFLE, typesize=expected_typesize)

    default_output = default_codec.encode(values.tobytes())
    explicit_output = explicit_codec.encode(values.tobytes())

    # Index 3 (i.e. the fourth byte) of the encoded output holds the typesize.
    # A plain bytes buffer has an item size of 1, so that is what gets
    # recorded when no typesize is supplied.
    assert default_output[3] == 1
    # With the constructor argument, the supplied value is recorded instead.
    assert explicit_output[3] == expected_typesize


def test_typesize_less_than_1():
    """Check that a ``typesize`` below 1 is rejected at construction and at encode time."""
    error_pattern = r"Cannot use typesize"

    # The constructor refuses an invalid typesize outright.
    with pytest.raises(ValueError, match=error_pattern):
        Blosc(shuffle=Blosc.SHUFFLE, typesize=0)

    # Setting the attribute after construction sidesteps the constructor's
    # check. Not something to do in practice, but it lets us exercise the
    # validation that runs when encoding.
    codec = Blosc(shuffle=Blosc.SHUFFLE)
    codec.typesize = 0
    numbers = np.arange(100)
    with pytest.raises(ValueError, match=error_pattern):
        codec.encode(numbers.tobytes())
Loading