Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions docs/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@ Improvements
~~~~~~~~~~~~
* Raise a custom `UnknownCodecError` when trying to retrieve an unavailable codec.
By :user:`Cas Wognum <cwognum>`.
* Add ``typesize`` argument to ``Blosc`` so that buffers passed to ``encode``
  can make use of that size information. zarr v3 currently models its Blosc codec
  as bytes-to-bytes but does retain the element-size information, so supplying it
  here allows for massive compression-ratio gains.
  By :user:`Ilan Gold <ilan-gold>`

Fixes
~~~~~
Expand Down
14 changes: 10 additions & 4 deletions numcodecs/blosc.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,7 @@ def _err_bad_cname(cname):
err_bad_cname = deprecated(_err_bad_cname)

def compress(source, char* cname, int clevel, int shuffle=SHUFFLE,
int blocksize=AUTOBLOCKS):
int blocksize=AUTOBLOCKS, typesize=None):
"""Compress data.

Parameters
Expand Down Expand Up @@ -279,7 +279,10 @@ def compress(source, char* cname, int clevel, int shuffle=SHUFFLE,
source_buffer = Buffer(source, PyBUF_ANY_CONTIGUOUS)
source_ptr = source_buffer.ptr
nbytes = source_buffer.nbytes
itemsize = source_buffer.itemsize
if typesize is not None:
itemsize = typesize
else:
itemsize = source_buffer.itemsize

# determine shuffle
if shuffle == AUTOSHUFFLE:
Expand Down Expand Up @@ -552,6 +555,8 @@ class Blosc(Codec):
blocksize : int
The requested size of the compressed blocks. If 0 (default), an automatic
blocksize will be used.
typesize : int, optional
The size in bytes of uncompressed array elements.

See Also
--------
Expand All @@ -566,7 +571,7 @@ class Blosc(Codec):
AUTOSHUFFLE = AUTOSHUFFLE
max_buffer_size = 2**31 - 1

def __init__(self, cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=AUTOBLOCKS):
def __init__(self, cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=AUTOBLOCKS, typesize=None):
self.cname = cname
if isinstance(cname, str):
self._cname_bytes = cname.encode('ascii')
Expand All @@ -575,10 +580,11 @@ class Blosc(Codec):
self.clevel = clevel
self.shuffle = shuffle
self.blocksize = blocksize
self.typesize = typesize

def encode(self, buf):
    """Compress *buf* with this codec's configured Blosc parameters.

    The input is first coerced to a contiguous ndarray (bounded by
    ``max_buffer_size``); the configured ``typesize`` is forwarded so
    shuffle filters can operate on the intended element width.
    """
    contiguous = ensure_contiguous_ndarray(buf, self.max_buffer_size)
    return compress(
        contiguous,
        self._cname_bytes,
        self.clevel,
        self.shuffle,
        self.blocksize,
        self.typesize,
    )

def decode(self, buf, out=None):
buf = ensure_contiguous_ndarray(buf, self.max_buffer_size)
Expand Down
12 changes: 12 additions & 0 deletions numcodecs/tests/test_blosc.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,3 +273,15 @@ def test_max_buffer_size():
_skip_null(codec)
assert codec.max_buffer_size == 2**31 - 1
check_max_buffer_size(codec)


def test_typesize_explicit():
    """An explicit ``typesize`` is recorded in the Blosc header.

    When a raw ``bytes`` buffer is encoded, Blosc infers a typesize of 1;
    passing ``typesize`` to the constructor overrides that inference.
    """
    data = np.arange(100, dtype="int64")
    element_size = data.itemsize
    raw = data.tobytes()

    codec_inferred = Blosc(shuffle=Blosc.SHUFFLE)
    codec_explicit = Blosc(shuffle=Blosc.SHUFFLE, typesize=element_size)

    encoded_inferred = codec_inferred.encode(raw)
    encoded_explicit = codec_explicit.encode(raw)

    # Byte 3 of the Blosc frame header stores the typesize used.
    assert encoded_inferred[3] == 1  # inferred from bytes input, i.e. 1
    assert encoded_explicit[3] == element_size  # taken from the constructor argument
Loading