zarr-developers · dstansby · Mar 4, 2025 · Mar 3, 2025 · Mar 3, 2025 · Mar 3, 2025
diff --git a/docs/release.rst b/docs/release.rst
@@ -28,6 +28,10 @@ Improvements
 ~~~~~~~~~~~~
 * Raise a custom `UnknownCodecError` when trying to retrieve an unavailable codec.
   By :user:`Cas Wognum <cwognum>`.
+* Add ``typesize`` argument to ``Blosc`` so that buffers passed to ``encode`` are compressed
+  using that element size.  Zarr v3 currently has its Blosc codec as bytes-to-bytes but does retain
+  the size information, so using it here allows for massive compression ratio gains.
+  By :user:`Ilan Gold <ilan-gold>`.
 
 Fixes
 ~~~~~

diff --git a/numcodecs/blosc.pyx b/numcodecs/blosc.pyx
@@ -235,7 +235,7 @@ def _err_bad_cname(cname):
 err_bad_cname = deprecated(_err_bad_cname)
 
 def compress(source, char* cname, int clevel, int shuffle=SHUFFLE,
-             int blocksize=AUTOBLOCKS):
+             int blocksize=AUTOBLOCKS, typesize=None):
     """Compress data.
 
     Parameters
@@ -279,7 +279,12 @@ def compress(source, char* cname, int clevel, int shuffle=SHUFFLE,
     source_buffer = Buffer(source, PyBUF_ANY_CONTIGUOUS)
     source_ptr = source_buffer.ptr
     nbytes = source_buffer.nbytes
-    itemsize = source_buffer.itemsize
+    if isinstance(typesize, int):
+        if typesize < 1:
+            raise ValueError(f"Cannot use typesize {typesize} less than 1.")
+        itemsize = typesize
+    else:
+        itemsize = source_buffer.itemsize
 
     # determine shuffle
     if shuffle == AUTOSHUFFLE:
@@ -552,6 +557,8 @@ class Blosc(Codec):
     blocksize : int
         The requested size of the compressed blocks.  If 0 (default), an automatic
         blocksize will be used.
+    typesize : int, optional
+        Size in bytes of uncompressed array elements; if None, the buffer's itemsize is used.
 
     See Also
     --------
@@ -566,7 +573,9 @@ class Blosc(Codec):
     AUTOSHUFFLE = AUTOSHUFFLE
     max_buffer_size = 2**31 - 1
 
-    def __init__(self, cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=AUTOBLOCKS):
+    def __init__(self, cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=AUTOBLOCKS, typesize=None):
+        if isinstance(typesize, int) and typesize < 1:
+            raise ValueError(f"Cannot use typesize {typesize} less than 1.")
         self.cname = cname
         if isinstance(cname, str):
             self._cname_bytes = cname.encode('ascii')
@@ -575,10 +584,11 @@ class Blosc(Codec):
         self.clevel = clevel
         self.shuffle = shuffle
         self.blocksize = blocksize
+        self.typesize = typesize
 
     def encode(self, buf):
         buf = ensure_contiguous_ndarray(buf, self.max_buffer_size)
-        return compress(buf, self._cname_bytes, self.clevel, self.shuffle, self.blocksize)
+        return compress(buf, self._cname_bytes, self.clevel, self.shuffle, self.blocksize, self.typesize)
 
     def decode(self, buf, out=None):
         buf = ensure_contiguous_ndarray(buf, self.max_buffer_size)

diff --git a/numcodecs/tests/test_blosc.py b/numcodecs/tests/test_blosc.py
@@ -273,3 +273,26 @@ def test_max_buffer_size():
         _skip_null(codec)
         assert codec.max_buffer_size == 2**31 - 1
         check_max_buffer_size(codec)
+
+
+def test_typesize_explicit():
+    arr = np.arange(100).astype("int64")
+    itemsize = arr.itemsize
+    codec_no_type_size = Blosc(shuffle=Blosc.SHUFFLE)
+    codec_itemsize = Blosc(shuffle=Blosc.SHUFFLE, typesize=itemsize)
+    encoded_without_itemsize = codec_no_type_size.encode(arr.tobytes())
+    encoded_with_itemsize = codec_itemsize.encode(arr.tobytes())
+    # fourth byte of the Blosc header (index 3) encodes the `typesize`
+    assert encoded_without_itemsize[3] == 1  # inferred from bytes i.e., 1
+    assert encoded_with_itemsize[3] == itemsize  # given as a constructor argument
+
+
+def test_typesize_less_than_1():
+    with pytest.raises(ValueError, match=r"Cannot use typesize"):
+        Blosc(shuffle=Blosc.SHUFFLE, typesize=0)
+    compressor = Blosc(shuffle=Blosc.SHUFFLE)
+    # not really something that should be done in practice, but good for testing.
+    compressor.typesize = 0
+    arr = np.arange(100)
+    with pytest.raises(ValueError, match=r"Cannot use typesize"):
+        compressor.encode(arr.tobytes())