Skip to content
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changes/2962.fix.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Internally use the ``typesize`` constructor parameter of :class:`numcodecs.blosc.Blosc` to restore compression ratios to the levels of the v2 package.
4 changes: 2 additions & 2 deletions docs/user-guide/arrays.rst
Original file line number Diff line number Diff line change
Expand Up @@ -209,8 +209,8 @@ prints additional diagnostics, e.g.::
Serializer : BytesCodec(endian=<Endian.little: 'little'>)
Compressors : (BloscCodec(typesize=4, cname=<BloscCname.zstd: 'zstd'>, clevel=3, shuffle=<BloscShuffle.bitshuffle: 'bitshuffle'>, blocksize=0),)
No. bytes : 400000000 (381.5M)
No. bytes stored : 9696520
Storage ratio : 41.3
No. bytes stored : 3558573
Storage ratio : 112.4
Chunks Initialized : 100

.. note::
Expand Down
4 changes: 4 additions & 0 deletions src/zarr/codecs/blosc.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

import numcodecs
from numcodecs.blosc import Blosc
from packaging.version import Version

from zarr.abc.codec import BytesBytesCodec
from zarr.core.buffer.cpu import as_numpy_array_wrapper
Expand Down Expand Up @@ -163,6 +164,9 @@ def _blosc_codec(self) -> Blosc:
"shuffle": map_shuffle_str_to_int[self.shuffle],
"blocksize": self.blocksize,
}
# See https://github.com/zarr-developers/numcodecs/pull/713
if Version(numcodecs.__version__) >= Version("0.16.0"):
config_dict["typesize"] = self.typesize
return Blosc.from_config(config_dict)

async def _decode_single(
Expand Down
29 changes: 29 additions & 0 deletions tests/test_codecs/test_blosc.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
import json
import platform

import numcodecs
import numpy as np
import pytest
from packaging.version import Version

import zarr
from zarr.abc.store import Store
Expand Down Expand Up @@ -54,3 +57,29 @@ async def test_blosc_evolve(store: Store, dtype: str) -> None:
assert blosc_configuration_json["shuffle"] == "bitshuffle"
else:
assert blosc_configuration_json["shuffle"] == "shuffle"


async def test_typesize() -> None:
a = np.arange(1000000)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
a = np.arange(1000000)
a = np.arange(2**16, dtype=np.uint16)

As a thought, is it worth explicitly specifying the data type (and making the data smaller)? I don't know if it will fix the Windows issue, but I think it's worth doing anyway so there's a concrete byte size. Using an integer data type might also help with the Linux/Windows difference, because perhaps they have different floating-point implementations (although that's wild speculation on my part...)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

arange is default uint64 so I'll push something with that added.

codecs = [zarr.codecs.BytesCodec(), zarr.codecs.BloscCodec()]
z = zarr.array(a, chunks=(10000), codecs=codecs)
bytes = (await z.store.get("c/0", prototype=default_buffer_prototype())).to_bytes()
size = len(bytes)
msg = f"Blosc size mismatch. First 10 bytes: {bytes[:20]} and last 10 bytes: {bytes[-20:]}"
match (
Version(numcodecs.__version__) >= Version("0.16.0"),
platform.system() == "Windows",
Version(np.__version__) < Version("2.0.0"),
):
case True, True, True:
# See https://github.com/zarr-developers/zarr-python/pull/2962
# for why this condition is distinct. It's not clear
# if it's the Python version or the NumPy version with Windows.
expected_size = 400
assert size == 400, msg
case (True, True, False) | (True, False, _):
expected_size = 402
assert size == expected_size, msg
case False, _, _:
expected_size = 10216
assert size == expected_size, msg