Merged
Changes from 10 commits
45 commits
1fa42d9
add default compressor to config
brokkoli71 Nov 6, 2024
02053e9
modify _default_compressor to _default_filters_and_compressor
brokkoli71 Nov 6, 2024
6ac38ea
fix test_metadata_to_dict
brokkoli71 Nov 6, 2024
9507e19
wip debugging
brokkoli71 Nov 6, 2024
3727b4a
Merge branch 'master' into default-compressor
brokkoli71 Nov 13, 2024
f93ced2
format
brokkoli71 Nov 13, 2024
07590ca
fix v2 decode string dtype
brokkoli71 Nov 13, 2024
4e2a3bc
fix config default tests
brokkoli71 Nov 13, 2024
0fc7b23
format
brokkoli71 Nov 13, 2024
35849c7
Merge branch 'main' into default-compressor
brokkoli71 Nov 17, 2024
8ec16e8
Update src/zarr/codecs/_v2.py
normanrz Dec 6, 2024
d6dc146
rename v2_dtype_kind_to_default_filters_and_compressor to v2_default_…
brokkoli71 Dec 11, 2024
78ab221
merge main into default-compressor
brokkoli71 Dec 11, 2024
15577ae
recover test_v2.py
brokkoli71 Dec 11, 2024
67010ce
incorporate feedback
brokkoli71 Dec 11, 2024
f6b98c3
incorporate feedback
brokkoli71 Dec 11, 2024
fcbae8b
fix mypy
brokkoli71 Dec 11, 2024
75a858d
Merge remote-tracking branch 'origin/default-compressor' into default…
brokkoli71 Dec 11, 2024
a77fb0d
allow only one default compressor
brokkoli71 Dec 11, 2024
d11bf30
Merge remote-tracking branch 'refs/remotes/upstream/main' into defaul…
brokkoli71 Dec 14, 2024
876e67d
put `v2_default_compressor` under `array`
brokkoli71 Dec 14, 2024
12dfaf4
deprecate zarr.storage.default_compressor
brokkoli71 Dec 14, 2024
6954b60
test v3_default_codecs
brokkoli71 Dec 14, 2024
80dfc40
use v3_default_codecs
brokkoli71 Dec 14, 2024
6001e93
fix tests that expected codecs==["bytes"]
brokkoli71 Dec 14, 2024
ff76617
fix test_default_codecs
brokkoli71 Dec 14, 2024
f04e0e6
fail-fast: false
brokkoli71 Dec 14, 2024
f63bb67
fix string codecs for np1.25
brokkoli71 Dec 14, 2024
00e241e
format
brokkoli71 Dec 14, 2024
58406c8
add docstrings to create in asynchronous.py and array.py
brokkoli71 Dec 18, 2024
fc09989
add docstrings to creation in group.py
brokkoli71 Dec 18, 2024
eed4427
Merge branch 'main' into default-compressor
brokkoli71 Dec 18, 2024
c62aff5
Apply suggestions from code review
brokkoli71 Dec 18, 2024
48c7448
apply suggestions from review
brokkoli71 Dec 18, 2024
083c4cb
correct code double backticks
brokkoli71 Dec 18, 2024
500bc7b
correct attribute links in docstring
brokkoli71 Dec 18, 2024
cdf5542
link zarr.core.config in docstrings
brokkoli71 Dec 18, 2024
43307b3
Merge branch 'main' into default-compressor
brokkoli71 Dec 18, 2024
390c435
improve docstring readability
brokkoli71 Dec 18, 2024
35e35c4
correct config docstring
brokkoli71 Dec 18, 2024
92de85c
correct config docstring
brokkoli71 Dec 18, 2024
6fd3f25
improve config docstring
brokkoli71 Dec 18, 2024
ea228ca
Merge branch 'main' into default-compressor
normanrz Dec 19, 2024
3933c05
Merge branch 'main' into default-compressor
normanrz Dec 19, 2024
9ac82d1
Merge branch 'main' into default-compressor
normanrz Dec 19, 2024
9 changes: 8 additions & 1 deletion src/zarr/codecs/_v2.py
@@ -5,6 +5,7 @@
from typing import TYPE_CHECKING

import numcodecs
import numpy as np
from numcodecs.compat import ensure_ndarray_like

from zarr.abc.codec import ArrayBytesCodec
@@ -43,10 +44,16 @@ async def _decode_single(

# view as numpy array with correct dtype
chunk = ensure_ndarray_like(chunk)
print(chunk)
print(chunk.dtype)
# special case object dtype, because incorrect handling can lead to
# segfaults and other bad things happening
if chunk_spec.dtype != object:
chunk = chunk.view(chunk_spec.dtype)
try:
chunk = chunk.view(chunk_spec.dtype)
except TypeError:
chunk = np.array(chunk).astype(chunk_spec.dtype)

elif chunk.dtype != object:
# If we end up here, someone must have hacked around with the filters.
# We cannot deal with object arrays unless there is an object
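Note: the `try`/`except TypeError` above covers chunks that decode to object arrays (e.g. after a vlen-bytes filter), which `ndarray.view` cannot reinterpret. A minimal standalone sketch of the same fallback, assuming a bytes target dtype:

```python
import numpy as np

# A vlen-bytes filter hands the decoder an object array of Python bytes.
decoded = np.array([b"a", b"bb", b"ccc"], dtype=object)
target = np.dtype("|S3")

try:
    # Object arrays cannot be reinterpreted in place with view().
    chunk = decoded.view(target)
except TypeError:
    # Fall back to a copying conversion, mirroring _decode_single above.
    chunk = np.array(decoded).astype(target)

print(chunk)  # [b'a' b'bb' b'ccc'], now dtype "|S3"
```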
8 changes: 0 additions & 8 deletions src/zarr/core/array.py
@@ -493,14 +493,6 @@ async def create(
order=order,
)
elif zarr_format == 2:
if dtype is str or dtype == "str":
# another special case: zarr v2 added the vlen-utf8 codec
vlen_codec: dict[str, JSON] = {"id": "vlen-utf8"}
if filters and not any(x["id"] == "vlen-utf8" for x in filters):
filters = list(filters) + [vlen_codec]
else:
filters = [vlen_codec]

if codecs is not None:
raise ValueError(
"codecs cannot be used for arrays with version 2. Use filters and compressor instead."
5 changes: 5 additions & 0 deletions src/zarr/core/config.py
@@ -64,6 +64,11 @@ def reset(self) -> None:
},
"buffer": "zarr.core.buffer.cpu.Buffer",
"ndbuffer": "zarr.core.buffer.cpu.NDBuffer",
"v2_dtype_kind_to_default_filters_and_compressor": {
"biufcmM": ["zstd"],
"U": ["vlen-utf8"],
"OSV": ["vlen-bytes"],
},
}
],
)
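Note: each key in `v2_dtype_kind_to_default_filters_and_compressor` is a string of numpy `dtype.kind` characters, and each value is a list of numcodecs codec ids applied as filters. The setting can be overridden at runtime with `zarr.config.set`, as the tests below do — a minimal sketch assuming the defaults shown here:

```python
import zarr

# Override the v2 default mapping for this block only; keys are numpy
# dtype.kind characters, values are lists of numcodecs codec ids.
with zarr.config.set(
    {
        "v2_dtype_kind_to_default_filters_and_compressor": {
            "biufcmM": ["zstd"],
        },
    }
):
    arr = zarr.create(shape=(3,), zarr_format=2, dtype="float64")
    print(arr.metadata.filters[0].codec_id)  # "zstd"
```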
27 changes: 26 additions & 1 deletion src/zarr/core/metadata/v2.py
@@ -4,7 +4,7 @@
from collections.abc import Iterable
from enum import Enum
from functools import cached_property
from typing import TYPE_CHECKING, TypedDict, cast
from typing import TYPE_CHECKING, Any, TypedDict, cast

from zarr.abc.metadata import Metadata

@@ -71,6 +71,14 @@ def __init__(
shape_parsed = parse_shapelike(shape)
dtype_parsed = parse_dtype(dtype)
chunks_parsed = parse_shapelike(chunks)
if not filters and not compressor:
filters, compressor = _default_filters_and_compressor(dtype_parsed)
if dtype is str or dtype == "str":
vlen_codec: dict[str, JSON] = {"id": "vlen-utf8"}
if filters and not any(x["id"] == "vlen-utf8" for x in filters):
filters = list(filters) + [vlen_codec]
else:
filters = [vlen_codec]
compressor_parsed = parse_compressor(compressor)
order_parsed = parse_indexing_order(order)
dimension_separator_parsed = parse_separator(dimension_separator)
@@ -326,3 +334,20 @@ def _default_fill_value(dtype: np.dtype[Any]) -> Any:
return ""
else:
return dtype.type(0)


def _default_filters_and_compressor(
dtype: np.dtype[Any],
) -> tuple[list[dict[str, str]], dict[str, str] | None]:
"""Get the default filters and compressor for a dtype.

The config contains a mapping from numpy dtype kind to the default compressor.
https://numpy.org/doc/2.1/reference/generated/numpy.dtype.kind.html
"""
dtype_kind_to_default_compressor = config.get("v2_dtype_kind_to_default_filters_and_compressor")
for dtype_kinds, filters_and_compressor in dtype_kind_to_default_compressor.items():
if dtype.kind in dtype_kinds:
filters = [{"id": f} for f in filters_and_compressor]
compressor = None
return filters, compressor
return [], None
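Note: the lookup scans the configured mapping and returns filter dicts for the first key that contains the dtype's `kind` character; dtypes with no matching kind fall through to no filters and no compressor. A standalone sketch of the same resolution logic, using the defaults from `config.py`:

```python
import numpy as np

# Default mapping from config.py: dtype.kind characters -> numcodecs ids.
mapping = {
    "biufcmM": ["zstd"],    # bool, (u)int, float, complex, timedelta, datetime
    "U": ["vlen-utf8"],     # fixed-width unicode strings
    "OSV": ["vlen-bytes"],  # object, bytes, void
}

def default_filters(dtype: np.dtype) -> list[dict[str, str]]:
    for kinds, codec_ids in mapping.items():
        if dtype.kind in kinds:
            return [{"id": codec_id} for codec_id in codec_ids]
    return []

print(default_filters(np.dtype("float64")))  # [{'id': 'zstd'}]
print(default_filters(np.dtype("U10")))      # [{'id': 'vlen-utf8'}]
```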
2 changes: 2 additions & 0 deletions tests/test_array.py
@@ -4,6 +4,7 @@
from itertools import accumulate
from typing import Any, Literal

import numcodecs
import numpy as np
import pytest

@@ -455,6 +456,7 @@ def test_info_v2(self) -> None:
_read_only=False,
_store_type="MemoryStore",
_count_bytes=128,
_filters=(numcodecs.Zstd(),),
)
assert result == expected

5 changes: 5 additions & 0 deletions tests/test_config.py
@@ -63,6 +63,11 @@ def test_config_defaults_set() -> None:
"vlen-utf8": "zarr.codecs.vlen_utf8.VLenUTF8Codec",
"vlen-bytes": "zarr.codecs.vlen_utf8.VLenBytesCodec",
},
"v2_dtype_kind_to_default_filters_and_compressor": {
"biufcmM": ["zstd"],
"U": ["vlen-utf8"],
"OSV": ["vlen-bytes"],
},
}
]
assert config.get("array.order") == "C"
11 changes: 10 additions & 1 deletion tests/test_metadata/test_v2.py
@@ -11,7 +11,7 @@
from zarr.core.buffer import cpu
from zarr.core.group import ConsolidatedMetadata, GroupMetadata
from zarr.core.metadata import ArrayV2Metadata
from zarr.core.metadata.v2 import parse_zarr_format
from zarr.core.metadata.v2 import _default_filters_and_compressor, parse_zarr_format

if TYPE_CHECKING:
from typing import Any
@@ -77,6 +77,15 @@ def test_metadata_to_dict(
assert observed["dimension_separator"] == expected_dimension_sep
observed.pop("dimension_separator")

if not filters and not compressor:
assert observed["filters"], observed["compressor"] == _default_filters_and_compressor(
np.dtype(data_type)
)
observed.pop("filters")
observed.pop("compressor")
expected.pop("filters")
expected.pop("compressor")

assert observed == expected


116 changes: 82 additions & 34 deletions tests/test_v2.py
@@ -11,7 +11,7 @@
import zarr
import zarr.core.buffer
import zarr.storage
from zarr import Array
from zarr import Array, config
from zarr.storage import MemoryStore, StorePath


@@ -82,47 +82,76 @@ def test_codec_pipeline() -> None:

@pytest.mark.parametrize("dtype", ["|S", "|V"])
async def test_v2_encode_decode(dtype):
store = zarr.storage.MemoryStore()
g = zarr.group(store=store, zarr_format=2)
g.create_array(
name="foo",
shape=(3,),
chunks=(3,),
dtype=dtype,
fill_value=b"X",
)

result = await store.get("foo/.zarray", zarr.core.buffer.default_buffer_prototype())
assert result is not None

serialized = json.loads(result.to_bytes())
expected = {
"chunks": [3],
"compressor": None,
"dtype": f"{dtype}0",
"fill_value": "WA==",
"filters": None,
"order": "C",
"shape": [3],
"zarr_format": 2,
"dimension_separator": ".",
}
assert serialized == expected

data = zarr.open_array(store=store, path="foo")[:]
expected = np.full((3,), b"X", dtype=dtype)
np.testing.assert_equal(data, expected)
with config.set(
{
"v2_dtype_kind_to_default_filters_and_compressor": {
"SV": ["vlen-bytes"],
},
}
):
store = zarr.storage.MemoryStore()
g = zarr.group(store=store, zarr_format=2)
g.create_array(
name="foo",
shape=(3,),
chunks=(3,),
dtype=dtype,
fill_value=b"X",
)

result = await store.get("foo/.zarray", zarr.core.buffer.default_buffer_prototype())
assert result is not None

serialized = json.loads(result.to_bytes())
expected = {
"chunks": [3],
"compressor": None,
"dtype": f"{dtype}0",
"fill_value": "WA==",
"filters": [{"id": "vlen-bytes"}],
"order": "C",
"shape": [3],
"zarr_format": 2,
"dimension_separator": ".",
}
assert serialized == expected

data = zarr.open_array(store=store, path="foo")[:]
expected = np.full((3,), b"X", dtype=dtype)
np.testing.assert_equal(data, expected)


@pytest.mark.parametrize("dtype_value", [["|S", b"Y"], ["|U", "Y"], ["O", b"Y"]])
def test_v2_encode_decode_with_data(dtype_value):
dtype, value = dtype_value
with config.set(
{
"v2_dtype_kind_to_default_filters_and_compressor": {
"U": ["vlen-utf8"],
"OSV": ["vlen-bytes"],
},
}
):
expected = np.full((3,), value, dtype=dtype)
a = zarr.create(
shape=(3,),
zarr_format=2,
dtype=dtype,
)
a[:] = expected
data = a[:]
np.testing.assert_equal(data, expected)


@pytest.mark.parametrize("dtype", [str, "str"])
async def test_create_dtype_str(dtype: Any) -> None:
arr = zarr.create(shape=3, dtype=dtype, zarr_format=2)
assert arr.dtype.kind == "O"
assert arr.metadata.to_dict()["dtype"] == "|O"
assert arr.metadata.filters == (numcodecs.vlen.VLenUTF8(),)
arr[:] = ["a", "bb", "ccc"]
assert arr.metadata.filters == (numcodecs.vlen.VLenBytes(),)
arr[:] = [b"a", b"bb", b"ccc"]
result = arr[:]
np.testing.assert_array_equal(result, np.array(["a", "bb", "ccc"], dtype="object"))
np.testing.assert_array_equal(result, np.array([b"a", b"bb", b"ccc"], dtype="object"))


@pytest.mark.parametrize("filters", [[], [numcodecs.Delta(dtype="<i4")], [numcodecs.Zlib(level=2)]])
@@ -132,3 +161,22 @@ def test_v2_filters_codecs(filters: Any) -> None:
arr[:] = array_fixture
result = arr[:]
np.testing.assert_array_equal(result, array_fixture)


@pytest.mark.parametrize(
"dtype_expected",
[["b", "zstd"], ["i", "zstd"], ["f", "zstd"], ["|S1", "vlen-bytes"], ["|U1", "vlen-utf8"]],
)
def test_default_filters_and_compressor(dtype_expected: Any) -> None:
with config.set(
{
"v2_dtype_kind_to_default_filters_and_compressor": {
"biufcmM": ["zstd"],
"U": ["vlen-utf8"],
"OSV": ["vlen-bytes"],
},
}
):
dtype, expected = dtype_expected
arr = zarr.create(shape=(3,), path="foo", store={}, zarr_format=2, dtype=dtype)
assert arr.metadata.filters[0].codec_id == expected
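Note: with the shipped defaults and no explicit filters or compressor, a v2 array picks up its filters from the dtype kind alone — a short usage sketch, assuming the default config:

```python
import zarr

# No config override: defaults come from v2_dtype_kind_to_default_filters_and_compressor.
floats = zarr.create(shape=(3,), zarr_format=2, dtype="float64")
texts = zarr.create(shape=(3,), zarr_format=2, dtype="U5")

print(floats.metadata.filters[0].codec_id)  # "zstd"
print(texts.metadata.filters[0].codec_id)   # "vlen-utf8"
```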