Skip to content

Commit 67010ce

Browse files
committed
incorporate feedback
1 parent 15577ae commit 67010ce

File tree

9 files changed

+119
-54
lines changed

9 files changed

+119
-54
lines changed

src/zarr/api/asynchronous.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,12 @@
1717
ChunkCoords,
1818
MemoryOrder,
1919
ZarrFormat,
20+
parse_dtype,
2021
)
2122
from zarr.core.config import config
2223
from zarr.core.group import AsyncGroup, ConsolidatedMetadata, GroupMetadata
2324
from zarr.core.metadata import ArrayMetadataDict, ArrayV2Metadata, ArrayV3Metadata
25+
from zarr.core.metadata.v2 import _default_filters_and_compressor
2426
from zarr.errors import NodeTypeValidationError
2527
from zarr.storage import (
2628
StoreLike,
@@ -885,8 +887,17 @@ async def create(
885887
or _default_zarr_version()
886888
)
887889

888-
if zarr_format == 2 and chunks is None:
889-
chunks = shape
890+
if zarr_format == 2:
891+
if chunks is None:
892+
chunks = shape
893+
dtype = parse_dtype(dtype, zarr_format)
894+
if not filters and not compressor:
895+
filters, compressor = _default_filters_and_compressor(dtype)
896+
if np.issubdtype(dtype, np.str_):
897+
filters = filters or []
898+
if not any(x["id"] == "vlen-utf8" for x in filters):
899+
filters = list(filters) + [{"id": "vlen-utf8"}]
900+
890901
elif zarr_format == 3 and chunk_shape is None:
891902
if chunks is not None:
892903
chunk_shape = chunks

src/zarr/codecs/_v2.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,12 @@ async def _decode_single(
5050
try:
5151
chunk = chunk.view(chunk_spec.dtype)
5252
except TypeError:
53+
# this will happen if the dtype of the chunk
54+
# does not match the dtype of the array spec, e.g. if
55+
# the dtype of the chunk_spec is a string dtype, but the chunk
56+
# is an object array. In this case, we need to convert the object
57+
# array to the correct dtype.
58+
5359
chunk = np.array(chunk).astype(chunk_spec.dtype)
5460

5561
elif chunk.dtype != object:

src/zarr/core/array.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@
7777
ArrayV3MetadataDict,
7878
T_ArrayMetadata,
7979
)
80+
from zarr.core.metadata.v2 import _default_filters_and_compressor
8081
from zarr.core.metadata.v3 import parse_node_type_array
8182
from zarr.core.sync import sync
8283
from zarr.errors import MetadataValidationError
@@ -617,6 +618,14 @@ async def _create_v2(
617618
if dimension_separator is None:
618619
dimension_separator = "."
619620

621+
dtype = parse_dtype(dtype, 2)
622+
if not filters and not compressor:
623+
filters, compressor = _default_filters_and_compressor(dtype)
624+
if np.issubdtype(dtype, np.str_):
625+
filters = filters or []
626+
if not any(x["id"] == "vlen-utf8" for x in filters):
627+
filters = list(filters) + [{"id": "vlen-utf8"}]
628+
620629
metadata = ArrayV2Metadata(
621630
shape=shape,
622631
dtype=np.dtype(dtype),

src/zarr/core/metadata/v2.py

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -71,14 +71,7 @@ def __init__(
7171
shape_parsed = parse_shapelike(shape)
7272
dtype_parsed = parse_dtype(dtype)
7373
chunks_parsed = parse_shapelike(chunks)
74-
if not filters and not compressor:
75-
filters, compressor = _default_filters_and_compressor(dtype_parsed)
76-
if dtype is str or dtype == "str":
77-
vlen_codec: dict[str, JSON] = {"id": "vlen-utf8"}
78-
if filters and not any(x["id"] == "vlen-utf8" for x in filters):
79-
filters = list(filters) + [vlen_codec]
80-
else:
81-
filters = [vlen_codec]
74+
8275
compressor_parsed = parse_compressor(compressor)
8376
order_parsed = parse_indexing_order(order)
8477
dimension_separator_parsed = parse_separator(dimension_separator)
@@ -343,6 +336,7 @@ def _default_filters_and_compressor(
343336
344337
https://numpy.org/doc/2.1/reference/generated/numpy.dtype.kind.html
345338
"""
339+
dtype = np.dtype(dtype)
346340
default_compressors = config.get("v2_default_compressors")
347341
if dtype.kind in "biufcmM":
348342
dtype_key = "numeric"

tests/test_array.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import numcodecs
99
import numpy as np
1010
import pytest
11+
from numcodecs import Zstd
1112

1213
import zarr.api.asynchronous
1314
from zarr import Array, AsyncArray, Group
@@ -513,6 +514,7 @@ async def test_info_v2_async(self) -> None:
513514
_order="C",
514515
_read_only=False,
515516
_store_type="MemoryStore",
517+
_filters=(Zstd(level=0),),
516518
_count_bytes=128,
517519
)
518520
assert result == expected

tests/test_group.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
import numpy as np
1010
import pytest
11+
from numcodecs import Zstd
1112

1213
import zarr
1314
import zarr.api.asynchronous
@@ -496,6 +497,7 @@ def test_group_child_iterators(store: Store, zarr_format: ZarrFormat, consolidat
496497
"shape": (1,),
497498
"chunks": (1,),
498499
"order": "C",
500+
"filters": (Zstd(level=0),),
499501
"zarr_format": zarr_format,
500502
},
501503
"subgroup": {

tests/test_metadata/test_consolidated.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
import numpy as np
77
import pytest
8+
from numcodecs import Zstd
89

910
import zarr.api.asynchronous
1011
import zarr.api.synchronous
@@ -486,6 +487,7 @@ async def test_consolidated_metadata_v2(self):
486487
attributes={"key": "a"},
487488
chunks=(1,),
488489
fill_value=None,
490+
filters=(Zstd(level=0),),
489491
order="C",
490492
),
491493
"g1": GroupMetadata(

tests/test_metadata/test_v2.py

Lines changed: 1 addition & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from zarr.core.buffer import cpu
1212
from zarr.core.group import ConsolidatedMetadata, GroupMetadata
1313
from zarr.core.metadata import ArrayV2Metadata
14-
from zarr.core.metadata.v2 import _default_filters_and_compressor, parse_zarr_format
14+
from zarr.core.metadata.v2 import parse_zarr_format
1515

1616
if TYPE_CHECKING:
1717
from typing import Any
@@ -77,15 +77,6 @@ def test_metadata_to_dict(
7777
assert observed["dimension_separator"] == expected_dimension_sep
7878
observed.pop("dimension_separator")
7979

80-
if not filters and not compressor:
81-
assert observed["filters"], observed["compressor"] == _default_filters_and_compressor(
82-
np.dtype(data_type)
83-
)
84-
observed.pop("filters")
85-
observed.pop("compressor")
86-
expected.pop("filters")
87-
expected.pop("compressor")
88-
8980
assert observed == expected
9081

9182

tests/test_v2.py

Lines changed: 82 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
import zarr
1212
import zarr.core.buffer
1313
import zarr.storage
14-
from zarr import Array
14+
from zarr import Array, config
1515
from zarr.storage import MemoryStore, StorePath
1616

1717

@@ -82,47 +82,76 @@ def test_codec_pipeline() -> None:
8282

8383
@pytest.mark.parametrize("dtype", ["|S", "|V"])
8484
async def test_v2_encode_decode(dtype):
85-
store = zarr.storage.MemoryStore()
86-
g = zarr.group(store=store, zarr_format=2)
87-
g.create_array(
88-
name="foo",
89-
shape=(3,),
90-
chunks=(3,),
91-
dtype=dtype,
92-
fill_value=b"X",
93-
)
94-
95-
result = await store.get("foo/.zarray", zarr.core.buffer.default_buffer_prototype())
96-
assert result is not None
97-
98-
serialized = json.loads(result.to_bytes())
99-
expected = {
100-
"chunks": [3],
101-
"compressor": None,
102-
"dtype": f"{dtype}0",
103-
"fill_value": "WA==",
104-
"filters": None,
105-
"order": "C",
106-
"shape": [3],
107-
"zarr_format": 2,
108-
"dimension_separator": ".",
109-
}
110-
assert serialized == expected
111-
112-
data = zarr.open_array(store=store, path="foo")[:]
113-
expected = np.full((3,), b"X", dtype=dtype)
114-
np.testing.assert_equal(data, expected)
85+
with config.set(
86+
{
87+
"v2_default_compressors": {
88+
"bytes": ["vlen-bytes"],
89+
},
90+
}
91+
):
92+
store = zarr.storage.MemoryStore()
93+
g = zarr.group(store=store, zarr_format=2)
94+
g.create_array(
95+
name="foo",
96+
shape=(3,),
97+
chunks=(3,),
98+
dtype=dtype,
99+
fill_value=b"X",
100+
)
101+
102+
result = await store.get("foo/.zarray", zarr.core.buffer.default_buffer_prototype())
103+
assert result is not None
104+
105+
serialized = json.loads(result.to_bytes())
106+
expected = {
107+
"chunks": [3],
108+
"compressor": None,
109+
"dtype": f"{dtype}0",
110+
"fill_value": "WA==",
111+
"filters": [{"id": "vlen-bytes"}],
112+
"order": "C",
113+
"shape": [3],
114+
"zarr_format": 2,
115+
"dimension_separator": ".",
116+
}
117+
assert serialized == expected
118+
119+
data = zarr.open_array(store=store, path="foo")[:]
120+
expected = np.full((3,), b"X", dtype=dtype)
121+
np.testing.assert_equal(data, expected)
122+
123+
124+
@pytest.mark.parametrize("dtype_value", [["|S", b"Y"], ["|U", "Y"], ["O", b"Y"]])
125+
def test_v2_encode_decode_with_data(dtype_value):
126+
dtype, value = dtype_value
127+
with config.set(
128+
{
129+
"v2_default_compressors": {
130+
"unicode": ["vlen-utf8"],
131+
"bytes": ["vlen-bytes"],
132+
},
133+
}
134+
):
135+
expected = np.full((3,), value, dtype=dtype)
136+
a = zarr.create(
137+
shape=(3,),
138+
zarr_format=2,
139+
dtype=dtype,
140+
)
141+
a[:] = expected
142+
data = a[:]
143+
np.testing.assert_equal(data, expected)
115144

116145

117146
@pytest.mark.parametrize("dtype", [str, "str"])
118147
async def test_create_dtype_str(dtype: Any) -> None:
119148
arr = zarr.create(shape=3, dtype=dtype, zarr_format=2)
120149
assert arr.dtype.kind == "O"
121150
assert arr.metadata.to_dict()["dtype"] == "|O"
122-
assert arr.metadata.filters == (numcodecs.vlen.VLenUTF8(),)
123-
arr[:] = ["a", "bb", "ccc"]
151+
assert arr.metadata.filters == (numcodecs.vlen.VLenBytes(),)
152+
arr[:] = [b"a", b"bb", b"ccc"]
124153
result = arr[:]
125-
np.testing.assert_array_equal(result, np.array(["a", "bb", "ccc"], dtype="object"))
154+
np.testing.assert_array_equal(result, np.array([b"a", b"bb", b"ccc"], dtype="object"))
126155

127156

128157
@pytest.mark.parametrize("filters", [[], [numcodecs.Delta(dtype="<i4")], [numcodecs.Zlib(level=2)]])
@@ -177,3 +206,22 @@ def test_v2_non_contiguous(array_order: Literal["C", "F"], data_order: Literal["
177206
assert a.flags.c_contiguous
178207
arr[slice(6, 9, None), slice(3, 6, None)] = a
179208
np.testing.assert_array_equal(arr[slice(6, 9, None), slice(3, 6, None)], a)
209+
210+
211+
@pytest.mark.parametrize(
212+
"dtype_expected",
213+
[["b", "zstd"], ["i", "zstd"], ["f", "zstd"], ["|S1", "vlen-bytes"], ["|U1", "vlen-utf8"]],
214+
)
215+
def test_default_filters_and_compressor(dtype_expected: Any) -> None:
216+
with config.set(
217+
{
218+
"v2_dtype_kind_to_default_filters_and_compressor": {
219+
"numeric": ["zstd"],
220+
"unicode": ["vlen-utf8"],
221+
"bytes": ["vlen-bytes"],
222+
},
223+
}
224+
):
225+
dtype, expected = dtype_expected
226+
arr = zarr.create(shape=(3,), path="foo", store={}, zarr_format=2, dtype=dtype)
227+
assert arr.metadata.filters[0].codec_id == expected

0 commit comments

Comments
 (0)