Skip to content

Commit 07590ca

Browse files
committed
fix v2 decode string dtype
1 parent f93ced2 commit 07590ca

File tree

3 files changed

+38
-13
lines changed

3 files changed

+38
-13
lines changed

src/zarr/codecs/_v2.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from typing import TYPE_CHECKING
66

77
import numcodecs
8+
import numpy as np
89
from numcodecs.compat import ensure_ndarray_like
910

1011
from zarr.abc.codec import ArrayBytesCodec
@@ -43,12 +44,15 @@ async def _decode_single(
4344

4445
# view as numpy array with correct dtype
4546
chunk = ensure_ndarray_like(chunk)
47+
print(chunk)
48+
print(chunk.dtype)
4649
# special case object dtype, because incorrect handling can lead to
4750
# segfaults and other bad things happening
4851
if chunk_spec.dtype != object:
49-
print(chunk_spec.dtype, chunk.dtype)
50-
chunk = chunk.view(chunk_spec.dtype)
51-
print("worked")
52+
try:
53+
chunk = chunk.view(chunk_spec.dtype)
54+
except TypeError:
55+
chunk = np.array(chunk).astype(chunk_spec.dtype)
5256

5357
elif chunk.dtype != object:
5458
# If we end up here, someone must have hacked around with the filters.

src/zarr/core/config.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,8 +66,8 @@ def reset(self) -> None:
6666
"ndbuffer": "zarr.core.buffer.cpu.NDBuffer",
6767
"v2_dtype_kind_to_default_filters_and_compressor": {
6868
"biufcmM": ["zstd"],
69-
"SV": ["vlen-bytes"],
70-
"OU": ["vlen-utf8"],
69+
"U": ["vlen-utf8"],
70+
"OSV": ["vlen-bytes"],
7171
},
7272
}
7373
],

tests/test_v2.py

Lines changed: 29 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ async def test_v2_encode_decode(dtype):
8383
with config.set(
8484
{
8585
"v2_dtype_kind_to_default_filters_and_compressor": {
86-
"OSUV": ["vlen-bytes"],
86+
"SV": ["vlen-bytes"],
8787
},
8888
}
8989
):
@@ -119,15 +119,37 @@ async def test_v2_encode_decode(dtype):
119119
np.testing.assert_equal(data, expected)
120120

121121

122+
@pytest.mark.parametrize("dtype_value", [["|S", b"Y"], ["|U", "Y"], ["O", b"Y"]])
123+
def test_v2_encode_decode_with_data(dtype_value):
124+
dtype, value = dtype_value
125+
with config.set(
126+
{
127+
"v2_dtype_kind_to_default_filters_and_compressor": {
128+
"U": ["vlen-utf8"],
129+
"OSV": ["vlen-bytes"],
130+
},
131+
}
132+
):
133+
expected = np.full((3,), value, dtype=dtype)
134+
a = zarr.create(
135+
shape=(3,),
136+
zarr_format=2,
137+
dtype=dtype,
138+
)
139+
a[:] = expected
140+
data = a[:]
141+
np.testing.assert_equal(data, expected)
142+
143+
122144
@pytest.mark.parametrize("dtype", [str, "str"])
123145
async def test_create_dtype_str(dtype: Any) -> None:
124146
arr = zarr.create(shape=3, dtype=dtype, zarr_format=2)
125147
assert arr.dtype.kind == "O"
126148
assert arr.metadata.to_dict()["dtype"] == "|O"
127-
assert arr.metadata.filters == (numcodecs.vlen.VLenUTF8(),)
128-
arr[:] = ["a", "bb", "ccc"]
149+
assert arr.metadata.filters == (numcodecs.vlen.VLenBytes(),)
150+
arr[:] = [b"a", b"bb", b"ccc"]
129151
result = arr[:]
130-
np.testing.assert_array_equal(result, np.array(["a", "bb", "ccc"], dtype="object"))
152+
np.testing.assert_array_equal(result, np.array([b"a", b"bb", b"ccc"], dtype="object"))
131153

132154

133155
@pytest.mark.parametrize("filters", [[], [numcodecs.Delta(dtype="<i4")], [numcodecs.Zlib(level=2)]])
@@ -141,19 +163,18 @@ def test_v2_filters_codecs(filters: Any) -> None:
141163

142164
@pytest.mark.parametrize(
143165
"dtype_expected",
144-
# [["b", "zstd"], ["i", "zstd"], ["f", "zstd"], ["|S1", "vlen-utf8"], ["|U1", "vlen-utf8"]],
145-
[["|S1", "vlen-bytes"]],
166+
[["b", "zstd"], ["i", "zstd"], ["f", "zstd"], ["|S1", "vlen-bytes"], ["|U1", "vlen-utf8"]],
146167
)
147168
def test_default_filters_and_compressor(dtype_expected: Any) -> None:
148169
with config.set(
149170
{
150171
"v2_dtype_kind_to_default_filters_and_compressor": {
151172
"biufcmM": ["zstd"],
152-
"OSUV": ["vlen-bytes"],
173+
"U": ["vlen-utf8"],
174+
"OSV": ["vlen-bytes"],
153175
},
154176
}
155177
):
156178
dtype, expected = dtype_expected
157179
arr = zarr.create(shape=(3,), path="foo", store={}, zarr_format=2, dtype=dtype)
158180
assert arr.metadata.filters[0].codec_id == expected
159-
arr[:] = np.array(["a", "bb", "ccc"], dtype=dtype)

0 commit comments

Comments
 (0)