Skip to content

Commit bec9512

Browse files
committed
Merge branch 'main' of github.com:zarr-developers/zarr-python into docs/dtype-docs
2 parents a785e35 + baabf08 commit bec9512

File tree

15 files changed

+303
-40
lines changed

15 files changed

+303
-40
lines changed

changes/3151.bugfix.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Fixed an issue preventing correct parsing of NumPy ``int32`` dtypes when constructed via
2+
``np.dtype('i')``.

changes/3170.bugfix.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
Fixes a variety of issues related to string data types.
2+
3+
- Brings the ``VariableLengthUTF8`` data type Zarr V3 identifier in alignment with Zarr Python 3.0.8
4+
- Disallows creation of 0-length fixed-length data types
5+
- Adds a regression test for the ``VariableLengthUTF8`` data type that checks against version 3.0.8
6+
- Allows users to request the ``VariableLengthUTF8`` data type with ``str``, ``"str"``, or ``"string"``.

src/zarr/core/dtype/__init__.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,12 @@
110110
VariableLengthBytes,
111111
)
112112

113+
# These are aliases for variable-length UTF-8 strings
114+
# We handle them when a user requests a data type instead of using NumPy's dtype inference because
115+
# the default NumPy behavior -- to inspect the user-provided array data and choose
116+
# an appropriately sized U dtype -- is unworkable for Zarr.
117+
VLEN_UTF8_ALIAS: Final = ("str", str, "string")
118+
113119
# This type models inputs that can be coerced to a ZDType
114120
ZDTypeLike: TypeAlias = npt.DTypeLike | ZDType[TBaseDType, TBaseScalar] | Mapping[str, JSON] | str
115121

@@ -158,6 +164,10 @@ def parse_data_type(
158164
# dict and zarr_format 3 means that we have a JSON object representation of the dtype
159165
if zarr_format == 3 and isinstance(dtype_spec, Mapping):
160166
return get_data_type_from_json(dtype_spec, zarr_format=3)
167+
if dtype_spec in VLEN_UTF8_ALIAS:
168+
# If the dtype request is one of the aliases for variable-length UTF-8 strings,
169+
# return that dtype.
170+
return VariableLengthUTF8() # type: ignore[return-value]
161171
# otherwise, we have either a numpy dtype string, or a zarr v3 dtype string, and in either case
162172
# we can create a numpy dtype from it, and do the dtype inference from that
163173
return get_data_type_from_native_dtype(dtype_spec) # type: ignore[arg-type]

src/zarr/core/dtype/npy/bytes.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,14 @@ class NullTerminatedBytes(ZDType[np.dtypes.BytesDType[int], np.bytes_], HasLengt
6363
dtype_cls = np.dtypes.BytesDType
6464
_zarr_v3_name: ClassVar[Literal["null_terminated_bytes"]] = "null_terminated_bytes"
6565

66+
def __post_init__(self) -> None:
67+
"""
68+
We don't allow instances of this class with length less than 1 because there is no way such
69+
a data type can contain actual data.
70+
"""
71+
if self.length < 1:
72+
raise ValueError(f"length must be >= 1, got {self.length}.")
73+
6674
@classmethod
6775
def from_native_dtype(cls, dtype: TBaseDType) -> Self:
6876
"""
@@ -429,6 +437,14 @@ class does not support structured data types.
429437
dtype_cls = np.dtypes.VoidDType # type: ignore[assignment]
430438
_zarr_v3_name: ClassVar[Literal["raw_bytes"]] = "raw_bytes"
431439

440+
def __post_init__(self) -> None:
441+
"""
442+
We don't allow instances of this class with length less than 1 because there is no way such
443+
a data type can contain actual data.
444+
"""
445+
if self.length < 1:
446+
raise ValueError(f"length must be >= 1, got {self.length}.")
447+
432448
@classmethod
433449
def _check_native_dtype(
434450
cls: type[Self], dtype: TBaseDType

src/zarr/core/dtype/npy/int.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -876,7 +876,28 @@ class Int32(BaseInt[np.dtypes.Int32DType, np.int32], HasEndianness):
876876
_zarr_v2_names: ClassVar[tuple[Literal[">i4"], Literal["<i4"]]] = (">i4", "<i4")
877877

878878
@classmethod
879-
def from_native_dtype(cls, dtype: TBaseDType) -> Self:
879+
def _check_native_dtype(cls: type[Self], dtype: TBaseDType) -> TypeGuard[np.dtypes.Int32DType]:
880+
"""
881+
A type guard that checks if the input is assignable to the type of ``cls.dtype_cls``.
882+
883+
This method is overridden for this particular data type because of a Windows-specific issue
884+
where np.dtype('i') creates an instance of ``np.dtypes.IntDType``, rather than an
885+
instance of ``np.dtypes.Int32DType``, even though both represent 32-bit signed integers.
886+
887+
Parameters
888+
----------
889+
dtype : TBaseDType
890+
The dtype to check.
891+
892+
Returns
893+
-------
894+
bool
895+
True if the dtype matches, False otherwise.
896+
"""
897+
return super()._check_native_dtype(dtype) or dtype == np.dtypes.Int32DType()
898+
899+
@classmethod
900+
def from_native_dtype(cls: type[Self], dtype: TBaseDType) -> Self:
880901
"""
881902
Create an Int32 from a np.dtype('int32') instance.
882903

src/zarr/core/dtype/npy/string.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,14 @@ class FixedLengthUTF32(
7979
_zarr_v3_name: ClassVar[Literal["fixed_length_utf32"]] = "fixed_length_utf32"
8080
code_point_bytes: ClassVar[int] = 4 # utf32 is 4 bytes per code point
8181

82+
def __post_init__(self) -> None:
83+
"""
84+
We don't allow instances of this class with length less than 1 because there is no way such
85+
a data type can contain actual data.
86+
"""
87+
if self.length < 1:
88+
raise ValueError(f"length must be >= 1, got {self.length}.")
89+
8290
@classmethod
8391
def from_native_dtype(cls, dtype: TBaseDType) -> Self:
8492
"""
@@ -386,7 +394,7 @@ class UTF8Base(ZDType[TDType_co, str], HasObjectCodec):
386394
The object codec ID for this data type.
387395
"""
388396

389-
_zarr_v3_name: ClassVar[Literal["variable_length_utf8"]] = "variable_length_utf8"
397+
_zarr_v3_name: ClassVar[Literal["string"]] = "string"
390398
object_codec_id: ClassVar[Literal["vlen-utf8"]] = "vlen-utf8"
391399

392400
@classmethod
@@ -509,11 +517,11 @@ def to_json(
509517
self, zarr_format: Literal[2]
510518
) -> DTypeConfig_V2[Literal["|O"], Literal["vlen-utf8"]]: ...
511519
@overload
512-
def to_json(self, zarr_format: Literal[3]) -> Literal["variable_length_utf8"]: ...
520+
def to_json(self, zarr_format: Literal[3]) -> Literal["string"]: ...
513521

514522
def to_json(
515523
self, zarr_format: ZarrFormat
516-
) -> DTypeConfig_V2[Literal["|O"], Literal["vlen-utf8"]] | Literal["variable_length_utf8"]:
524+
) -> DTypeConfig_V2[Literal["|O"], Literal["vlen-utf8"]] | Literal["string"]:
517525
"""
518526
Convert this data type to a JSON representation.
519527
@@ -524,13 +532,12 @@ def to_json(
524532
525533
Returns
526534
-------
527-
``DTypeConfig_V2[Literal["|O"], Literal["vlen-utf8"]] | Literal["variable_length_utf8"]``
535+
``DTypeConfig_V2[Literal["|O"], Literal["vlen-utf8"]] | Literal["string"]``
528536
The JSON representation of this data type.
529537
"""
530538
if zarr_format == 2:
531539
return {"name": "|O", "object_codec_id": self.object_codec_id}
532540
elif zarr_format == 3:
533-
v3_unstable_dtype_warning(self)
534541
return self._zarr_v3_name
535542
raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover
536543

src/zarr/core/dtype/npy/structured.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,10 @@ class Structured(ZDType[np.dtypes.VoidDType[int], np.void], HasItemSize):
4949
_zarr_v3_name: ClassVar[Literal["structured"]] = "structured"
5050
fields: tuple[tuple[str, ZDType[TBaseDType, TBaseScalar]], ...]
5151

52+
def __post_init__(self) -> None:
53+
if len(self.fields) < 1:
54+
raise ValueError(f"must have at least one field. Got {self.fields!r}")
55+
5256
@classmethod
5357
def _check_native_dtype(cls, dtype: TBaseDType) -> TypeGuard[np.dtypes.VoidDType[int]]:
5458
"""

tests/test_array.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@
4141
from zarr.core.chunk_grids import _auto_partition
4242
from zarr.core.chunk_key_encodings import ChunkKeyEncodingParams
4343
from zarr.core.common import JSON, MemoryOrder, ZarrFormat
44-
from zarr.core.dtype import get_data_type_from_native_dtype
44+
from zarr.core.dtype import parse_data_type
4545
from zarr.core.dtype.common import ENDIANNESS_STR, EndiannessStr
4646
from zarr.core.dtype.npy.common import NUMPY_ENDIANNESS_STR, endianness_from_numpy_str
4747
from zarr.core.dtype.npy.float import Float32, Float64
@@ -1285,7 +1285,7 @@ async def test_v2_chunk_encoding(
12851285
filters=filters,
12861286
)
12871287
filters_expected, compressor_expected = _parse_chunk_encoding_v2(
1288-
filters=filters, compressor=compressors, dtype=get_data_type_from_native_dtype(dtype)
1288+
filters=filters, compressor=compressors, dtype=parse_data_type(dtype, zarr_format=2)
12891289
)
12901290
assert arr.metadata.zarr_format == 2 # guard for mypy
12911291
assert arr.metadata.compressor == compressor_expected

tests/test_dtype/test_npy/test_bytes.py

Lines changed: 21 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ class TestNullTerminatedBytes(BaseTestZDType):
1515
np.dtype("|U10"),
1616
)
1717
valid_json_v2 = (
18-
{"name": "|S0", "object_codec_id": None},
18+
{"name": "|S1", "object_codec_id": None},
1919
{"name": "|S2", "object_codec_id": None},
2020
{"name": "|S4", "object_codec_id": None},
2121
)
@@ -31,22 +31,22 @@ class TestNullTerminatedBytes(BaseTestZDType):
3131
)
3232

3333
scalar_v2_params = (
34-
(NullTerminatedBytes(length=0), ""),
34+
(NullTerminatedBytes(length=1), "MA=="),
3535
(NullTerminatedBytes(length=2), "YWI="),
3636
(NullTerminatedBytes(length=4), "YWJjZA=="),
3737
)
3838
scalar_v3_params = (
39-
(NullTerminatedBytes(length=0), ""),
39+
(NullTerminatedBytes(length=1), "MA=="),
4040
(NullTerminatedBytes(length=2), "YWI="),
4141
(NullTerminatedBytes(length=4), "YWJjZA=="),
4242
)
4343
cast_value_params = (
44-
(NullTerminatedBytes(length=0), "", np.bytes_("")),
44+
(NullTerminatedBytes(length=1), "", np.bytes_("")),
4545
(NullTerminatedBytes(length=2), "ab", np.bytes_("ab")),
4646
(NullTerminatedBytes(length=4), "abcdefg", np.bytes_("abcd")),
4747
)
4848
item_size_params = (
49-
NullTerminatedBytes(length=0),
49+
NullTerminatedBytes(length=1),
5050
NullTerminatedBytes(length=4),
5151
NullTerminatedBytes(length=10),
5252
)
@@ -62,7 +62,7 @@ class TestRawBytes(BaseTestZDType):
6262
)
6363
valid_json_v2 = ({"name": "|V10", "object_codec_id": None},)
6464
valid_json_v3 = (
65-
{"name": "raw_bytes", "configuration": {"length_bytes": 0}},
65+
{"name": "raw_bytes", "configuration": {"length_bytes": 1}},
6666
{"name": "raw_bytes", "configuration": {"length_bytes": 8}},
6767
)
6868

@@ -77,22 +77,22 @@ class TestRawBytes(BaseTestZDType):
7777
)
7878

7979
scalar_v2_params = (
80-
(RawBytes(length=0), ""),
80+
(RawBytes(length=1), "AA=="),
8181
(RawBytes(length=2), "YWI="),
8282
(RawBytes(length=4), "YWJjZA=="),
8383
)
8484
scalar_v3_params = (
85-
(RawBytes(length=0), ""),
85+
(RawBytes(length=1), "AA=="),
8686
(RawBytes(length=2), "YWI="),
8787
(RawBytes(length=4), "YWJjZA=="),
8888
)
8989
cast_value_params = (
90-
(RawBytes(length=0), b"", np.void(b"")),
90+
(RawBytes(length=1), b"\x00", np.void(b"\x00")),
9191
(RawBytes(length=2), b"ab", np.void(b"ab")),
9292
(RawBytes(length=4), b"abcd", np.void(b"abcd")),
9393
)
9494
item_size_params = (
95-
RawBytes(length=0),
95+
RawBytes(length=1),
9696
RawBytes(length=4),
9797
RawBytes(length=10),
9898
)
@@ -152,3 +152,14 @@ def test_unstable_dtype_warning(
152152
"""
153153
with pytest.raises(UnstableSpecificationWarning):
154154
zdtype.to_json(zarr_format=3)
155+
156+
157+
@pytest.mark.parametrize("zdtype_cls", [NullTerminatedBytes, RawBytes])
158+
def test_invalid_size(zdtype_cls: type[NullTerminatedBytes] | type[RawBytes]) -> None:
159+
"""
160+
Test that it's impossible to create a data type that has no length
161+
"""
162+
length = 0
163+
msg = f"length must be >= 1, got {length}."
164+
with pytest.raises(ValueError, match=msg):
165+
zdtype_cls(length=length)

tests/test_dtype/test_npy/test_int.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,10 @@ class TestInt16(BaseTestZDType):
7575
class TestInt32(BaseTestZDType):
7676
test_cls = Int32
7777
scalar_type = np.int32
78-
valid_dtype = (np.dtype(">i4"), np.dtype("<i4"))
78+
# The behavior of some tests associated with this class variable is
79+
# order-dependent -- np.dtype('i') correctly fails certain tests only if it's not
80+
# in the last position of the tuple. I have no idea how this is possible!
81+
valid_dtype = (np.dtype("i"), np.dtype(">i4"), np.dtype("<i4"))
7982
invalid_dtype = (
8083
np.dtype(np.int8),
8184
np.dtype(np.uint16),

0 commit comments

Comments
 (0)