Skip to content

Commit c8d7680

Browse files
committed
update data type names
1 parent 4f3381f commit c8d7680

File tree

3 files changed

+48
-40
lines changed

3 files changed

+48
-40
lines changed

src/zarr/core/array.py

Lines changed: 24 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -30,9 +30,6 @@
3030
from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec
3131
from zarr.abc.store import Store, set_or_delete
3232
from zarr.codecs._v2 import V2Codec
33-
from zarr.codecs.bytes import BytesCodec
34-
from zarr.codecs.vlen_utf8 import VLenBytesCodec, VLenUTF8Codec
35-
from zarr.codecs.zstd import ZstdCodec
3633
from zarr.core._info import ArrayInfo
3734
from zarr.core.array_spec import ArrayConfig, ArrayConfigLike, ArraySpec, parse_array_config
3835
from zarr.core.attributes import Attributes
@@ -69,8 +66,6 @@
6966
from zarr.core.config import config as zarr_config
7067
from zarr.core.dtype import (
7168
DTypeWrapper,
72-
FixedLengthAsciiString,
73-
VariableLengthString,
7469
parse_data_type,
7570
)
7671
from zarr.core.indexing import (
@@ -4224,21 +4219,29 @@ def _get_default_chunk_encoding_v3(
42244219
"""
42254220
Get the default ArrayArrayCodecs, ArrayBytesCodec, and BytesBytesCodec for a given dtype.
42264221
"""
4227-
filters = ()
4228-
compressors = (ZstdCodec(level=0, checksum=False),)
42294222
# TODO: find a registry-style solution for this that isn't bloated
42304223
# We need to associate specific dtypes with specific encoding schemes
42314224

4232-
if isinstance(dtype, VariableLengthString):
4233-
serializer = VLenUTF8Codec()
4234-
elif isinstance(dtype, FixedLengthAsciiString):
4235-
serializer = VLenBytesCodec()
4225+
if dtype._zarr_v3_name in zarr_config.get("array.v3_default_filters"):
4226+
filters = zarr_config.get(f"array.v3_default_filters.{dtype._zarr_v3_name}")
42364227
else:
4237-
if dtype.to_dtype().itemsize == 1:
4238-
serializer = BytesCodec(endian=None)
4239-
else:
4240-
serializer = BytesCodec()
4241-
return filters, serializer, compressors
4228+
filters = zarr_config.get("array.v3_default_filters.default")
4229+
4230+
if dtype._zarr_v3_name in zarr_config.get("array.v3_default_compressors"):
4231+
compressors = zarr_config.get(f"array.v3_default_compressors.{dtype._zarr_v3_name}")
4232+
else:
4233+
compressors = zarr_config.get("array.v3_default_compressors.default")
4234+
4235+
if dtype._zarr_v3_name in zarr_config.get("array.v3_default_serializer"):
4236+
serializer = zarr_config.get(f"array.v3_default_serializer.{dtype._zarr_v3_name}")
4237+
else:
4238+
serializer = zarr_config.get("array.v3_default_serializer.default")
4239+
4240+
return (
4241+
tuple(_parse_array_array_codec(f) for f in filters),
4242+
_parse_array_bytes_codec(serializer),
4243+
tuple(_parse_bytes_bytes_codec(c) for c in compressors),
4244+
)
42424245

42434246

42444247
def _get_default_chunk_encoding_v2(
@@ -4256,7 +4259,11 @@ def _get_default_chunk_encoding_v2(
42564259
compressor = zarr_config.get(f"array.v2_default_compressor.{dtype._zarr_v3_name}")
42574260
else:
42584261
compressor = zarr_config.get("array.v2_default_compressor.default")
4259-
return filters, compressor
4262+
4263+
if filters is not None:
4264+
filters = tuple(numcodecs.get_codec(f) for f in filters)
4265+
4266+
return filters, numcodecs.get_codec(compressor)
42604267

42614268

42624269
def _parse_chunk_encoding_v2(

src/zarr/core/config.py

Lines changed: 6 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -36,8 +36,6 @@
3636
if TYPE_CHECKING:
3737
from donfig.config_obj import ConfigSet
3838

39-
from collections import defaultdict
40-
4139

4240
class BadConfigError(ValueError):
4341
_msg = "bad Config: %r"
@@ -82,15 +80,15 @@ def enable_gpu(self) -> ConfigSet:
8280
"v2_default_compressor": {"default": {"id": "zstd", "level": 0, "checksum": False}},
8381
"v2_default_filters": {
8482
"default": None,
85-
"numpy.variable_length_unicode_string": [{"id": "vlen-utf8"}],
86-
"numpy.fixed_length_unicode_string": [{"id": "vlen-utf8"}],
87-
"r*": [{"id": "vlen-bytes"}],
83+
"variable_length_utf8": [{"id": "vlen-utf8"}],
84+
"fixed_length_ucs4": [{"id": "vlen-utf8"}],
85+
"fixed_length_ascii": [{"id": "vlen-bytes"}],
8886
},
89-
"v3_default_filters": defaultdict(list),
87+
"v3_default_filters": {"default": ()},
9088
"v3_default_serializer": {
9189
"default": {"name": "bytes", "configuration": {"endian": "little"}},
92-
"numpy.variable_length_unicode_string": [{"name": "vlen-utf8"}],
93-
"numpy.fixed_length_unicode_string": [{"name": "vlen-utf8"}],
90+
"variable_length_utf8": {"name": "vlen-utf8"},
91+
"fixed_length_ucs4": {"name": "vlen-utf8"},
9492
"r*": {"name": "vlen-bytes"},
9593
},
9694
"v3_default_compressors": {

src/zarr/core/dtype/_numpy.py

Lines changed: 18 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -496,7 +496,7 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.complex1
496496
@dataclass(frozen=True, kw_only=True)
497497
class FixedLengthAsciiString(DTypeWrapper[np.dtypes.BytesDType[Any], np.bytes_]):
498498
dtype_cls = np.dtypes.BytesDType
499-
_zarr_v3_name = "numpy.static_byte_string"
499+
_zarr_v3_name = "fixed_length_ascii"
500500
item_size_bits: ClassVar[int] = 8
501501
length: int = 1
502502

@@ -523,20 +523,20 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bytes_:
523523

524524

525525
@dataclass(frozen=True, kw_only=True)
526-
class FixedLengthBytes(DTypeWrapper[np.dtypes.VoidDType[Any], np.void]):
526+
class FixedLengthBytes(DTypeWrapper[np.dtypes.VoidDType, np.void]):
527527
dtype_cls = np.dtypes.VoidDType
528528
_zarr_v3_name = "r*"
529529
item_size_bits: ClassVar[int] = 8
530530
length: int = 1
531531

532532
@classmethod
533-
def _from_dtype_unsafe(cls, dtype: np.dtypes.VoidDType[Any]) -> Self:
533+
def _from_dtype_unsafe(cls, dtype: np.dtypes.VoidDType) -> Self:
534534
return cls(length=dtype.itemsize // (cls.item_size_bits // 8))
535535

536536
def default_value(self) -> np.void:
537537
return self.cast_value(("\x00" * self.length).encode("ascii"))
538538

539-
def to_dtype(self) -> np.dtypes.VoidDType[Any]:
539+
def to_dtype(self) -> np.dtypes.VoidDType:
540540
# Numpy does not allow creating a void type
541541
# by invoking np.dtypes.VoidDType directly
542542
return np.dtype(f"V{self.length}")
@@ -577,7 +577,7 @@ def check_dict(cls, data: dict[str, JSON]) -> TypeGuard[dict[str, JSON]]:
577577
isinstance(data, dict)
578578
and "name" in data
579579
and isinstance(data["name"], str)
580-
and re.match(r"^r\d+$", data["name"])
580+
and (re.match(r"^r\d+$", data["name"]) is not None)
581581
)
582582

583583
def to_json_value(self, data: np.void, *, zarr_format: ZarrFormat) -> str:
@@ -592,7 +592,7 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void:
592592
@dataclass(frozen=True, kw_only=True)
593593
class FixedLengthUnicodeString(DTypeWrapper[np.dtypes.StrDType[int], np.str_]):
594594
dtype_cls = np.dtypes.StrDType
595-
_zarr_v3_name = "numpy.fixed_length_unicode_string"
595+
_zarr_v3_name = "fixed_length_ucs4"
596596
item_size_bits: ClassVar[int] = 32 # UCS4 is 32 bits per code point
597597
endianness: Endianness | None = "native"
598598
length: int = 1
@@ -605,7 +605,10 @@ def _from_dtype_unsafe(cls, dtype: np.dtypes.StrDType[int]) -> Self:
605605
)
606606

607607
def to_dtype(self) -> np.dtypes.StrDType[int]:
608-
return self.dtype_cls(self.length).newbyteorder(endianness_to_numpy_str(self.endianness))
608+
return cast(
609+
np.dtypes.StrDType[int],
610+
self.dtype_cls(self.length).newbyteorder(endianness_to_numpy_str(self.endianness)),
611+
)
609612

610613
def default_value(self) -> np.str_:
611614
return np.str_("")
@@ -627,7 +630,7 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> np.str_:
627630
@dataclass(frozen=True, kw_only=True)
628631
class VariableLengthString(DTypeWrapper[np.dtypes.StringDType, str]):
629632
dtype_cls = np.dtypes.StringDType
630-
_zarr_v3_name = "numpy.variable_length_string"
633+
_zarr_v3_name = "variable_length_utf8"
631634

632635
@classmethod
633636
def _from_dtype_unsafe(cls, dtype: np.dtypes.StringDType) -> Self:
@@ -658,14 +661,14 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str:
658661
@dataclass(frozen=True, kw_only=True)
659662
class VariableLengthString(DTypeWrapper[np.dtypes.ObjectDType, str]):
660663
dtype_cls = np.dtypes.ObjectDType
661-
_zarr_v3_name = "numpy.variable_length_string"
664+
_zarr_v3_name = "variable_length_utf8"
662665

663666
@classmethod
664667
def _from_dtype_unsafe(cls, dtype: np.dtypes.ObjectDType) -> Self:
665668
return cls()
666669

667670
def to_dtype(self) -> np.dtypes.ObjectDType:
668-
return self.dtype_cls()
671+
return cast(np.dtypes.ObjectDType, self.dtype_cls())
669672

670673
def cast_value(self, value: object) -> str:
671674
return str(value)
@@ -695,7 +698,7 @@ def from_json_value(self, data: JSON, *, zarr_format: ZarrFormat) -> str:
695698
@dataclass(frozen=True, kw_only=True)
696699
class DateTime64(DTypeWrapper[np.dtypes.DateTime64DType, np.datetime64]):
697700
dtype_cls = np.dtypes.DateTime64DType
698-
_zarr_v3_name = "numpy.datetime64"
701+
_zarr_v3_name = "datetime64"
699702
unit: DateUnit | TimeUnit = "s"
700703
endianness: Endianness = "native"
701704

@@ -713,7 +716,7 @@ def _from_dtype_unsafe(cls, dtype: np.dtypes.DateTime64DType) -> Self:
713716
return cls(unit=unit, endianness=endianness_from_numpy_str(dtype.byteorder))
714717

715718
def cast_value(self, value: object) -> np.datetime64:
716-
return self.to_dtype().type(value, self.unit)
719+
return cast(np.datetime64, self.to_dtype().type(value, self.unit))
717720

718721
def to_dtype(self) -> np.dtypes.DateTime64DType:
719722
# Numpy does not allow creating datetime64 via
@@ -734,14 +737,14 @@ def to_json_value(self, data: np.datetime64, *, zarr_format: ZarrFormat) -> int:
734737
@dataclass(frozen=True, kw_only=True)
735738
class Structured(DTypeWrapper[np.dtypes.VoidDType, np.void]):
736739
dtype_cls = np.dtypes.VoidDType
737-
_zarr_v3_name = "numpy.structured"
740+
_zarr_v3_name = "structured"
738741
fields: tuple[tuple[str, DTypeWrapper[Any, Any]], ...]
739742

740743
def default_value(self) -> np.void:
741744
return self.cast_value(0)
742745

743746
def cast_value(self, value: object) -> np.void:
744-
return np.array([value], dtype=self.to_dtype())[0]
747+
return cast(np.void, np.array([value], dtype=self.to_dtype())[0])
745748

746749
@classmethod
747750
def check_dtype(cls, dtype: np.dtypes.DTypeLike) -> TypeGuard[np.dtypes.VoidDType]:
@@ -787,7 +790,7 @@ def to_dict(self) -> dict[str, JSON]:
787790
return base_dict
788791

789792
@classmethod
790-
def check_dict(cls, data: JSON) -> bool:
793+
def check_dict(cls, data: JSON) -> TypeGuard[JSON]:
791794
return (
792795
isinstance(data, dict)
793796
and "name" in data

0 commit comments

Comments
 (0)