Skip to content

Commit b9c3627

Browse files
committed
Prototype of new DType interface
1 parent 2be9f36 commit b9c3627

File tree

3 files changed

+329
-10
lines changed

3 files changed

+329
-10
lines changed

src/zarr/core/dtype/__init__.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
from zarr.core.dtype.core import (
2+
ZarrDType
3+
)
4+
5+
__all__ = [
6+
"ZarrDType"
7+
]

src/zarr/core/dtype/core.py

Lines changed: 204 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,204 @@
1+
"""
2+
# Overview
3+
4+
This module provides a proof-of-concept standalone interface for managing dtypes in the zarr-python codebase.
5+
6+
The `ZarrDType` class introduced in this module effectively acts as a replacement for `np.dtype` throughout the
7+
zarr-python codebase. It attempts to encapsulate all relevant runtime information necessary for working with
8+
dtypes in the context of the Zarr V3 specification (e.g. is this a core dtype or not, how many bytes and what
9+
endianness is the dtype etc). By providing this abstraction, the module aims to:
10+
11+
- Simplify dtype management within zarr-python
12+
- Support runtime flexibility and custom extensions
13+
- Remove unnecessary dependencies on the numpy API
14+
15+
## Extensibility
16+
17+
The module attempts to support user-driven extensions, allowing developers to introduce custom dtypes
18+
without requiring immediate changes to zarr-python. Extensions can leverage the current entrypoint mechanism,
19+
enabling integration of experimental features. Over time, widely adopted extensions may be formalized through
20+
inclusion in zarr-python or standardized via a Zarr Enhancement Proposal (ZEP), but this is not essential.
21+
22+
## Examples
23+
24+
### Core `dtype` Registration
25+
26+
The following example demonstrates how to register a built-in `dtype` in the core codebase:
27+
28+
```python
29+
from zarr.core.dtype import ZarrDType
30+
from zarr.registry import register_v3dtype
31+
32+
class Float16(ZarrDType):
33+
zarr_spec_format = "3"
34+
experimental = False
35+
endianness = "little"
36+
byte_count = 2
37+
to_numpy = np.dtype('float16')
38+
39+
register_v3dtype(Float16)
40+
```
41+
42+
### Entrypoint Extension
43+
44+
The following example demonstrates how users can register a new `bfloat16` dtype for Zarr.
45+
This approach adheres to the existing Zarr entrypoint pattern as much as possible, ensuring
46+
consistency with other extensions. The code below would typically be part of a Python package
47+
that specifies the entrypoints for the extension:
48+
49+
```python
50+
import ml_dtypes
51+
from zarr.core.dtype import ZarrDType # User inherits from ZarrDType when creating their dtype
52+
53+
class Bfloat16(ZarrDType):
54+
zarr_spec_format = "3"
55+
experimental = True
56+
endianness = "little"
57+
byte_count = 2
58+
to_numpy = np.dtype('bfloat16') # Enabled by importing ml_dtypes
59+
configuration_v3 = {
60+
"version": "example_value",
61+
"author": "example_value",
62+
"ml_dtypes_version": "example_value"
63+
}
64+
```
65+
66+
### dtype lookup
67+
68+
The following examples demonstrate how to perform a lookup for the relevant ZarrDType, given
69+
a string that matches the dtype Zarr specification ID, or a numpy dtype object:
70+
71+
```python
72+
from zarr.registry import get_v3dtype_class, get_v3dtype_class_from_numpy
73+
74+
get_v3dtype_class('complex64') # returns little-endian Complex64 ZarrDType
75+
get_v3dtype_class('not_registered_dtype') # ValueError
76+
77+
get_v3dtype_class_from_numpy('>i2') # returns big-endian Int16 ZarrDType
78+
get_v3dtype_class_from_numpy(np.dtype('float32')) # returns little-endian Float32 ZarrDType
79+
get_v3dtype_class_from_numpy('i10') # ValueError
80+
```
81+
82+
### String dtypes
83+
84+
The following indicates one possibility for supporting variable-length strings. It is via the
85+
entrypoint mechanism as in a previous example. The Apache Arrow specification does not currently
86+
include a dtype for fixed-length strings (only for fixed-length bytes) and so I am using string
87+
here to implicitly refer to variable-length string data (there may be some subtleties with codecs
88+
that mean this needs to be refined further):
89+
90+
```python
91+
import numpy as np
92+
from zarr.core.dtype import ZarrDType # User inherits from ZarrDType when creating their dtype
93+
94+
try:
95+
to_numpy = np.dtypes.StringDType()
96+
except AttributeError:
97+
to_numpy = np.dtypes.ObjectDType()
98+
99+
class String(ZarrDType):
100+
zarr_spec_format = "3"
101+
experimental = True
102+
endianness = 'little'
103+
byte_count = None # None is defined to mean variable
104+
to_numpy = to_numpy
105+
```
106+
107+
### int4 dtype
108+
109+
There is currently considerable interest in the AI community in 'quantising' models - storing
110+
models at reduced precision, while minimising loss of information content. There are a number
111+
of sub-byte dtypes that the community are using e.g. int4. Unfortunately numpy does not
112+
currently have support for handling such sub-byte dtypes in an easy way. However, they can
113+
still be held in a numpy array and then passed (in a zero-copy way) to something like pytorch
114+
which can handle appropriately:
115+
116+
```python
117+
import numpy as np
118+
from zarr.core.dtype import ZarrDType # User inherits from ZarrDType when creating their dtype
119+
120+
class Int4(ZarrDType):
121+
zarr_spec_format = "3"
122+
experimental = True
123+
endianness = None  # must be None: single-byte types with an endianness are rejected by validation
124+
byte_count = 1 # this is ugly, but I could change this from byte_count to bit_count if there was consensus
125+
to_numpy = np.dtype('B') # could also be np.dtype('V1'), but this would prevent bit-twiddling
126+
configuration_v3 = {
127+
"version": "example_value",
128+
"author": "example_value",
129+
}
130+
```
131+
"""
132+
133+
from __future__ import annotations
134+
135+
from typing import Any, Literal
136+
137+
import numpy as np
138+
139+
140+
# perhaps over-complicating, but I don't want to allow the attributes to be patched
class FrozenClassVariables(type):
    """Metaclass that refuses to rebind a class attribute that already exists.

    Assigning to a name that is not yet visible on the class (directly or
    through inheritance) stores the value as usual. Assigning to a name that
    is already visible raises ``ValueError``.
    """

    def __setattr__(cls, attr, value):
        """Store ``value`` under ``attr`` unless ``attr`` is already set.

        Raises:
            ValueError: if ``hasattr(cls, attr)`` is already true.
        """
        if hasattr(cls, attr):
            raise ValueError(
                f"Attribute {attr} on ZarrDType class can not be changed once set."
            )
        # Without this call the assignment would be silently discarded,
        # because overriding __setattr__ replaces the default storing logic.
        super().__setattr__(attr, value)


class ZarrDType(metaclass=FrozenClassVariables):
    """Base class describing a dtype through class-level attributes.

    Subclasses must define ``zarr_spec_format``, ``experimental``,
    ``endianness``, ``byte_count`` and ``to_numpy`` in their class body.
    ``configuration_v3`` is optional and defaults to ``None``.
    ``_zarr_spec_identifier`` is derived automatically for every subclass.
    All checks run at class-definition time via ``__init_subclass__``.
    """

    zarr_spec_format: Literal["2", "3"]  # the version of the zarr spec used
    experimental: bool  # is this in the core spec or not
    endianness: Literal[
        "big", "little", None
    ]  # None indicates not defined i.e. single byte or byte strings
    byte_count: int | None  # None indicates variable count
    to_numpy: np.dtype[
        Any
    ]  # may involve installing a numpy extension e.g. ml_dtypes;

    configuration_v3: (
        dict | None
    )  # TODO: understand better how this is recommended by the spec

    _zarr_spec_identifier: str  # implementation detail used to map to core spec

    def __init_subclass__(  # enforces all required fields are set and basic sanity checks
        cls,
        **kwargs,
    ) -> None:
        """Check required attributes, fill in derived ones and validate.

        Raises:
            ValueError: if a required attribute is missing, or if
                ``_validate`` rejects the declared values.
        """
        required_attrs = [
            "zarr_spec_format",
            "experimental",
            "endianness",
            "byte_count",
            "to_numpy",
        ]
        for attr in required_attrs:
            if not hasattr(cls, attr):
                raise ValueError(f"{attr} is a required attribute for a Zarr dtype.")

        # Internal bookkeeping is written with type.__setattr__ so that it
        # bypasses the metaclass guard: a subclass of an existing dtype
        # inherits `_zarr_spec_identifier`, and the guard would otherwise
        # reject giving that subclass its own identifier.
        if not hasattr(cls, "configuration_v3"):
            type.__setattr__(cls, "configuration_v3", None)

        # how this dtype is identified in core spec; convention is prefix with big_ for big-endian
        identifier = cls.__qualname__.lower()
        if cls.endianness == "big":
            identifier = "big_" + identifier
        type.__setattr__(cls, "_zarr_spec_identifier", identifier)

        cls._validate()  # sanity check on basic requirements

        super().__init_subclass__(**kwargs)

    # TODO: add further checks
    @classmethod
    def _validate(cls):
        """Run basic consistency checks on the declared attributes.

        Raises:
            ValueError: if ``byte_count`` is set but not positive, or if a
                single-byte dtype declares an endianness.
        """
        if cls.byte_count is not None and cls.byte_count <= 0:
            raise ValueError("byte_count must be a positive integer.")

        if cls.byte_count == 1 and cls.endianness is not None:
            raise ValueError("Endianness must be None for single-byte types.")

0 commit comments

Comments
 (0)