|  | 
|  | 1 | +""" | 
|  | 2 | +# Overview | 
|  | 3 | +
 | 
|  | 4 | +This module provides a proof-of-concept standalone interface for managing dtypes in the zarr-python codebase.  | 
|  | 5 | +
 | 
|  | 6 | +The `ZarrDType` class introduced in this module effectively acts as a replacement for `np.dtype` throughout the | 
|  | 7 | +zarr-python codebase. It attempts to encapsulate all relevant runtime information necessary for working with | 
|  | 8 | +dtypes in the context of the Zarr V3 specification (e.g. whether this is a core dtype, how many bytes it | 
|  | 9 | +occupies and what its endianness is). By providing this abstraction, the module aims to: | 
|  | 10 | +
 | 
|  | 11 | +- Simplify dtype management within zarr-python | 
|  | 12 | +- Support runtime flexibility and custom extensions | 
|  | 13 | +- Remove unnecessary dependencies on the numpy API | 
|  | 14 | +
 | 
|  | 15 | +## Extensibility | 
|  | 16 | +
 | 
|  | 17 | +The module attempts to support user-driven extensions, allowing developers to introduce custom dtypes | 
|  | 18 | +without requiring immediate changes to zarr-python. Extensions can leverage the current entrypoint mechanism,  | 
|  | 19 | +enabling integration of experimental features. Over time, widely adopted extensions may be formalized through | 
|  | 20 | +inclusion in zarr-python or standardized via a Zarr Enhancement Proposal (ZEP), but this is not essential. | 
|  | 21 | +
 | 
|  | 22 | +## Examples | 
|  | 23 | +
 | 
|  | 24 | +### Core `dtype` Registration | 
|  | 25 | +
 | 
|  | 26 | +The following example demonstrates how to register a built-in `dtype` in the core codebase: | 
|  | 27 | +
 | 
|  | 28 | +```python | 
|  | 29 | +from zarr.core.dtype import ZarrDType | 
|  | 30 | +from zarr.registry import register_v3dtype | 
|  | 31 | +
 | 
|  | 32 | +class Float16(ZarrDType): | 
|  | 33 | +    zarr_spec_format = "3" | 
|  | 34 | +    experimental = False | 
|  | 35 | +    endianness = "little" | 
|  | 36 | +    byte_count = 2 | 
|  | 37 | +    to_numpy = np.dtype('float16') | 
|  | 38 | +
 | 
|  | 39 | +register_v3dtype(Float16) | 
|  | 40 | +``` | 
|  | 41 | +
 | 
|  | 42 | +### Entrypoint Extension | 
|  | 43 | +
 | 
|  | 44 | +The following example demonstrates how users can register a new `bfloat16` dtype for Zarr. | 
|  | 45 | +This approach adheres to the existing Zarr entrypoint pattern as much as possible, ensuring | 
|  | 46 | +consistency with other extensions. The code below would typically be part of a Python package | 
|  | 47 | +that specifies the entrypoints for the extension: | 
|  | 48 | +
 | 
|  | 49 | +```python | 
|  | 50 | +import ml_dtypes | 
|  | 51 | +from zarr.core.dtype import ZarrDType  # User inherits from ZarrDType when creating their dtype | 
|  | 52 | +
 | 
|  | 53 | +class Bfloat16(ZarrDType): | 
|  | 54 | +    zarr_spec_format = "3" | 
|  | 55 | +    experimental = True | 
|  | 56 | +    endianness = "little" | 
|  | 57 | +    byte_count = 2 | 
|  | 58 | +    to_numpy = np.dtype('bfloat16')  # Enabled by importing ml_dtypes | 
|  | 59 | +    configuration_v3 = { | 
|  | 60 | +        "version": "example_value", | 
|  | 61 | +        "author": "example_value", | 
|  | 62 | +        "ml_dtypes_version": "example_value" | 
|  | 63 | +    } | 
|  | 64 | +``` | 
|  | 65 | +
 | 
|  | 66 | +### dtype lookup | 
|  | 67 | +
 | 
|  | 68 | +The following examples demonstrate how to perform a lookup for the relevant ZarrDType, given | 
|  | 69 | +a string that matches the dtype Zarr specification ID, or a numpy dtype object: | 
|  | 70 | +
 | 
|  | 71 | +```python | 
|  | 72 | +from zarr.registry import get_v3dtype_class, get_v3dtype_class_from_numpy | 
|  | 73 | +
 | 
|  | 74 | +get_v3dtype_class('complex64')  # returns little-endian Complex64 ZarrDType | 
|  | 75 | +get_v3dtype_class('not_registered_dtype')  # ValueError | 
|  | 76 | +
 | 
|  | 77 | +get_v3dtype_class_from_numpy('>i2')  # returns big-endian Int16 ZarrDType | 
|  | 78 | +get_v3dtype_class_from_numpy(np.dtype('float32'))  # returns little-endian Float32 ZarrDType | 
|  | 79 | +get_v3dtype_class_from_numpy('i10')  # ValueError | 
|  | 80 | +``` | 
|  | 81 | +
 | 
|  | 82 | +### String dtypes | 
|  | 83 | +
 | 
|  | 84 | +The following indicates one possibility for supporting variable-length strings. It is via the | 
|  | 85 | +entrypoint mechanism as in a previous example. The Apache Arrow specification does not currently | 
|  | 86 | +include a dtype for fixed-length strings (only for fixed-length bytes) and so I am using string | 
|  | 87 | +here to implicitly refer to variable-length string data (there may be some subtleties with codecs | 
|  | 88 | +that mean this needs to be refined further): | 
|  | 89 | +
 | 
|  | 90 | +```python | 
|  | 91 | +import numpy as np | 
|  | 92 | +from zarr.core.dtype import ZarrDType  # User inherits from ZarrDType when creating their dtype | 
|  | 93 | +
 | 
|  | 94 | +try: | 
|  | 95 | +    to_numpy = np.dtypes.StringDType() | 
|  | 96 | +except AttributeError: | 
|  | 97 | +    to_numpy = np.dtypes.ObjectDType() | 
|  | 98 | +
 | 
|  | 99 | +class String(ZarrDType): | 
|  | 100 | +    zarr_spec_format = "3" | 
|  | 101 | +    experimental = True | 
|  | 102 | +    endianness = 'little' | 
|  | 103 | +    byte_count = None  # None is defined to mean variable | 
|  | 104 | +    to_numpy = to_numpy | 
|  | 105 | +``` | 
|  | 106 | +
 | 
|  | 107 | +### int4 dtype | 
|  | 108 | +
 | 
|  | 109 | +There is currently considerable interest in the AI community in 'quantising' models - storing | 
|  | 110 | +models at reduced precision, while minimising loss of information content. There are a number | 
|  | 111 | +of sub-byte dtypes that the community are using e.g. int4. Unfortunately numpy does not | 
|  | 112 | +currently have support for handling such sub-byte dtypes in an easy way. However, they can | 
|  | 113 | +still be held in a numpy array and then passed (in a zero-copy way) to something like pytorch | 
|  | 114 | +which can handle them appropriately: | 
|  | 115 | +
 | 
|  | 116 | +```python | 
|  | 117 | +import numpy as np | 
|  | 118 | +from zarr.core.dtype import ZarrDType  # User inherits from ZarrDType when creating their dtype | 
|  | 119 | +
 | 
|  | 120 | +class Int4(ZarrDType): | 
|  | 121 | +    zarr_spec_format = "3" | 
|  | 122 | +    experimental = True | 
|  | 123 | +    endianness = None  # single-byte dtypes must leave endianness undefined | 
|  | 124 | +    byte_count = 1  # this is ugly, but I could change this from byte_count to bit_count if there was consensus  | 
|  | 125 | +    to_numpy = np.dtype('B')  # could also be np.dtype('V1'), but this would prevent bit-twiddling | 
|  | 126 | +    configuration_v3 = { | 
|  | 127 | +        "version": "example_value", | 
|  | 128 | +        "author": "example_value", | 
|  | 129 | +    } | 
|  | 130 | +``` | 
|  | 131 | +""" | 
|  | 132 | + | 
|  | 133 | +from __future__ import annotations | 
|  | 134 | + | 
|  | 135 | +from typing import Any, Literal | 
|  | 136 | + | 
|  | 137 | +import numpy as np | 
|  | 138 | + | 
|  | 139 | + | 
# perhaps over-complicating, but I don't want to allow the attributes to be patched
class FrozenClassVariables(type):
    """Metaclass that makes class attributes write-once.

    An attribute can be assigned on a class as long as ``hasattr`` does not
    already find it (on the class itself or on anything it inherits from).
    Any assignment to an attribute that is already visible raises
    ``ValueError``.
    """

    def __setattr__(cls, attr: str, value: Any) -> None:
        """Store ``attr`` on ``cls`` the first time; refuse to overwrite it.

        Parameters
        ----------
        attr
            Name of the class attribute being assigned.
        value
            Value to store.

        Raises
        ------
        ValueError
            If ``attr`` is already visible on ``cls``.
        """
        if hasattr(cls, attr):
            raise ValueError(
                f"Attribute {attr} on ZarrDType class can not be changed once set."
            )
        # First assignment: delegate to ``type`` so the value is really stored.
        # Returning without delegating would silently discard the assignment.
        super().__setattr__(attr, value)
|  | 147 | + | 
|  | 148 | + | 
class ZarrDType(metaclass=FrozenClassVariables):
    """Base class for Zarr dtype definitions.

    Subclasses describe a dtype purely through class attributes. When a
    subclass is created, ``__init_subclass__`` checks that the required
    attributes are present, defaults ``configuration_v3`` to ``None``, derives
    ``_zarr_spec_identifier`` from the class name and runs ``_validate``.
    """

    zarr_spec_format: Literal["2", "3"]  # the version of the zarr spec used
    experimental: bool  # is this in the core spec or not
    # None indicates not defined i.e. single byte or byte strings
    endianness: Literal["big", "little", None]
    byte_count: int | None  # None indicates variable count
    # may involve installing a numpy extension e.g. ml_dtypes
    to_numpy: np.dtype[Any]

    # TODO: understand better how this is recommended by the spec
    configuration_v3: dict | None

    _zarr_spec_identifier: str  # implementation detail used to map to core spec

    def __init_subclass__(cls, **kwargs) -> None:
        """Enforce that all required fields are set and run basic sanity checks.

        Raises
        ------
        ValueError
            If a required attribute is missing, or if ``_validate`` rejects
            the declared values.
        """
        required_attrs = (
            "zarr_spec_format",
            "experimental",
            "endianness",
            "byte_count",
            "to_numpy",
        )
        for attr in required_attrs:
            if not hasattr(cls, attr):
                raise ValueError(f"{attr} is a required attribute for a Zarr dtype.")

        # The bookkeeping attributes below are written with ``type.__setattr__``
        # rather than ``cls.x = ...``: plain assignment is routed through the
        # freezing metaclass, which must not veto (or drop) these internal
        # writes, e.g. when a subclass inherits ``_zarr_spec_identifier`` from
        # an intermediate dtype class.
        if not hasattr(cls, "configuration_v3"):
            type.__setattr__(cls, "configuration_v3", None)

        # How this dtype is identified in the core spec; convention is to prefix
        # with big_ for big-endian. ``__name__`` is used instead of
        # ``__qualname__`` so nested or locally defined classes do not leak
        # their enclosing scope (e.g. "outer.<locals>.") into the identifier.
        base_name = cls.__name__.lower()
        identifier = f"big_{base_name}" if cls.endianness == "big" else base_name
        type.__setattr__(cls, "_zarr_spec_identifier", identifier)

        cls._validate()  # sanity check on basic requirements

        super().__init_subclass__(**kwargs)

    # TODO: add further checks
    @classmethod
    def _validate(cls) -> None:
        """Check the declared ``byte_count`` / ``endianness`` combination.

        Raises
        ------
        ValueError
            If ``byte_count`` is set but not positive, or if a single-byte
            dtype declares an endianness.
        """
        if cls.byte_count is not None and cls.byte_count <= 0:
            raise ValueError("byte_count must be a positive integer.")

        if cls.byte_count == 1 and cls.endianness is not None:
            raise ValueError("Endianness must be None for single-byte types.")
0 commit comments