|
1 | 1 | from __future__ import annotations |
2 | 2 |
|
3 | 3 | import pathlib |
| 4 | +from collections.abc import Iterable |
4 | 5 | from dataclasses import dataclass, field |
5 | 6 | from typing import TYPE_CHECKING |
6 | 7 |
|
|
10 | 11 | from hypothesis import HealthCheck, Verbosity, settings |
11 | 12 |
|
12 | 13 | from zarr import AsyncGroup, config |
| 14 | +from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec |
13 | 15 | from zarr.abc.store import Store |
| 16 | +from zarr.codecs.bytes import BytesCodec |
| 17 | +from zarr.codecs.sharding import ShardingCodec |
| 18 | +from zarr.core.chunk_grids import _guess_chunks |
| 19 | +from zarr.core.chunk_key_encodings import ChunkKeyEncoding |
| 20 | +from zarr.core.metadata.v2 import ArrayV2Metadata |
| 21 | +from zarr.core.metadata.v3 import ArrayV3Metadata |
14 | 22 | from zarr.core.sync import sync |
15 | 23 | from zarr.storage import FsspecStore, LocalStore, MemoryStore, StorePath, ZipStore |
16 | 24 |
|
@@ -159,3 +167,183 @@ def zarr_format(request: pytest.FixtureRequest) -> ZarrFormat: |
159 | 167 | suppress_health_check=[HealthCheck.filter_too_much, HealthCheck.too_slow], |
160 | 168 | verbosity=Verbosity.verbose, |
161 | 169 | ) |
| 170 | +import numcodecs |
| 171 | + |
| 172 | + |
def meta_from_array_v2(
    array: np.ndarray[Any, Any],
    chunks: ChunkCoords | Literal["auto"] = "auto",
    compressor: numcodecs.abc.Codec | Literal["auto"] | None = "auto",
    filters: Iterable[numcodecs.abc.Codec] | Literal["auto"] = "auto",
    fill_value: Any = "auto",
    order: MemoryOrder | Literal["auto"] = "auto",
    dimension_separator: Literal[".", "/", "auto"] = "auto",
    attributes: dict[str, Any] | None = None,
) -> ArrayV2Metadata:
    """
    Create a v2 metadata object from a numpy array.

    Every ``"auto"`` argument is resolved to a concrete value by the
    corresponding ``auto_*`` helper; explicit values pass through unchanged.
    Shape and dtype are always taken from ``array`` itself.
    """
    return ArrayV2Metadata(
        shape=array.shape,
        dtype=array.dtype,
        chunks=auto_chunks(chunks, array.shape, array.dtype),
        compressor=auto_compressor(compressor),
        filters=auto_filters(filters),
        fill_value=auto_fill_value(fill_value),
        order=auto_order(order),
        dimension_separator=auto_dimension_separator(dimension_separator),
        attributes=attributes,
    )
| 204 | + |
| 205 | + |
| 206 | +from typing import TypedDict |
| 207 | + |
| 208 | + |
class ChunkEncoding(TypedDict):
    """Codecs used to encode a chunk, grouped by codec role."""

    # BUG FIX: these were declared as tuple[X] (exactly one element);
    # a chunk may have zero or more filters / compressors, so the
    # variadic tuple[X, ...] form is the correct annotation.
    filters: tuple[ArrayArrayCodec, ...]
    compressors: tuple[BytesBytesCodec, ...]
    serializer: ArrayBytesCodec
| 213 | + |
| 214 | + |
class ChunkingSpec(TypedDict):
    """Chunk-layout parameters for a v3 array."""

    # Outer (write) chunk shape.
    shard_shape: tuple[int, ...]
    # Inner (read) chunk shape; None when sharding is not used.
    chunk_shape: tuple[int, ...] | None
    # How chunk coordinates are mapped to store keys.
    chunk_key_encoding: ChunkKeyEncoding
| 219 | + |
| 220 | + |
def meta_from_array_v3(
    array: np.ndarray[Any, Any],
    shard_shape: tuple[int, ...] | Literal["auto"] | None,
    chunk_shape: tuple[int, ...] | Literal["auto"],
    serializer: ArrayBytesCodec | Literal["auto"] = "auto",
    compressors: Iterable[BytesBytesCodec] | Literal["auto"] = "auto",
    filters: Iterable[ArrayArrayCodec] | Literal["auto"] = "auto",
    fill_value: Any = "auto",
    chunk_key_encoding: ChunkKeyEncoding | Literal["auto"] = "auto",
    dimension_names: Iterable[str] | None = None,
    attributes: dict[str, Any] | None = None,
) -> ArrayV3Metadata:
    """
    Create a v3 metadata object from a numpy array.

    ``"auto"`` arguments are resolved by the ``auto_*`` helpers. When
    ``auto_chunks_v3`` yields a read-chunk shape, the codec pipeline is
    wrapped in a sharding codec and the write chunks become the shard shape.
    """
    _write_chunks, _read_chunks = auto_chunks_v3(
        shard_shape=shard_shape, chunk_shape=chunk_shape, array_shape=array.shape, dtype=array.dtype
    )
    _codecs = auto_codecs(serializer=serializer, compressors=compressors, filters=filters)
    # A non-None read-chunk shape means sharding was requested: nest the
    # codec pipeline inside a sharding codec keyed on the read chunks.
    if _read_chunks is not None:
        _codecs = (ShardingCodec(codecs=_codecs, chunk_shape=_read_chunks),)

    _fill_value = auto_fill_value(fill_value)
    # NOTE(review): auto_chunk_key_encoding is not defined in this hunk —
    # assumed to exist elsewhere in the module; confirm.
    _chunk_key_encoding = auto_chunk_key_encoding(chunk_key_encoding)
    return ArrayV3Metadata(
        shape=array.shape,
        dtype=array.dtype,
        codecs=_codecs,
        chunk_key_encoding=_chunk_key_encoding,
        # BUG FIX: previously passed the raw ``fill_value``, so the "auto"
        # sentinel leaked into the metadata instead of the resolved value.
        fill_value=_fill_value,
        # BUG FIX: previously passed the raw ``shard_shape``, which may be
        # "auto" or None; the resolved write-chunk shape is what the regular
        # chunk grid must record.
        chunk_grid={"name": "regular", "config": {"chunk_shape": _write_chunks}},
        attributes=attributes,
        dimension_names=dimension_names,
    )
| 252 | + |
| 253 | + |
| 254 | +from zarr.abc.codec import Codec |
| 255 | +from zarr.codecs import ZstdCodec |
| 256 | + |
| 257 | + |
def auto_codecs(
    *,
    filters: Iterable[ArrayArrayCodec] | Literal["auto"] = "auto",
    compressors: Iterable[BytesBytesCodec] | Literal["auto"] = "auto",
    serializer: ArrayBytesCodec | Literal["auto"] = "auto",
) -> tuple[Codec, ...]:
    """
    Heuristically generate a tuple of codecs: filters first, then the
    serializer, then the compressors.

    ``"auto"`` resolves to no filters, a level-3 Zstd compressor, and the
    default bytes serializer respectively.
    """
    resolved_filters: tuple[ArrayArrayCodec, ...] = (
        () if filters == "auto" else tuple(filters)
    )
    resolved_compressors: tuple[BytesBytesCodec, ...] = (
        (ZstdCodec(level=3),) if compressors == "auto" else tuple(compressors)
    )
    resolved_serializer: ArrayBytesCodec = (
        BytesCodec() if serializer == "auto" else serializer
    )
    return (*resolved_filters, resolved_serializer, *resolved_compressors)
| 285 | + |
| 286 | + |
def auto_dimension_separator(dimension_separator: Literal[".", "/", "auto"]) -> Literal[".", "/"]:
    """Resolve the "auto" separator to "/"; explicit separators pass through."""
    return "/" if dimension_separator == "auto" else dimension_separator
| 291 | + |
| 292 | + |
def auto_order(order: MemoryOrder | Literal["auto"]) -> MemoryOrder:
    """Resolve the "auto" memory order to C order; explicit orders pass through."""
    return "C" if order == "auto" else order
| 297 | + |
| 298 | + |
def auto_fill_value(fill_value: Any) -> Any:
    """
    Resolve the "auto" fill-value sentinel to 0; any other value passes through.

    The parameter is ``Any``, so the sentinel test guards with ``isinstance``
    first: bare ``fill_value == "auto"`` on a non-string (e.g. a numpy array)
    can return a non-boolean or warn instead of cleanly evaluating False.
    """
    if isinstance(fill_value, str) and fill_value == "auto":
        return 0
    return fill_value
| 303 | + |
| 304 | + |
def auto_compressor(
    compressor: numcodecs.abc.Codec | Literal["auto"] | None,
) -> numcodecs.abc.Codec | None:
    """Resolve "auto" to a default level-3 Zstd codec; None and explicit codecs pass through."""
    if compressor != "auto":
        return compressor
    return numcodecs.Zstd(level=3)
| 311 | + |
| 312 | + |
def auto_filters(
    filters: Iterable[numcodecs.abc.Codec] | Literal["auto"],
) -> tuple[numcodecs.abc.Codec, ...]:
    """Resolve "auto" to no filters; otherwise materialize the iterable as a tuple."""
    return () if filters == "auto" else tuple(filters)
| 319 | + |
| 320 | + |
def auto_chunks(
    chunks: tuple[int, ...] | Literal["auto"], shape: tuple[int, ...], dtype: npt.DTypeLike
) -> tuple[int, ...]:
    """Resolve "auto" chunks by guessing from the array shape and dtype itemsize."""
    if chunks != "auto":
        return chunks
    return _guess_chunks(shape, np.dtype(dtype).itemsize)
| 327 | + |
| 328 | + |
def auto_chunks_v3(
    *,
    shard_shape: tuple[int, ...] | Literal["auto"],
    chunk_shape: tuple[int, ...] | Literal["auto"] | None,
    array_shape: tuple[int, ...],
    dtype: npt.DTypeLike,
) -> tuple[tuple[int, ...], tuple[int, ...] | None]:
    """
    Resolve a (write_chunks, read_chunks) pair from possibly-"auto" shapes.

    ``read_chunks`` is None whenever ``chunk_shape`` is None, signalling that
    no sharding is wanted; otherwise the two shapes mirror each other when
    only one of them is given explicitly.
    """
    if shard_shape == "auto":
        if chunk_shape == "auto":
            # stupid default but easy to think about
            return ((256,) * len(array_shape), (64,) * len(array_shape))
        if chunk_shape is None:
            return (_guess_chunks(array_shape, np.dtype(dtype).itemsize), None)
        return (chunk_shape, chunk_shape)
    if chunk_shape is None:
        return (shard_shape, None)
    if chunk_shape == "auto":
        return (shard_shape, shard_shape)
    return (shard_shape, chunk_shape)
0 commit comments