diff --git a/src/zarr/__init__.py b/src/zarr/__init__.py index 51116a929e..bcbdaf7c19 100644 --- a/src/zarr/__init__.py +++ b/src/zarr/__init__.py @@ -6,6 +6,8 @@ copy_all, copy_store, create, + create_array, + create_group, empty, empty_like, full, @@ -46,6 +48,8 @@ "copy_all", "copy_store", "create", + "create_array", + "create_group", "empty", "empty_like", "full", diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 14078944d7..c8125a9641 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -18,14 +18,14 @@ ChunkCoords, MemoryOrder, ZarrFormat, + _default_zarr_version, _warn_order_kwarg, _warn_write_empty_chunks_kwarg, parse_dtype, ) -from zarr.core.config import config from zarr.core.group import AsyncGroup, ConsolidatedMetadata, GroupMetadata from zarr.core.metadata import ArrayMetadataDict, ArrayV2Metadata, ArrayV3Metadata -from zarr.core.metadata.v2 import _default_filters_and_compressor +from zarr.core.metadata.v2 import _default_compressor, _default_filters from zarr.errors import NodeTypeValidationError from zarr.storage import ( StoreLike, @@ -150,11 +150,6 @@ def _handle_zarr_version_or_format( return zarr_format -def _default_zarr_version() -> ZarrFormat: - """Return the default zarr_version""" - return cast(ZarrFormat, int(config.get("default_zarr_version", 3))) - - async def consolidate_metadata( store: StoreLike, path: str | None = None, @@ -300,8 +295,8 @@ async def open( path : str or None, optional The path within the store to open. storage_options : dict - If using an fsspec URL to create the store, these will be passed to - the backend implementation. Ignored otherwise. + If the store is backed by an fsspec-based implementation, then this dict will be passed to + the Store constructor for that implementation. Ignored otherwise. **kwargs Additional parameters are passed through to :func:`zarr.creation.open_array` or :func:`zarr.hierarchy.open_group`. @@ -666,6 +661,54 @@ async def group( ) +async def create_group( + *, + store: StoreLike, + path: str | None = None, + overwrite: bool = False, + zarr_format: ZarrFormat | None = None, + attributes: dict[str, Any] | None = None, + storage_options: dict[str, Any] | None = None, +) -> AsyncGroup: + """Create a group. + + Parameters + ---------- + store : Store or str + Store or path to directory in file system. + path : str, optional + Group path within store. + overwrite : bool, optional + If True, pre-existing data at ``path`` will be deleted before + creating the group. + zarr_format : {2, 3, None}, optional + The zarr format to use when saving. + storage_options : dict + If using an fsspec URL to create the store, these will be passed to + the backend implementation. Ignored otherwise. + + Returns + ------- + g : group + The new group. + """ + + if zarr_format is None: + zarr_format = _default_zarr_version() + + # TODO: fix this when modes make sense. It should be `w` for overwriting, `w-` otherwise + mode: Literal["a"] = "a" + + store_path = await make_store_path(store, path=path, mode=mode, storage_options=storage_options) + + return await AsyncGroup.from_store( + store=store_path, + zarr_format=zarr_format, + overwrite=overwrite, + attributes=attributes, + ) + + async def open_group( store: StoreLike | None = None, *, # Note: this is a change from v2 @@ -843,8 +886,8 @@ async def create( If no codecs are provided, default codecs will be used: - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``. 
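The new ``create_group`` coroutine defined above can be exercised as follows. This is a minimal sketch, assuming a build of zarr that includes this branch; store construction details may differ between revisions::

    import asyncio

    from zarr.api.asynchronous import create_group
    from zarr.storage.memory import MemoryStore

    async def main() -> None:
        store = MemoryStore()
        # creates a v3 group (the default format) at the sub-path "foo"
        group = await create_group(store=store, path="foo", attributes={"units": "m"})
        print(group)

    asyncio.run(main())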
-          - For Unicode strings, the default is ``VLenUTF8Codec``.
-          - For bytes or objects, the default is ``VLenBytesCodec``.
+          - For Unicode strings, the default is ``VLenUTF8Codec`` and ``ZstdCodec``.
+          - For bytes or objects, the default is ``VLenBytesCodec`` and ``ZstdCodec``.
 
         These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`.
     compressor : Codec, optional
@@ -857,7 +900,8 @@ async def create(
         - For Unicode strings, the default is ``VLenUTF8Codec``.
         - For bytes or objects, the default is ``VLenBytesCodec``.
 
-        These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`. fill_value : object
+        These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`.
+    fill_value : object
         Default value to use for uninitialized portions of the array.
     order : {'C', 'F'}, optional
         Deprecated in favor of the ``config`` keyword argument.
@@ -878,8 +922,8 @@ async def create(
         for storage of both chunks and metadata.
     filters : sequence of Codecs, optional
         Sequence of filters to use to encode chunk data prior to compression.
-        V2 only. If neither ``compressor`` nor ``filters`` are provided, a default
-        compressor will be used. (see ``compressor`` for details).
+        V2 only. If no ``filters`` are provided, a default set of filters will be used.
+        These defaults can be changed by modifying the value of ``array.v2_default_filters`` in :mod:`zarr.core.config`.
     cache_metadata : bool, optional
         If True, array configuration metadata will be cached for the
         lifetime of the object. If False, array metadata will be reloaded
@@ -932,8 +976,10 @@ async def create(
         if chunks is None:
             chunks = shape
         dtype = parse_dtype(dtype, zarr_format)
-        if not filters and not compressor:
-            filters, compressor = _default_filters_and_compressor(dtype)
+        if not filters:
+            filters = _default_filters(dtype)
+        if not compressor:
+            compressor = _default_compressor(dtype)
     elif zarr_format == 3 and chunk_shape is None:  # type: ignore[redundant-expr]
         if chunks is not None:
             chunk_shape = chunks
@@ -1015,6 +1061,41 @@ async def create(
     )
 
 
+async def read_array(
+    store: StoreLike,
+    *,
+    path: str | None = None,
+    zarr_format: ZarrFormat | None = None,
+    storage_options: dict[str, Any] | None = None,
+) -> AsyncArray[ArrayV3Metadata] | AsyncArray[ArrayV2Metadata]:
+    """Open an existing array for reading. This wraps :meth:`zarr.core.array.AsyncArray.open`;
+    see the documentation of that method for details.
+
+    Parameters
+    ----------
+    store : Store or str
+        Store or path to directory in file system or name of zip file.
+    path : str, optional
+        Path under which the array is stored.
+    zarr_format : {2, 3, None}, optional
+        The zarr format to require. The default value of ``None`` will first look for Zarr v3 data,
+        then Zarr v2 data, then fail if neither format is found.
+    storage_options : dict
+        If using an fsspec URL to create the store, these will be passed to
+        the backend implementation. Ignored otherwise.
+
+    Returns
+    -------
+    z : array
+        The array.
+ """ + store_path = await make_store_path(store, path=path, mode="r", storage_options=storage_options) + return await AsyncArray.open( + store=store_path, + zarr_format=zarr_format, + ) + + async def empty( shape: ChunkCoords, **kwargs: Any ) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index cd1ef8b38d..f15513715a 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -5,6 +5,7 @@ from typing_extensions import deprecated import zarr.api.asynchronous as async_api +import zarr.core.array from zarr._compat import _deprecate_positional_args from zarr.core.array import Array, AsyncArray from zarr.core.group import Group @@ -523,6 +524,29 @@ def open_group( ) +def create_group( + store: StoreLike, + *, + path: str | None = None, + zarr_format: ZarrFormat | None = None, + overwrite: bool = False, + attributes: dict[str, Any] | None = None, + storage_options: dict[str, Any] | None = None, +) -> Group: + return Group( + sync( + async_api.create_group( + store=store, + path=path, + overwrite=overwrite, + storage_options=storage_options, + zarr_format=zarr_format, + attributes=attributes, + ) + ) + ) + + # TODO: add type annotations for kwargs def create( shape: ChunkCoords | int, @@ -675,6 +699,10 @@ def create( ) +def create_array(*args: Any, **kwargs: Any) -> Array: + return Array(sync(zarr.core.array.create_array(*args, **kwargs))) + + # TODO: add type annotations for kwargs def empty(shape: ChunkCoords, **kwargs: Any) -> Array: """Create an empty array. diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 717eff36dc..429fa4f748 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -1,21 +1,25 @@ from __future__ import annotations import json +import warnings from asyncio import gather +from collections.abc import Iterable, Mapping from dataclasses import dataclass, field from itertools import starmap from logging import getLogger -from typing import TYPE_CHECKING, Any, Generic, Literal, cast, overload +from typing import TYPE_CHECKING, Any, Generic, Literal, TypeAlias, cast, overload from warnings import warn +import numcodecs import numpy as np import numpy.typing as npt from zarr._compat import _deprecate_positional_args +from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec from zarr.abc.store import Store, set_or_delete from zarr.codecs._v2 import V2Codec from zarr.core._info import ArrayInfo -from zarr.core.array_spec import ArrayConfig, ArrayConfigParams, normalize_array_config +from zarr.core.array_spec import ArrayConfig, ArrayConfigParams, parse_array_config from zarr.core.attributes import Attributes from zarr.core.buffer import ( BufferPrototype, @@ -23,9 +27,10 @@ NDBuffer, default_buffer_prototype, ) -from zarr.core.chunk_grids import RegularChunkGrid, normalize_chunks +from zarr.core.chunk_grids import RegularChunkGrid, _auto_partition, normalize_chunks from zarr.core.chunk_key_encodings import ( ChunkKeyEncoding, + ChunkKeyEncodingParams, DefaultChunkKeyEncoding, V2ChunkKeyEncoding, ) @@ -38,6 +43,7 @@ MemoryOrder, ShapeLike, ZarrFormat, + _default_zarr_version, _warn_order_kwarg, concurrent_map, parse_dtype, @@ -80,19 +86,22 @@ ArrayV3MetadataDict, T_ArrayMetadata, ) -from zarr.core.metadata.v2 import _default_filters_and_compressor +from zarr.core.metadata.v2 import ( + _default_compressor, + _default_filters, +) from zarr.core.metadata.v3 import DataType, parse_node_type_array from zarr.core.sync import sync 
from zarr.errors import MetadataValidationError -from zarr.registry import get_pipeline_class +from zarr.registry import get_codec_class, get_pipeline_class from zarr.storage import StoreLike, make_store_path from zarr.storage.common import StorePath, ensure_no_existing_node if TYPE_CHECKING: - from collections.abc import Iterable, Iterator, Sequence + from collections.abc import Iterator, Sequence from typing import Self - from zarr.abc.codec import Codec, CodecPipeline + from zarr.abc.codec import CodecPipeline from zarr.core.group import AsyncGroup # Array and AsyncArray are defined in the base ``zarr`` namespace @@ -149,9 +158,9 @@ async def get_array_metadata( (store_path / ZATTRS_JSON).get(), ) if zarr_json_bytes is not None and zarray_bytes is not None: - # TODO: revisit this exception type - # alternatively, we could warn and favor v3 - raise ValueError("Both zarr.json and .zarray objects exist") + # warn and favor v3 + msg = f"Both zarr.json (zarr v3) and .zarray (zarr v2) metadata objects exist at {store_path}." + warnings.warn(msg, stacklevel=1) if zarr_json_bytes is None and zarray_bytes is None: raise FileNotFoundError(store_path) # set zarr_format based on which keys were found @@ -431,8 +440,8 @@ async def create( If no codecs are provided, default codecs will be used: - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``. - - For Unicode strings, the default is ``VLenUTF8Codec``. - - For bytes or objects, the default is ``VLenBytesCodec``. + - For Unicode strings, the default is ``VLenUTF8Codec`` and ``ZstdCodec``. + - For bytes or objects, the default is ``VLenBytesCodec`` and ``ZstdCodec``. These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. dimension_names : Iterable[str], optional @@ -453,14 +462,14 @@ async def create( order for Zarr 3 arrays is via the ``config`` parameter, e.g. ``{'config': 'C'}``. filters : list[dict[str, JSON]], optional Sequence of filters to use to encode chunk data prior to compression. - V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor`` - nor ``filters`` are provided, a default compressor will be used. (see - ``compressor`` for details) + V2 only. V3 arrays should use ``codecs`` instead. If no ``filters`` + are provided, a default set of filters will be used. + These defaults can be changed by modifying the value of ``array.v2_default_filters`` in :mod:`zarr.core.config`. compressor : dict[str, JSON], optional The compressor used to compress the data (default is None). V2 only. V3 arrays should use ``codecs`` instead. - If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used: + If no ``compressor`` is provided, a default compressor will be used: - For numeric arrays, the default is ``ZstdCodec``. - For Unicode strings, the default is ``VLenUTF8Codec``. @@ -502,7 +511,7 @@ async def create( _chunks = normalize_chunks(chunks, shape, dtype_parsed.itemsize) else: _chunks = normalize_chunks(chunk_shape, shape, dtype_parsed.itemsize) - config_parsed = normalize_array_config(config) + config_parsed = parse_array_config(config) result: AsyncArray[ArrayV3Metadata] | AsyncArray[ArrayV2Metadata] if zarr_format == 3: @@ -670,8 +679,10 @@ async def _create_v2( dimension_separator = "." 
dtype = parse_dtype(dtype, zarr_format=2) - if not filters and not compressor: - filters, compressor = _default_filters_and_compressor(dtype) + if not filters: + filters = _default_filters(dtype) + if not compressor: + compressor = _default_compressor(dtype) if np.issubdtype(dtype, np.str_): filters = filters or [] if not any(x["id"] == "vlen-utf8" for x in filters): @@ -1565,8 +1576,8 @@ def create( If no codecs are provided, default codecs will be used: - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``. - - For Unicode strings, the default is ``VLenUTF8Codec``. - - For bytes or objects, the default is ``VLenBytesCodec``. + - For Unicode strings, the default is ``VLenUTF8Codec`` and ``ZstdCodec``. + - For bytes or objects, the default is ``VLenBytesCodec`` and ``ZstdCodec``. These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. dimension_names : Iterable[str], optional @@ -1587,14 +1598,14 @@ def create( order for Zarr 3 arrays is via the ``config`` parameter, e.g. ``{'order': 'C'}``. filters : list[dict[str, JSON]], optional Sequence of filters to use to encode chunk data prior to compression. - V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor`` - nor ``filters`` are provided, a default compressor will be used. (see - ``compressor`` for details) + V2 only. V3 arrays should use ``codecs`` instead. If no ``filters`` + are provided, a default set of filters will be used. + These defaults can be changed by modifying the value of ``array.v2_default_filters`` in :mod:`zarr.core.config`. compressor : dict[str, JSON], optional Primary compressor to compress chunk data. V2 only. V3 arrays should use ``codecs`` instead. - If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used: + If no ``compressor`` is provided, a default compressor will be used: - For numeric arrays, the default is ``ZstdCodec``. - For Unicode strings, the default is ``VLenUTF8Codec``. @@ -3448,4 +3459,321 @@ def _get_default_codecs( else: dtype_key = "numeric" - return [{"name": codec_id, "configuration": {}} for codec_id in default_codecs[dtype_key]] + return default_codecs[dtype_key] + + +FiltersParam: TypeAlias = ( + Iterable[dict[str, JSON] | Codec] | Iterable[numcodecs.abc.Codec] | Literal["auto"] +) +CompressionParam: TypeAlias = ( + Iterable[dict[str, JSON] | Codec] | Codec | numcodecs.abc.Codec | Literal["auto"] +) + + +async def create_array( + store: str | StoreLike, + *, + name: str | None = None, + shape: ShapeLike, + dtype: npt.DTypeLike, + chunk_shape: ChunkCoords | Literal["auto"] = "auto", + shard_shape: ChunkCoords | None = None, + filters: FiltersParam = "auto", + compression: CompressionParam = "auto", + fill_value: Any | None = 0, + order: MemoryOrder | None = "C", + zarr_format: ZarrFormat | None = 3, + attributes: dict[str, JSON] | None = None, + chunk_key_encoding: ChunkKeyEncoding | ChunkKeyEncodingParams | None = None, + dimension_names: Iterable[str] | None = None, + storage_options: dict[str, Any] | None = None, + overwrite: bool = False, + config: ArrayConfig | ArrayConfigParams | None = None, + data: npt.ArrayLike | None = None, +) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: + """Create an array. + + Parameters + ---------- + store : str or Store + Store or path to directory in file system or name of zip file. + name : str or None, optional + The name of the array within the store. 
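With ``_default_filters_and_compressor`` split into ``_default_filters`` and ``_default_compressor``, the v2 defaults resolve per dtype kind. A sketch of the expected lookups, given the config defaults added in ``src/zarr/core/config.py`` later in this diff::

    import numpy as np

    from zarr.core.metadata.v2 import _default_compressor, _default_filters

    # numeric kinds ("biufcmM"): zstd compressor, no filters
    assert _default_compressor(np.dtype("float64")) == {"id": "zstd", "level": 0, "checksum": True}
    assert _default_filters(np.dtype("float64")) == []

    # unicode ("U"): vlen-utf8 is now a filter rather than the compressor
    assert _default_filters(np.dtype("U4")) == [{"id": "vlen-utf8"}]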
+        If ``name`` is ``None``, the array will be located
+        at the root of the store.
+    shape : ChunkCoords
+        Shape of the array.
+    dtype : npt.DTypeLike
+        Data type of the array.
+    chunk_shape : ChunkCoords
+        Chunk shape of the array.
+    shard_shape : ChunkCoords, optional
+        Shard shape of the array. The default value of ``None`` results in no sharding at all.
+    filters : Iterable[Codec], optional
+        List of filters to apply to the array.
+    compression : Iterable[Codec], optional
+        List of compressors to apply to the array.
+    fill_value : Any, optional
+        Fill value for the array.
+    order : {"C", "F"}, optional
+        Memory layout of the array.
+    zarr_format : {2, 3}, optional
+        The zarr format to use when saving.
+    attributes : dict, optional
+        Attributes for the array.
+    chunk_key_encoding : ChunkKeyEncoding, optional
+        The chunk key encoding to use.
+    dimension_names : Iterable[str], optional
+        Dimension names for the array.
+    storage_options : dict, optional
+        If using an fsspec URL to create the store, these will be passed to the backend implementation.
+        Ignored otherwise.
+    overwrite : bool, default False
+        Whether to overwrite an array with the same name in the store, if one exists.
+    config : ArrayConfig or ArrayConfigParams, optional
+        Runtime configuration for the array.
+    data : np.ndarray, optional
+        Initial data for the array.
+
+    Returns
+    -------
+    z : array
+        The array.
+    """
+
+    if zarr_format is None:
+        zarr_format = _default_zarr_version()
+
+    # TODO: figure out why putting these imports at top-level causes circular imports
+    from zarr.codecs.sharding import ShardingCodec
+
+    # TODO: fix this when modes make sense. It should be `w` for overwriting, `w-` otherwise
+    mode: Literal["a"] = "a"
+    dtype_parsed = parse_dtype(dtype, zarr_format=zarr_format)
+    config_parsed = parse_array_config(config)
+    shape_parsed = parse_shapelike(shape)
+    chunk_key_encoding_parsed = _parse_chunk_key_encoding(
+        chunk_key_encoding, zarr_format=zarr_format
+    )
+    store_path = await make_store_path(store, path=name, mode=mode, storage_options=storage_options)
+    shard_shape_parsed, chunk_shape_parsed = _auto_partition(
+        shape_parsed, shard_shape, chunk_shape, dtype_parsed
+    )
+    result: AsyncArray[ArrayV3Metadata] | AsyncArray[ArrayV2Metadata]
+
+    if zarr_format == 2:
+        if shard_shape_parsed is not None:
+            msg = (
+                "Zarr v2 arrays can only be created with `shard_shape` set to `None`. "
+                f"Got `shard_shape={shard_shape}` instead."
+            )
+            raise ValueError(msg)
+        if filters != "auto" and not all(isinstance(f, numcodecs.abc.Codec) for f in filters):
+            raise TypeError(
+                "For Zarr v2 arrays, all elements of `filters` must be numcodecs codecs."
+            )
+        filters = cast(Iterable[numcodecs.abc.Codec] | Literal["auto"], filters)
+        filters_parsed, compressor_parsed = _parse_chunk_encoding_v2(
+            compression=compression, filters=filters, dtype=dtype_parsed
+        )
+        if dimension_names is not None:
+            raise ValueError("Zarr v2 arrays do not support dimension names.")
+        if order is None:
+            order_parsed = zarr_config.get("array.order")
+        else:
+            order_parsed = order
+
+        result = await AsyncArray._create_v2(
+            store_path=store_path,
+            shape=shape_parsed,
+            dtype=dtype_parsed,
+            chunks=chunk_shape_parsed,
+            dimension_separator=chunk_key_encoding_parsed.separator,
+            fill_value=fill_value,
+            order=order_parsed,
+            filters=filters_parsed,
+            compressor=compressor_parsed,
+            attributes=attributes,
+            overwrite=overwrite,
+            config=config_parsed,
+        )
+    else:
+        array_array, array_bytes, bytes_bytes = _parse_chunk_encoding_v3(
+            compression=compression, filters=filters, dtype=dtype_parsed
+        )
+        sub_codecs = (*array_array, array_bytes, *bytes_bytes)
+        codecs_out: tuple[Codec, ...]
+        if shard_shape_parsed is not None:
+            sharding_codec = ShardingCodec(chunk_shape=chunk_shape_parsed, codecs=sub_codecs)
+            sharding_codec.validate(
+                shape=chunk_shape_parsed,
+                dtype=dtype_parsed,
+                chunk_grid=RegularChunkGrid(chunk_shape=shard_shape_parsed),
+            )
+            codecs_out = (sharding_codec,)
+            chunks_out = shard_shape_parsed
+        else:
+            chunks_out = chunk_shape_parsed
+            codecs_out = sub_codecs
+
+        result = await AsyncArray._create_v3(
+            store_path=store_path,
+            shape=shape_parsed,
+            dtype=dtype_parsed,
+            fill_value=fill_value,
+            attributes=attributes,
+            chunk_shape=chunks_out,
+            chunk_key_encoding=chunk_key_encoding_parsed,
+            codecs=codecs_out,
+            dimension_names=dimension_names,
+            overwrite=overwrite,
+            config=config_parsed,
+        )
+
+    if data is not None:
+        await result.setitem(
+            selection=slice(None), value=data, prototype=default_buffer_prototype()
+        )
+    return result
+
+
+def _parse_chunk_key_encoding(
+    data: ChunkKeyEncoding | ChunkKeyEncodingParams | None, zarr_format: ZarrFormat
+) -> ChunkKeyEncoding:
+    """
+    Take an implicit specification of a chunk key encoding and parse it into a ChunkKeyEncoding object.
+    """
+    if data is None:
+        if zarr_format == 2:
+            result = ChunkKeyEncoding.from_dict({"name": "v2", "separator": "/"})
+        else:
+            result = ChunkKeyEncoding.from_dict({"name": "default", "separator": "/"})
+    elif isinstance(data, ChunkKeyEncoding):
+        result = data
+    else:
+        result = ChunkKeyEncoding.from_dict(data)
+    if zarr_format == 2 and result.name != "v2":
+        msg = (
+            "Invalid chunk key encoding. For Zarr v2 arrays, the `name` field of the "
+            f"chunk key encoding must be 'v2'. Got `name` = {result.name} instead."
+        )
+        raise ValueError(msg)
+    return result
+
+
+def _get_default_encoding_v3(
+    np_dtype: np.dtype[Any],
+) -> tuple[tuple[ArrayArrayCodec, ...], ArrayBytesCodec, tuple[BytesBytesCodec, ...]]:
+    """
+    Get the default ArrayArrayCodecs, ArrayBytesCodec, and BytesBytesCodec for a given dtype.
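The Zarr v2 branch above accepts plain numcodecs codecs for ``filters`` and ``compression``; ``test_create_array_v2`` at the bottom of this diff exercises it roughly as follows (``store`` stands for any writable store instance)::

    import zarr
    from numcodecs import Delta, Zstd

    arr = zarr.create_array(
        store=store,
        dtype="uint8",
        shape=(10,),
        chunk_shape=(4,),
        shard_shape=None,  # sharding is a v3-only feature and must be None for v2
        zarr_format=2,
        filters=(Delta(dtype="uint8"),),
        compression=Zstd(level=3),
    )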
+ """ + default_codecs = zarr_config.get("array.v3_default_codecs") + dtype = DataType.from_numpy(np_dtype) + if dtype == DataType.string: + dtype_key = "string" + elif dtype == DataType.bytes: + dtype_key = "bytes" + else: + dtype_key = "numeric" + + codec_dicts = default_codecs[dtype_key] + codecs = tuple(get_codec_class(c["name"]).from_dict(c) for c in codec_dicts) + array_bytes_maybe = None + array_array: list[ArrayArrayCodec] = [] + bytes_bytes: list[BytesBytesCodec] = [] + + for codec in codecs: + if isinstance(codec, ArrayBytesCodec): + if array_bytes_maybe is not None: + raise ValueError( + f"Got two instances of ArrayBytesCodec: {array_bytes_maybe} and {codec}. " + "Only one array-to-bytes codec is allowed." + ) + array_bytes_maybe = codec + elif isinstance(codec, ArrayArrayCodec): + array_array.append(codec) + elif isinstance(codec, BytesBytesCodec): + bytes_bytes.append(codec) + else: + raise TypeError(f"Unexpected codec type: {type(codec)}") + + if array_bytes_maybe is None: + raise ValueError("Required ArrayBytesCodec was not found.") + + return tuple(array_array), array_bytes_maybe, tuple(bytes_bytes) + + +def _get_default_chunk_encoding_v2( + dtype: np.dtype[Any], +) -> tuple[tuple[numcodecs.abc.Codec, ...], numcodecs.abc.Codec | None]: + """ + Get the default chunk encoding for zarr v2 arrays, given a dtype + """ + if dtype.kind in "biufcmM": + dtype_key = "numeric" + elif dtype.kind in "U": + dtype_key = "string" + elif dtype.kind in "OSV": + dtype_key = "bytes" + else: + raise ValueError(f"Unsupported dtype kind {dtype.kind}") + + compressor_dict = zarr_config.get("array.v2_default_compressor").get(dtype_key, None) + filter_dicts = zarr_config.get("array.v2_default_filters").get(dtype_key, []) + + compressor = None + if compressor_dict is not None: + compressor = numcodecs.get_codec(compressor_dict) + filters = tuple(numcodecs.get_codec(f) for f in filter_dicts) + return filters, compressor + + +def _parse_chunk_encoding_v2( + *, + compression: numcodecs.abc.Codec | Literal["auto"], + filters: tuple[numcodecs.abc.Codec, ...] | Literal["auto"], + dtype: np.dtype[Any], +) -> tuple[tuple[numcodecs.abc.Codec, ...], numcodecs.abc.Codec]: + """ + Generate chunk encoding classes for v2 arrays with optional defaults. + """ + default_filters, default_compressor = _get_default_chunk_encoding_v2(dtype) + _filters: tuple[numcodecs.abc.Codec, ...] = () + if compression == "auto": + _compressor = default_compressor + else: + _compressor = compression + if filters == "auto": + _filters = default_filters + else: + _filters = filters + return _filters, _compressor + + +def _parse_chunk_encoding_v3( + *, + compression: Iterable[BytesBytesCodec] | Literal["auto"], + filters: Iterable[ArrayArrayCodec] | Literal["auto"], + dtype: np.dtype[Any], +) -> tuple[tuple[ArrayArrayCodec, ...], ArrayBytesCodec, tuple[BytesBytesCodec, ...]]: + """ + Generate chunk encoding classes for v3 arrays with optional defaults. 
+ """ + default_array_array, default_array_bytes, default_bytes_bytes = _get_default_encoding_v3(dtype) + + if compression == "auto": + out_bytes_bytes = default_bytes_bytes + else: + if isinstance(compression, Mapping | Codec): + out_bytes_bytes = (compression,) + else: + out_bytes_bytes = tuple(compression) + if filters == "auto": + out_array_array = default_array_array + else: + if isinstance(filters, Mapping | Codec): + out_array_array = (filters,) + else: + out_array_array = tuple(filters) + + return out_array_array, default_array_bytes, out_bytes_bytes diff --git a/src/zarr/core/array_spec.py b/src/zarr/core/array_spec.py index ee6934d05f..ed5adf5526 100644 --- a/src/zarr/core/array_spec.py +++ b/src/zarr/core/array_spec.py @@ -72,7 +72,7 @@ def from_dict(cls, data: ArrayConfigParams) -> Self: return cls(**kwargs_out) -def normalize_array_config(data: ArrayConfig | ArrayConfigParams | None) -> ArrayConfig: +def parse_array_config(data: ArrayConfig | ArrayConfigParams | None) -> ArrayConfig: """ Convert various types of data to an ArrayConfig. """ diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index ea050e39ef..394d6807d3 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -7,7 +7,7 @@ from abc import abstractmethod from dataclasses import dataclass from functools import reduce -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Literal import numpy as np @@ -194,3 +194,48 @@ def get_nchunks(self, array_shape: ChunkCoords) -> int: itertools.starmap(ceildiv, zip(array_shape, self.chunk_shape, strict=True)), 1, ) + + +def _auto_partition( + array_shape: tuple[int, ...], + shard_shape: tuple[int, ...] | Literal["auto"] | None, + chunk_shape: tuple[int, ...] | Literal["auto"], + dtype: np.dtype[Any], +) -> tuple[tuple[int, ...] | None, tuple[int, ...]]: + """ + Automatically determine the shard shape and chunk shape for an array, given the shape and dtype of the array. + If `shard_shape` is `None` and the chunk_shape is "auto", the chunks will be set heuristically based + on the dtype and shape of the array. + If `shard_shape` is "auto", then the shard shape will be set heuristically from the dtype and shape + of the array; if the `chunk_shape` is also "auto", then the chunks will be set heuristically as well, + given the dtype and shard shape. Otherwise, the chunks will be returned as-is. + """ + + item_size = dtype.itemsize + if shard_shape is None: + _shards_out: None | tuple[int, ...] = None + if chunk_shape == "auto": + _chunks_out = _guess_chunks(array_shape, item_size) + else: + _chunks_out = chunk_shape + else: + if chunk_shape == "auto": + # aim for a 1MiB chunk + _chunks_out = _guess_chunks(array_shape, item_size, max_bytes=1024) + else: + _chunks_out = chunk_shape + + if shard_shape == "auto": + _shards_out = () + for a_shape, c_shape in zip(array_shape, _chunks_out, strict=True): + # TODO: make a better heuristic than this. + # for each axis, if there are more than 16 chunks along that axis, then make put + # 2 chunks in each shard for that axis. 
+ if a_shape // c_shape > 16: + _shards_out += (c_shape * 2,) + else: + _shards_out += (c_shape,) + else: + _shards_out = shard_shape + + return _shards_out, _chunks_out diff --git a/src/zarr/core/chunk_key_encodings.py b/src/zarr/core/chunk_key_encodings.py index ed12ee3065..33b44b3232 100644 --- a/src/zarr/core/chunk_key_encodings.py +++ b/src/zarr/core/chunk_key_encodings.py @@ -2,7 +2,7 @@ from abc import abstractmethod from dataclasses import dataclass -from typing import Literal, cast +from typing import Literal, TypedDict, cast from zarr.abc.metadata import Metadata from zarr.core.common import ( @@ -20,6 +20,11 @@ def parse_separator(data: JSON) -> SeparatorLiteral: return cast(SeparatorLiteral, data) +class ChunkKeyEncodingParams(TypedDict): + name: Literal["v2", "default"] + separator: SeparatorLiteral + + @dataclass(frozen=True) class ChunkKeyEncoding(Metadata): name: str diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index 3db00b1a06..7e7b2e73da 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -18,6 +18,7 @@ import numpy as np +from zarr.core.config import config as zarr_config from zarr.core.strings import _STRING_DTYPE if TYPE_CHECKING: @@ -197,3 +198,8 @@ def _warn_order_kwarg() -> None: "or change the global 'array.order' configuration variable." ) warnings.warn(msg, RuntimeWarning, stacklevel=2) + + +def _default_zarr_version() -> ZarrFormat: + """Return the default zarr_version""" + return cast(ZarrFormat, int(zarr_config.get("default_zarr_version", 3))) diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index a14305aef8..739529a3f9 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -67,14 +67,28 @@ def reset(self) -> None: "order": "C", "write_empty_chunks": False, "v2_default_compressor": { - "numeric": "zstd", - "string": "vlen-utf8", - "bytes": "vlen-bytes", + "numeric": {"id": "zstd", "level": 0, "checksum": True}, + "string": {"id": "zstd", "level": 0, "checksum": True}, + "bytes": {"id": "zstd", "level": 0, "checksum": True}, + }, + "v2_default_filters": { + "numeric": [], + "string": [{"id": "vlen-utf8"}], + "bytes": [{"id": "vlen-bytes"}], }, "v3_default_codecs": { - "numeric": ["bytes", "zstd"], - "string": ["vlen-utf8"], - "bytes": ["vlen-bytes"], + "numeric": [ + {"name": "bytes", "configuration": {"endian": "little"}}, + {"name": "zstd", "configuration": {"level": 0, "checksum": True}}, + ], + "string": [ + {"name": "vlen-utf8"}, + {"name": "zstd", "configuration": {"level": 0, "checksum": True}}, + ], + "bytes": [ + {"name": "vlen-bytes"}, + {"name": "zstd", "configuration": {"level": 0, "checksum": True}}, + ], }, }, "async": {"concurrency": 10, "timeout": None}, diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 2d7a21911a..f3bc3f3eec 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -18,7 +18,7 @@ from zarr.abc.metadata import Metadata from zarr.abc.store import Store, set_or_delete from zarr.core._info import GroupInfo -from zarr.core.array import Array, AsyncArray, _build_parents +from zarr.core.array import Array, AsyncArray, _build_parents, create_array from zarr.core.attributes import Attributes from zarr.core.buffer import default_buffer_prototype from zarr.core.common import ( @@ -47,8 +47,10 @@ from typing import Any from zarr.abc.codec import Codec + from zarr.core.array_spec import ArrayConfig, ArrayConfigParams from zarr.core.buffer import Buffer, BufferPrototype - from zarr.core.chunk_key_encodings import ChunkKeyEncoding + from 
zarr.core.chunk_key_encodings import ChunkKeyEncoding, ChunkKeyEncodingParams + from zarr.core.common import MemoryOrder logger = logging.getLogger("zarr.group") @@ -501,9 +503,9 @@ async def open( (store_path / str(consolidated_key)).get(), ) if zarr_json_bytes is not None and zgroup_bytes is not None: - # TODO: revisit this exception type - # alternatively, we could warn and favor v3 - raise ValueError("Both zarr.json and .zgroup objects exist") + # warn and favor v3 + msg = f"Both zarr.json (zarr v3) and .zgroup (zarr v2) metadata objects exist at {store_path}." + warnings.warn(msg, stacklevel=1) if zarr_json_bytes is None and zgroup_bytes is None: raise FileNotFoundError( f"could not find zarr.json or .zgroup objects in {store_path}" @@ -998,115 +1000,83 @@ async def create_array( name: str, *, shape: ShapeLike, - dtype: npt.DTypeLike = "float64", - fill_value: Any | None = None, + dtype: npt.DTypeLike, + chunk_shape: ChunkCoords | Literal["auto"] = "auto", + shard_shape: ChunkCoords | Literal["auto"] | None = None, + filters: Iterable[dict[str, JSON] | Codec] = (), + compression: Iterable[dict[str, JSON] | Codec] = (), + fill_value: Any | None = 0, + order: MemoryOrder | None = "C", attributes: dict[str, JSON] | None = None, - # v3 only - chunk_shape: ChunkCoords | None = None, - chunk_key_encoding: ( - ChunkKeyEncoding - | tuple[Literal["default"], Literal[".", "/"]] - | tuple[Literal["v2"], Literal[".", "/"]] - | None - ) = None, - codecs: Iterable[Codec | dict[str, JSON]] | None = None, + chunk_key_encoding: ChunkKeyEncoding | ChunkKeyEncodingParams | None = None, dimension_names: Iterable[str] | None = None, - # v2 only - chunks: ShapeLike | None = None, - dimension_separator: Literal[".", "/"] | None = None, - order: Literal["C", "F"] | None = None, - filters: list[dict[str, JSON]] | None = None, - compressor: dict[str, JSON] | None = None, - # runtime + storage_options: dict[str, Any] | None = None, overwrite: bool = False, + config: ArrayConfig | ArrayConfigParams | None = None, data: npt.ArrayLike | None = None, ) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: """ Create a Zarr array within this AsyncGroup. - This method lightly wraps AsyncArray.create. + This method lightly wraps ``zarr.core.array.create_array``. Parameters ---------- name : str - The name of the array. - shape : tuple[int, ...] - The shape of the array. - dtype : np.DtypeLike = float64 - The data type of the array. - chunk_shape : tuple[int, ...] | None = None - The shape of the chunks of the array. - V3 only. V2 arrays should use `chunks` instead. - If not specified, default are guessed based on the shape and dtype. - chunk_key_encoding : ChunkKeyEncoding | tuple[Literal["default"], Literal[".", "/"]] | tuple[Literal["v2"], Literal[".", "/"]] | None = None - A specification of how the chunk keys are represented in storage. - V3 only. V2 arrays should use `dimension_separator` instead. - Default is ``("default", "/")``. - codecs : Iterable[Codec | dict[str, JSON]] | None = None - An iterable of Codec or dict serializations of Codecs. The elements of - this collection specify the transformation from array values to stored bytes. - V3 only. V2 arrays should use ``filters`` and ``compressor`` instead. - - If no codecs are provided, default codecs will be used: - - - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``. - - For Unicode strings, the default is ``VLenUTF8Codec``. - - For bytes or objects, the default is ``VLenBytesCodec``. 
-
-            These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`.
-        dimension_names : Iterable[str] | None = None
-            The names of the dimensions of the array. V3 only.
-        chunks : ChunkCoords | None = None
-            The shape of the chunks of the array.
-            V2 only. V3 arrays should use ``chunk_shape`` instead.
-            If not specified, default are guessed based on the shape and dtype.
-        dimension_separator : Literal[".", "/"] | None = None
-            The delimiter used for the chunk keys. (default: ".")
-            V2 only. V3 arrays should use ``chunk_key_encoding`` instead.
-        order : Literal["C", "F"] | None = None
-            The memory order of the array (default is specified by ``array.order`` in :mod:`zarr.core.config`).
-        filters : list[dict[str, JSON]] | None = None
-            Sequence of filters to use to encode chunk data prior to compression.
-            V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor``
-            nor ``filters`` are provided, a default compressor will be used. (see
-            ``compressor`` for details)
-        compressor : dict[str, JSON] | None = None
-            The compressor used to compress the data (default is None).
-            V2 only. V3 arrays should use ``codecs`` instead.
-
-            If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used:
-
-            - For numeric arrays, the default is ``ZstdCodec``.
-            - For Unicode strings, the default is ``VLenUTF8Codec``.
-            - For bytes or objects, the default is ``VLenBytesCodec``.
-
-            These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`.
-        overwrite : bool = False
-            If True, a pre-existing array or group at the path of this array will
-            be overwritten. If False, the presence of a pre-existing array or group is
-            an error.
+            The name of the array relative to the group.
+        shape : ChunkCoords
+            Shape of the array.
+        dtype : npt.DTypeLike
+            Data type of the array.
+        chunk_shape : ChunkCoords | Literal["auto"], default is "auto"
+            Chunk shape of the array.
+        shard_shape : ChunkCoords, optional
+            Shard shape of the array. The default value of ``None`` results in no sharding at all.
+        filters : Iterable[Codec], optional
+            List of filters to apply to the array.
+        compression : Iterable[Codec], optional
+            List of compressors to apply to the array.
+        fill_value : Any, optional
+            Fill value for the array.
+        order : {"C", "F"}, optional
+            Memory layout of the array.
+        attributes : dict, optional
+            Attributes for the array.
+        chunk_key_encoding : ChunkKeyEncoding, optional
+            The chunk key encoding to use.
+        dimension_names : Iterable[str], optional
+            Dimension names for the array.
+        storage_options : dict, optional
+            If using an fsspec URL to create the store, these will be passed to the backend implementation.
+            Ignored otherwise.
+        overwrite : bool, default False
+            Whether to overwrite an array with the same name in the store, if one exists.
+        config : ArrayConfig or ArrayConfigParams, optional
+            Runtime configuration for the array.
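A usage sketch for the method documented above, mirroring the updated tests in ``tests/test_array.py`` at the bottom of this diff (``dtype`` is now required and ``chunk_shape`` replaces the v2-era ``chunks`` keyword)::

    root = Group.from_store(store=store, zarr_format=zarr_format)
    foo = root.create_array("foo", shape=(100,), chunk_shape=(10,), dtype="i4")
    assert foo.name == "/foo"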
         Returns
         -------
         AsyncArray
 
         """
-        return await AsyncArray.create(
-            self.store_path / name,
+        return await create_array(
+            store=self.store_path,
+            name=name,
             shape=shape,
             dtype=dtype,
             chunk_shape=chunk_shape,
+            shard_shape=shard_shape,
+            filters=filters,
+            compression=compression,
             fill_value=fill_value,
+            order=order,
+            zarr_format=self.metadata.zarr_format,
+            attributes=attributes,
             chunk_key_encoding=chunk_key_encoding,
-            codecs=codecs,
             dimension_names=dimension_names,
-            attributes=attributes,
-            chunks=chunks,
-            dimension_separator=dimension_separator,
-            order=order,
-            filters=filters,
-            compressor=compressor,
+            storage_options=storage_options,
             overwrite=overwrite,
-            zarr_format=self.metadata.zarr_format,
+            config=config,
             data=data,
         )
@@ -2225,119 +2195,83 @@ def create_array(
         name: str,
         *,
         shape: ShapeLike,
-        dtype: npt.DTypeLike = "float64",
-        fill_value: Any | None = None,
+        dtype: npt.DTypeLike,
+        chunk_shape: ChunkCoords | Literal["auto"] = "auto",
+        shard_shape: ChunkCoords | None = None,
+        filters: Iterable[dict[str, JSON] | Codec] | Literal["auto"] = "auto",
+        compression: Iterable[dict[str, JSON] | Codec] | Codec | Literal["auto"] = "auto",
+        fill_value: Any | None = 0,
+        order: MemoryOrder | None = "C",
         attributes: dict[str, JSON] | None = None,
-        # v3 only
-        chunk_shape: ChunkCoords | None = None,
-        chunk_key_encoding: (
-            ChunkKeyEncoding
-            | tuple[Literal["default"], Literal[".", "/"]]
-            | tuple[Literal["v2"], Literal[".", "/"]]
-            | None
-        ) = None,
-        codecs: Iterable[Codec | dict[str, JSON]] | None = None,
+        chunk_key_encoding: ChunkKeyEncoding | ChunkKeyEncodingParams | None = None,
         dimension_names: Iterable[str] | None = None,
-        # v2 only
-        chunks: ShapeLike | None = None,
-        dimension_separator: Literal[".", "/"] | None = None,
-        order: Literal["C", "F"] | None = None,
-        filters: list[dict[str, JSON]] | None = None,
-        compressor: dict[str, JSON] | None = None,
-        # runtime
+        storage_options: dict[str, Any] | None = None,
         overwrite: bool = False,
+        config: ArrayConfig | ArrayConfigParams | None = None,
         data: npt.ArrayLike | None = None,
     ) -> Array:
-        """Create a zarr array within this AsyncGroup.
-
-        This method lightly wraps `AsyncArray.create`.
+        """
+        Create a Zarr array within this Group.
+        This method lightly wraps ``zarr.core.array.create_array``.
 
         Parameters
         ----------
-        name : str
-            The name of the array.
-        shape : tuple[int, ...]
-            The shape of the array.
-        dtype : np.DtypeLike = float64
-            The data type of the array.
-        chunk_shape : tuple[int, ...] | None = None
-            The shape of the chunks of the array.
-            V3 only. V2 arrays should use `chunks` instead.
-            If not specified, default are guessed based on the shape and dtype.
-        chunk_key_encoding : ChunkKeyEncoding | tuple[Literal["default"], Literal[".", "/"]] | tuple[Literal["v2"], Literal[".", "/"]] | None = None
-            A specification of how the chunk keys are represented in storage.
-            V3 only. V2 arrays should use `dimension_separator` instead.
-            Default is ``("default", "/")``.
-        codecs : Iterable[Codec | dict[str, JSON]] | None = None
-            An iterable of Codec or dict serializations of Codecs. The elements of
-            this collection specify the transformation from array values to stored bytes.
-            V3 only. V2 arrays should use ``filters`` and ``compressor`` instead.
-
-            If no codecs are provided, default codecs will be used:
-
-            - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``.
-            - For Unicode strings, the default is ``VLenUTF8Codec``.
-            - For bytes or objects, the default is ``VLenBytesCodec``.
-
-            These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`.
-        dimension_names : Iterable[str] | None = None
-            The names of the dimensions of the array. V3 only.
-        chunks : ChunkCoords | None = None
-            The shape of the chunks of the array.
-            V2 only. V3 arrays should use ``chunk_shape`` instead.
-            If not specified, default are guessed based on the shape and dtype.
-        dimension_separator : Literal[".", "/"] | None = None
-            The delimiter used for the chunk keys. (default: ".")
-            V2 only. V3 arrays should use ``chunk_key_encoding`` instead.
-        order : Literal["C", "F"] | None = None
-            The memory order of the array (default is specified by ``array.order`` in :mod:`zarr.core.config`).
-        filters : list[dict[str, JSON]] | None = None
-            Sequence of filters to use to encode chunk data prior to compression.
-            V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor``
-            nor ``filters`` are provided, a default compressor will be used. (see
-            ``compressor`` for details)
-        compressor : dict[str, JSON] | None = None
-            The compressor used to compress the data (default is None).
-            V2 only. V3 arrays should use ``codecs`` instead.
-
-            If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used:
-
-            - For numeric arrays, the default is ``ZstdCodec``.
-            - For Unicode strings, the default is ``VLenUTF8Codec``.
-            - For bytes or objects, the default is ``VLenBytesCodec``.
-
-            These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`.
-        overwrite : bool = False
-            If True, a pre-existing array or group at the path of this array will
-            be overwritten. If False, the presence of a pre-existing array or group is
-            an error.
-        data : npt.ArrayLike | None = None
-            Array data to initialize the array with.
+        name : str
+            The name of the array relative to the group.
+        shape : ChunkCoords
+            Shape of the array.
+        dtype : npt.DTypeLike
+            Data type of the array.
+        chunk_shape : ChunkCoords | Literal["auto"], default is "auto"
+            Chunk shape of the array.
+        shard_shape : ChunkCoords, optional
+            Shard shape of the array. The default value of ``None`` results in no sharding at all.
+        filters : Iterable[Codec], optional
+            List of filters to apply to the array.
+        compression : Iterable[Codec], optional
+            List of compressors to apply to the array.
+        fill_value : Any, optional
+            Fill value for the array.
+        order : {"C", "F"}, optional
+            Memory layout of the array.
+        attributes : dict, optional
+            Attributes for the array.
+        chunk_key_encoding : ChunkKeyEncoding, optional
+            The chunk key encoding to use.
+        dimension_names : Iterable[str], optional
+            Dimension names for the array.
+        storage_options : dict, optional
+            If using an fsspec URL to create the store, these will be passed to the backend implementation.
+            Ignored otherwise.
+        overwrite : bool, default False
+            Whether to overwrite an array with the same name in the store, if one exists.
+        config : ArrayConfig or ArrayConfigParams, optional
+            Runtime configuration for the array.
         Returns
         -------
-
-        Array
-
+        Array
         """
+
         return Array(
             self._sync(
                 self._async_group.create_array(
                     name=name,
                     shape=shape,
                     dtype=dtype,
+                    chunk_shape=chunk_shape,
+                    shard_shape=shard_shape,
                     fill_value=fill_value,
                     attributes=attributes,
-                    chunk_shape=chunk_shape,
                     chunk_key_encoding=chunk_key_encoding,
-                    codecs=codecs,
+                    compression=compression,
                     dimension_names=dimension_names,
-                    chunks=chunks,
-                    dimension_separator=dimension_separator,
                     order=order,
                     filters=filters,
-                    compressor=compressor,
                     overwrite=overwrite,
+                    storage_options=storage_options,
+                    config=config,
                     data=data,
                 )
             )
         )
@@ -2594,120 +2528,84 @@ def array(
         self,
         name: str,
         *,
-        shape: ChunkCoords,
-        dtype: npt.DTypeLike = "float64",
-        fill_value: Any | None = None,
+        shape: ShapeLike,
+        dtype: npt.DTypeLike,
+        chunk_shape: ChunkCoords | Literal["auto"] = "auto",
+        shard_shape: ChunkCoords | Literal["auto"] | None = None,
+        filters: Iterable[dict[str, JSON] | Codec] = (),
+        compression: Iterable[dict[str, JSON] | Codec] = (),
+        fill_value: Any | None = 0,
+        order: MemoryOrder | None = "C",
         attributes: dict[str, JSON] | None = None,
-        # v3 only
-        chunk_shape: ChunkCoords | None = None,
-        chunk_key_encoding: (
-            ChunkKeyEncoding
-            | tuple[Literal["default"], Literal[".", "/"]]
-            | tuple[Literal["v2"], Literal[".", "/"]]
-            | None
-        ) = None,
-        codecs: Iterable[Codec | dict[str, JSON]] | None = None,
+        chunk_key_encoding: ChunkKeyEncoding | ChunkKeyEncodingParams | None = None,
         dimension_names: Iterable[str] | None = None,
-        # v2 only
-        chunks: ChunkCoords | None = None,
-        dimension_separator: Literal[".", "/"] | None = None,
-        order: Literal["C", "F"] | None = None,
-        filters: list[dict[str, JSON]] | None = None,
-        compressor: dict[str, JSON] | None = None,
-        # runtime
+        storage_options: dict[str, Any] | None = None,
        overwrite: bool = False,
+        config: ArrayConfig | ArrayConfigParams | None = None,
         data: npt.ArrayLike | None = None,
     ) -> Array:
-        """Create a zarr array within this AsyncGroup.
-
-        This method lightly wraps `AsyncArray.create`.
+        """
+        Create a Zarr array within this Group.
+        This method lightly wraps ``zarr.core.array.create_array``.
 
         Parameters
         ----------
-        name : str
-            The name of the array.
-        shape : tuple[int, ...]
-            The shape of the array.
-        dtype : np.DtypeLike = float64
-            The data type of the array.
-        chunk_shape : tuple[int, ...] | None = None
-            The shape of the chunks of the array.
-            V3 only. V2 arrays should use `chunks` instead.
-            If not specified, default are guessed based on the shape and dtype.
-        chunk_key_encoding : ChunkKeyEncoding | tuple[Literal["default"], Literal[".", "/"]] | tuple[Literal["v2"], Literal[".", "/"]] | None = None
-            A specification of how the chunk keys are represented in storage.
-            V3 only. V2 arrays should use `dimension_separator` instead.
-            Default is ``("default", "/")``.
-        codecs : Iterable[Codec | dict[str, JSON]] | None = None
-            An iterable of Codec or dict serializations of Codecs. The elements of
-            this collection specify the transformation from array values to stored bytes.
-            V3 only. V2 arrays should use ``filters`` and ``compressor`` instead.
-
-            If no codecs are provided, default codecs will be used:
-
-            - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``.
-            - For Unicode strings, the default is ``VLenUTF8Codec``.
-            - For bytes or objects, the default is ``VLenBytesCodec``.
-
-            These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`.
-        dimension_names : Iterable[str] | None = None
-            The names of the dimensions of the array. V3 only.
-        chunks : ChunkCoords | None = None
-            The shape of the chunks of the array.
-            V2 only. V3 arrays should use ``chunk_shape`` instead.
-            If not specified, default are guessed based on the shape and dtype.
-        dimension_separator : Literal[".", "/"] | None = None
-            The delimiter used for the chunk keys. (default: ".")
-            V2 only. V3 arrays should use ``chunk_key_encoding`` instead.
-        order : Literal["C", "F"] | None = None
-            The memory order of the array (default is specified by ``array.order`` in :mod:`zarr.core.config`).
-        filters : list[dict[str, JSON]] | None = None
-            Sequence of filters to use to encode chunk data prior to compression.
-            V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor``
-            nor ``filters`` are provided, a default compressor will be used. (see
-            ``compressor`` for details)
-        compressor : dict[str, JSON] | None = None
-            The compressor used to compress the data (default is None).
-            V2 only. V3 arrays should use ``codecs`` instead.
-
-            If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used:
-
-            - For numeric arrays, the default is ``ZstdCodec``.
-            - For Unicode strings, the default is ``VLenUTF8Codec``.
-            - For bytes or objects, the default is ``VLenBytesCodec``.
-
-            These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`.
-        overwrite : bool = False
-            If True, a pre-existing array or group at the path of this array will
-            be overwritten. If False, the presence of a pre-existing array or group is
-            an error.
-        data : npt.ArrayLike | None = None
-            Array data to initialize the array with.
+        name : str
+            The name of the array relative to the group.
+        shape : ChunkCoords
+            Shape of the array.
+        dtype : npt.DTypeLike
+            Data type of the array.
+        chunk_shape : ChunkCoords
+            Chunk shape of the array.
+        shard_shape : ChunkCoords, optional
+            Shard shape of the array. The default value of ``None`` results in no sharding at all.
+        filters : Iterable[Codec], optional
+            List of filters to apply to the array.
+        compression : Iterable[Codec], optional
+            List of compressors to apply to the array.
+        fill_value : Any, optional
+            Fill value for the array.
+        order : {"C", "F"}, optional
+            Memory layout of the array.
+        attributes : dict, optional
+            Attributes for the array.
+        chunk_key_encoding : ChunkKeyEncoding, optional
+            The chunk key encoding to use.
+        dimension_names : Iterable[str], optional
+            Dimension names for the array.
+        storage_options : dict, optional
+            If using an fsspec URL to create the store, these will be passed to the backend implementation.
+            Ignored otherwise.
+        overwrite : bool, default False
+            Whether to overwrite an array with the same name in the store, if one exists.
+        config : ArrayConfig or ArrayConfigParams, optional
+            Runtime configuration for the array.
         Returns
         -------
-
-        Array
-
+        Array
         """
+
         return Array(
             self._sync(
                 self._async_group.create_array(
                     name=name,
                     shape=shape,
                     dtype=dtype,
+                    chunk_shape=chunk_shape,
+                    shard_shape=shard_shape,
                     fill_value=fill_value,
                     attributes=attributes,
-                    chunk_shape=chunk_shape,
                     chunk_key_encoding=chunk_key_encoding,
-                    codecs=codecs,
+                    compression=compression,
                     dimension_names=dimension_names,
-                    chunks=chunks,
-                    dimension_separator=dimension_separator,
                     order=order,
                     filters=filters,
-                    compressor=compressor,
                     overwrite=overwrite,
+                    storage_options=storage_options,
+                    config=config,
                     data=data,
                 )
            )
diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py
index af26034b1d..ddfc85a617 100644
--- a/src/zarr/core/metadata/v2.py
+++ b/src/zarr/core/metadata/v2.py
@@ -6,6 +6,8 @@
 from functools import cached_property
 from typing import TYPE_CHECKING, TypedDict, cast
 
+import numcodecs.abc
+
 from zarr.abc.metadata import Metadata
 
 if TYPE_CHECKING:
@@ -329,9 +331,9 @@ def _default_fill_value(dtype: np.dtype[Any]) -> Any:
         return dtype.type(0)
 
 
-def _default_filters_and_compressor(
+def _default_compressor(
     dtype: np.dtype[Any],
-) -> tuple[list[dict[str, JSON]], dict[str, JSON] | None]:
-    """Get the default filters and compressor for a dtype.
+) -> dict[str, JSON] | None:
+    """Get the default compressor for a dtype.
 
     https://numpy.org/doc/2.1/reference/generated/numpy.dtype.kind.html
@@ -346,4 +348,24 @@
     else:
         raise ValueError(f"Unsupported dtype kind {dtype.kind}")
 
-    return [{"id": default_compressor[dtype_key]}], None
+    return default_compressor.get(dtype_key, None)
+
+
+def _default_filters(
+    dtype: np.dtype[Any],
+) -> list[dict[str, JSON]]:
+    """Get the default filters for a dtype.
+
+    https://numpy.org/doc/2.1/reference/generated/numpy.dtype.kind.html
+    """
+    default_filters = config.get("array.v2_default_filters")
+    if dtype.kind in "biufcmM":
+        dtype_key = "numeric"
+    elif dtype.kind in "U":
+        dtype_key = "string"
+    elif dtype.kind in "OSV":
+        dtype_key = "bytes"
+    else:
+        raise ValueError(f"Unsupported dtype kind {dtype.kind}")
+
+    return default_filters.get(dtype_key, [])
diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py
index 4cf5860ffd..67415f89aa 100644
--- a/src/zarr/core/metadata/v3.py
+++ b/src/zarr/core/metadata/v3.py
@@ -449,7 +449,7 @@ def parse_fill_value(
         return np.bytes_(fill_value)
 
     # the rest are numeric types
-    np_dtype = cast(np.dtype[np.generic], data_type.to_numpy())
+    np_dtype = cast(np.dtype[Any], data_type.to_numpy())
 
     if isinstance(fill_value, Sequence) and not isinstance(fill_value, str):
         if data_type in (DataType.complex64, DataType.complex128):
@@ -513,7 +513,7 @@ def default_fill_value(dtype: DataType) -> str | bytes | np.generic:
         return b""
     else:
         np_dtype = dtype.to_numpy()
-        np_dtype = cast(np.dtype[np.generic], np_dtype)
+        np_dtype = cast(np.dtype[Any], np_dtype)
         return np_dtype.type(0)
 
@@ -586,7 +586,7 @@ def to_numpy_shortname(self) -> str:
         }
         return data_type_to_numpy[self]
 
-    def to_numpy(self) -> np.dtypes.StringDType | np.dtypes.ObjectDType | np.dtype[np.generic]:
+    def to_numpy(self) -> np.dtypes.StringDType | np.dtypes.ObjectDType | np.dtype[Any]:
         # note: it is not possible to round trip DataType <-> np.dtype
         # due to the fact that DataType.string and DataType.bytes both
         # generally return np.dtype("O") from this function, even though
diff --git a/src/zarr/testing/strategies.py b/src/zarr/testing/strategies.py
index 85a67e3e69..8a352b601c 100644
--- a/src/zarr/testing/strategies.py
+++ b/src/zarr/testing/strategies.py
@@ -143,7
+143,7 @@ def arrays( a = root.create_array( array_path, shape=nparray.shape, - chunks=chunks, + chunk_shape=chunks, dtype=nparray.dtype, attributes=attributes, # compressor=compressor, # FIXME diff --git a/tests/test_api.py b/tests/test_api.py index d25ec54bfe..bf6395edf7 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -13,6 +13,8 @@ from zarr.abc.store import Store from zarr.api.synchronous import ( create, + create_array, + create_group, group, load, open, @@ -21,13 +23,15 @@ save_array, save_group, ) +from zarr.codecs.transpose import TransposeCodec +from zarr.codecs.zstd import ZstdCodec from zarr.core.common import MemoryOrder, ZarrFormat from zarr.errors import MetadataValidationError from zarr.storage._utils import normalize_path from zarr.storage.memory import MemoryStore -def test_create_array(memory_store: Store) -> None: +def test_create(memory_store: Store) -> None: store = memory_store # create array @@ -56,6 +60,22 @@ def test_create_array(memory_store: Store) -> None: z = create(shape=(400, 100), chunks=(16, 16.5), store=store, overwrite=True) # type: ignore [arg-type] +# TODO: parametrize over everything this function takes +@pytest.mark.parametrize("store", ["memory"], indirect=True) +def test_create_array(store: Store) -> None: + attrs = {"foo": 100} + shape = (10, 10) + path = "foo" + data_val = 1 + array_w = create_array( + store, name=path, shape=shape, attributes=attrs, chunk_shape=shape, dtype="uint8" + ) + array_w[:] = data_val + assert array_w.shape == shape + assert array_w.attrs == attrs + assert np.array_equal(array_w[:], np.zeros(shape, dtype=array_w.dtype) + data_val) + + @pytest.mark.parametrize("write_empty_chunks", [True, False]) def test_write_empty_chunks_warns(write_empty_chunks: bool) -> None: """ @@ -113,6 +133,16 @@ async def test_open_array(memory_store: MemoryStore) -> None: open(store="doesnotexist", mode="r") +@pytest.mark.parametrize("store", ["memory"], indirect=True) +async def test_create_group(store: Store, zarr_format: ZarrFormat) -> None: + attrs = {"foo": 100} + path = "node" + node = create_group(store, path=path, attributes=attrs, zarr_format=zarr_format) + assert isinstance(node, Group) + assert node.attrs == attrs + assert node.metadata.zarr_format == zarr_format + + async def test_open_group(memory_store: MemoryStore) -> None: store = memory_store @@ -1086,3 +1116,36 @@ def test_open_array_with_mode_r_plus(store: Store) -> None: assert isinstance(z2, Array) assert (z2[:] == 1).all() z2[:] = 3 + + +@pytest.mark.parametrize("store", ["memory"], indirect=True) +async def test_create_array_v3(store: MemoryStore) -> None: + # TODO: fill in + _ = zarr.create_array( + store=store, + dtype="uint8", + shape=(10,), + shard_shape=(4,), + chunk_shape=(4,), + zarr_format=3, + filters=(TransposeCodec(order=(0,)),), + compression=ZstdCodec(level=3), + ) + + +@pytest.mark.parametrize("store", ["memory"], indirect=True) +async def test_create_array_v2(store: MemoryStore) -> None: + from numcodecs import Delta, Zstd + + # TODO: fill in + dtype = "uint8" + _ = zarr.create_array( + store=store, + dtype=dtype, + shape=(10,), + shard_shape=None, + chunk_shape=(4,), + zarr_format=2, + filters=(Delta(dtype=dtype),), + compression=Zstd(level=3), + ) diff --git a/tests/test_array.py b/tests/test_array.py index 1899e384dc..e41f5cd548 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -135,13 +135,13 @@ def test_array_name_properties_with_group( store: LocalStore | MemoryStore, zarr_format: ZarrFormat ) -> None: root = 
     root = Group.from_store(store=store, zarr_format=zarr_format)
-    foo = root.create_array("foo", shape=(100,), chunks=(10,), dtype="i4")
+    foo = root.create_array("foo", shape=(100,), chunk_shape=(10,), dtype="i4")
     assert foo.path == "foo"
     assert foo.name == "/foo"
     assert foo.basename == "foo"
 
     bar = root.create_group("bar")
-    spam = bar.create_array("spam", shape=(100,), chunks=(10,), dtype="i4")
+    spam = bar.create_array("spam", shape=(100,), chunk_shape=(10,), dtype="i4")
 
     assert spam.path == "bar/spam"
     assert spam.name == "/bar/spam"
diff --git a/tests/test_config.py b/tests/test_config.py
index ea8e70a994..d5a364dd15 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -54,14 +54,28 @@ def test_config_defaults_set() -> None:
                 "order": "C",
                 "write_empty_chunks": False,
                 "v2_default_compressor": {
-                    "numeric": "zstd",
-                    "string": "vlen-utf8",
-                    "bytes": "vlen-bytes",
+                    "numeric": {"id": "zstd", "level": 0, "checksum": True},
+                    "string": {"id": "zstd", "level": 0, "checksum": True},
+                    "bytes": {"id": "zstd", "level": 0, "checksum": True},
+                },
+                "v2_default_filters": {
+                    "numeric": [],
+                    "string": [{"id": "vlen-utf8"}],
+                    "bytes": [{"id": "vlen-bytes"}],
                 },
                 "v3_default_codecs": {
-                    "bytes": ["vlen-bytes"],
-                    "numeric": ["bytes", "zstd"],
-                    "string": ["vlen-utf8"],
+                    "bytes": [
+                        {"name": "vlen-bytes"},
+                        {"name": "zstd", "configuration": {"level": 0, "checksum": True}},
+                    ],
+                    "numeric": [
+                        {"name": "bytes", "configuration": {"endian": "little"}},
+                        {"name": "zstd", "configuration": {"level": 0, "checksum": True}},
+                    ],
+                    "string": [
+                        {"name": "vlen-utf8"},
+                        {"name": "zstd", "configuration": {"level": 0, "checksum": True}},
+                    ],
                 },
             },
             "async": {"concurrency": 10, "timeout": None},
@@ -291,17 +305,26 @@ class NewCodec2(BytesCodec):
     ("dtype", "expected_codecs"),
     [
         ("int", [BytesCodec(), GzipCodec()]),
-        ("bytes", [VLenBytesCodec()]),
-        ("str", [VLenUTF8Codec()]),
+        ("bytes", [VLenBytesCodec(), GzipCodec()]),
+        ("str", [VLenUTF8Codec(), GzipCodec()]),
     ],
 )
 async def test_default_codecs(dtype: str, expected_codecs: list[Codec]) -> None:
     with config.set(
         {
-            "array.v3_default_codecs": {
-                "numeric": ["bytes", "gzip"],  # test setting non-standard codecs
-                "string": ["vlen-utf8"],
-                "bytes": ["vlen-bytes"],
+            "array.v3_default_codecs": {  # test setting non-standard codecs
+                "numeric": [
+                    {"name": "bytes", "configuration": {"endian": "little"}},
+                    {"name": "gzip", "configuration": {"level": 5}},
+                ],
+                "string": [
+                    {"name": "vlen-utf8"},
+                    {"name": "gzip", "configuration": {"level": 5}},
+                ],
+                "bytes": [
+                    {"name": "vlen-bytes"},
+                    {"name": "gzip", "configuration": {"level": 5}},
+                ],
             }
         }
     ):
diff --git a/tests/test_group.py b/tests/test_group.py
index e0bc304b9b..44a98f65b4 100644
--- a/tests/test_group.py
+++ b/tests/test_group.py
@@ -157,7 +157,6 @@ def test_group_members(store: Store, zarr_format: ZarrFormat, consolidated_metad
     members_expected["subarray"] = group.create_array(
         "subarray", shape=(100,), dtype="uint8", chunk_shape=(10,), overwrite=True
     )
-
     # add an extra object to the domain of the group.
     # the list of children should ignore this object.
     sync(
@@ -313,8 +312,10 @@ def test_group_getitem(store: Store, zarr_format: ZarrFormat, consolidated: bool
     group = Group.from_store(store, zarr_format=zarr_format)
     subgroup = group.create_group(name="subgroup")
-    subarray = group.create_array(name="subarray", shape=(10,), chunk_shape=(10,))
-    subsubarray = subgroup.create_array(name="subarray", shape=(10,), chunk_shape=(10,))
+    subarray = group.create_array(name="subarray", shape=(10,), chunk_shape=(10,), dtype="uint8")
+    subsubarray = subgroup.create_array(
+        name="subarray", shape=(10,), chunk_shape=(10,), dtype="uint8"
+    )
 
     if consolidated:
         group = zarr.api.synchronous.consolidate_metadata(store=store, zarr_format=zarr_format)
@@ -391,7 +392,7 @@ def test_group_delitem(store: Store, zarr_format: ZarrFormat, consolidated: bool
     group = Group.from_store(store, zarr_format=zarr_format)
     subgroup = group.create_group(name="subgroup")
-    subarray = group.create_array(name="subarray", shape=(10,), chunk_shape=(10,))
+    subarray = group.create_array(name="subarray", shape=(10,), chunk_shape=(10,), dtype="uint8")
 
     if consolidated:
         group = zarr.api.synchronous.consolidate_metadata(store=store, zarr_format=zarr_format)
@@ -472,19 +473,21 @@ def test_group_child_iterators(store: Store, zarr_format: ZarrFormat, consolidat
     expected_group_values = [group.create_group(name=name) for name in expected_group_keys]
     expected_groups = list(zip(expected_group_keys, expected_group_values, strict=False))
 
+    fill_value = 3
+    dtype = "uint8"
+
     expected_group_values[0].create_group("subgroup")
-    expected_group_values[0].create_array("subarray", shape=(1,))
+    expected_group_values[0].create_array(
+        "subarray", shape=(1,), dtype=dtype, fill_value=fill_value
+    )
 
     expected_array_keys = ["a0", "a1"]
+
     expected_array_values = [
-        group.create_array(name=name, shape=(1,)) for name in expected_array_keys
+        group.create_array(name=name, shape=(1,), dtype=dtype, fill_value=fill_value)
+        for name in expected_array_keys
     ]
     expected_arrays = list(zip(expected_array_keys, expected_array_values, strict=False))
-    fill_value: float | None
-    if zarr_format == 2:
-        fill_value = None
-    else:
-        fill_value = np.float64(0.0)
 
     if consolidate:
         group = zarr.consolidate_metadata(store)
@@ -492,12 +495,13 @@ def test_group_child_iterators(store: Store, zarr_format: ZarrFormat, consolidat
         metadata = {
             "subarray": {
                 "attributes": {},
-                "dtype": "float64",
+                "dtype": dtype,
                 "fill_value": fill_value,
                 "shape": (1,),
                 "chunks": (1,),
                 "order": "C",
-                "filters": (Zstd(level=0),),
+                "filters": (),
+                "compressor": Zstd(level=0),
                 "zarr_format": zarr_format,
             },
             "subgroup": {
@@ -527,7 +531,7 @@ def test_group_child_iterators(store: Store, zarr_format: ZarrFormat, consolidat
                     {"configuration": {"endian": "little"}, "name": "bytes"},
                     {"configuration": {}, "name": "zstd"},
                 ),
-                "data_type": "float64",
+                "data_type": dtype,
                 "fill_value": fill_value,
                 "node_type": "array",
                 "shape": (1,),
@@ -1014,11 +1018,11 @@ async def test_group_members_async(store: Store, consolidated_metadata: bool) ->
     group = await AsyncGroup.from_store(
         store=store,
     )
-    a0 = await group.create_array("a0", shape=(1,))
+    a0 = await group.create_array("a0", shape=(1,), dtype="uint8")
     g0 = await group.create_group("g0")
-    a1 = await g0.create_array("a1", shape=(1,))
+    a1 = await g0.create_array("a1", shape=(1,), dtype="uint8")
     g1 = await g0.create_group("g1")
-    a2 = await g1.create_array("a2", shape=(1,))
+    a2 = await g1.create_array("a2", shape=(1,), dtype="uint8")
     g2 = await g1.create_group("g2")
 
     # immediate children
@@ -1179,9 +1183,9 @@ async def test_require_array(store: Store, zarr_format: ZarrFormat) -> None:
 async def test_members_name(store: Store, consolidate: bool, zarr_format: ZarrFormat):
     group = Group.from_store(store=store, zarr_format=zarr_format)
     a = group.create_group(name="a")
-    a.create_array("array", shape=(1,))
+    a.create_array("array", shape=(1,), dtype="uint8")
     b = a.create_group(name="b")
-    b.create_array("array", shape=(1,))
+    b.create_array("array", shape=(1,), dtype="uint8")
 
     if consolidate:
         group = zarr.api.synchronous.consolidate_metadata(store)
@@ -1284,12 +1288,12 @@ async def test_group_delitem_consolidated(self, store: Store) -> None:
         g0 = await root.create_group("g0")
         g1 = await g0.create_group("g1")
         g2 = await g1.create_group("g2")
-        await g2.create_array("data", shape=(1,))
+        await g2.create_array("data", shape=(1,), dtype="uint8")
 
         x0 = await root.create_group("x0")
         x1 = await x0.create_group("x1")
         x2 = await x1.create_group("x2")
-        await x2.create_array("data", shape=(1,))
+        await x2.create_array("data", shape=(1,), dtype="uint8")
 
         await zarr.api.asynchronous.consolidate_metadata(store)
@@ -1360,8 +1364,8 @@ def test_info(self):
         A = zarr.group(store=store, path="A")
         B = A.create_group(name="B")
 
-        B.create_array(name="x", shape=(1,))
-        B.create_array(name="y", shape=(2,))
+        B.create_array(name="x", shape=(1,), dtype="uint8")
+        B.create_array(name="y", shape=(2,), dtype="uint8")
 
         result = A.info
         expected = GroupInfo(
@@ -1420,7 +1424,7 @@ def test_delitem_removes_children(store: Store, zarr_format: ZarrFormat) -> None
     g1 = zarr.group(store=store, zarr_format=zarr_format)
     g1.create_group("0")
     g1.create_group("0/0")
-    arr = g1.create_array("0/0/0", shape=(1,))
+    arr = g1.create_array("0/0/0", shape=(1,), dtype="uint8")
     arr[:] = 1
     del g1["0"]
     with pytest.raises(KeyError):
diff --git a/tests/test_metadata/test_consolidated.py b/tests/test_metadata/test_consolidated.py
index 7f0c49338e..ba7fe0cb08 100644
--- a/tests/test_metadata/test_consolidated.py
+++ b/tests/test_metadata/test_consolidated.py
@@ -31,16 +31,19 @@
 @pytest.fixture
 async def memory_store_with_hierarchy(memory_store: Store) -> None:
     g = await group(store=memory_store, attributes={"foo": "bar"})
-    await g.create_array(name="air", shape=(1, 2, 3))
-    await g.create_array(name="lat", shape=(1,))
-    await g.create_array(name="lon", shape=(2,))
-    await g.create_array(name="time", shape=(3,))
+    dtype = "uint8"
+    await g.create_array(name="air", shape=(1, 2, 3), dtype=dtype)
+    await g.create_array(name="lat", shape=(1,), dtype=dtype)
+    await g.create_array(name="lon", shape=(2,), dtype=dtype)
+    await g.create_array(name="time", shape=(3,), dtype=dtype)
 
     child = await g.create_group("child", attributes={"key": "child"})
-    await child.create_array("array", shape=(4, 4), attributes={"key": "child"})
+    await child.create_array("array", shape=(4, 4), attributes={"key": "child"}, dtype=dtype)
 
     grandchild = await child.create_group("grandchild", attributes={"key": "grandchild"})
-    await grandchild.create_array("array", shape=(4, 4), attributes={"key": "grandchild"})
+    await grandchild.create_array(
+        "array", shape=(4, 4), attributes={"key": "grandchild"}, dtype=dtype
+    )
     await grandchild.create_group("empty_group", attributes={"key": "empty"})
 
     return memory_store
@@ -76,8 +79,8 @@ async def test_consolidated(self, memory_store_with_hierarchy: Store) -> None:
                         {"configuration": {"endian": "little"}, "name": "bytes"},
                         {"configuration": {}, "name": "zstd"},
                     ),
-                    "data_type": "float64",
-                    "fill_value": np.float64(0.0),
+                    "data_type": "uint8",
+                    "fill_value": 0,
0, "node_type": "array", # "shape": (1, 2, 3), "zarr_format": 3, @@ -205,10 +208,11 @@ async def test_consolidated(self, memory_store_with_hierarchy: Store) -> None: def test_consolidated_sync(self, memory_store): g = zarr.api.synchronous.group(store=memory_store, attributes={"foo": "bar"}) - g.create_array(name="air", shape=(1, 2, 3)) - g.create_array(name="lat", shape=(1,)) - g.create_array(name="lon", shape=(2,)) - g.create_array(name="time", shape=(3,)) + dtype = "uint8" + g.create_array(name="air", shape=(1, 2, 3), dtype=dtype) + g.create_array(name="lat", shape=(1,), dtype=dtype) + g.create_array(name="lon", shape=(2,), dtype=dtype) + g.create_array(name="time", shape=(3,), dtype=dtype) zarr.api.synchronous.consolidate_metadata(memory_store) group2 = zarr.api.synchronous.Group.open(memory_store) @@ -223,8 +227,8 @@ def test_consolidated_sync(self, memory_store): {"configuration": {"endian": "little"}, "name": "bytes"}, {"configuration": {}, "name": "zstd"}, ), - "data_type": "float64", - "fill_value": np.float64(0.0), + "data_type": dtype, + "fill_value": 0, "node_type": "array", # "shape": (1, 2, 3), "zarr_format": 3, @@ -475,7 +479,8 @@ async def test_open_consolidated_raises_async(self, zarr_format: ZarrFormat): async def test_consolidated_metadata_v2(self): store = zarr.storage.MemoryStore() g = await AsyncGroup.from_store(store, attributes={"key": "root"}, zarr_format=2) - await g.create_array(name="a", shape=(1,), attributes={"key": "a"}) + dtype = "uint8" + await g.create_array(name="a", shape=(1,), attributes={"key": "a"}, dtype=dtype) g1 = await g.create_group(name="g1", attributes={"key": "g1"}) await g1.create_group(name="g2", attributes={"key": "g2"}) @@ -489,10 +494,10 @@ async def test_consolidated_metadata_v2(self): metadata={ "a": ArrayV2Metadata( shape=(1,), - dtype="float64", + dtype=dtype, attributes={"key": "a"}, chunks=(1,), - fill_value=None, + fill_value=0, filters=(Zstd(level=0),), order="C", ), diff --git a/tests/test_store/test_zip.py b/tests/test_store/test_zip.py index df22b76e1e..c207adebe1 100644 --- a/tests/test_store/test_zip.py +++ b/tests/test_store/test_zip.py @@ -69,7 +69,7 @@ def test_api_integration(self, store: ZipStore) -> None: data = np.arange(10000, dtype=np.uint16).reshape(100, 100) z = root.create_array( - shape=data.shape, chunks=(10, 10), name="foo", dtype=np.uint16, fill_value=99 + shape=data.shape, chunk_shape=(10, 10), name="foo", dtype=np.uint16, fill_value=99 ) z[:] = data diff --git a/tests/test_v2.py b/tests/test_v2.py index 80897db8e5..3cf4fecc72 100644 --- a/tests/test_v2.py +++ b/tests/test_v2.py @@ -82,13 +82,13 @@ def test_codec_pipeline() -> None: @pytest.mark.parametrize("dtype", ["|S", "|V"]) async def test_v2_encode_decode(dtype): - with config.set({"array.v2_default_compressor.bytes": "vlen-bytes"}): + with config.set({"array.v2_default_compressor.bytes": {"id": "vlen-bytes"}}): store = zarr.storage.MemoryStore() g = zarr.group(store=store, zarr_format=2) g.create_array( name="foo", shape=(3,), - chunks=(3,), + chunk_shape=(3,), dtype=dtype, fill_value=b"X", ) @@ -120,9 +120,9 @@ def test_v2_encode_decode_with_data(dtype_value): dtype, value = dtype_value with config.set( { - "array.v2_default_compressor": { - "string": "vlen-utf8", - "bytes": "vlen-bytes", + "array.v2_default_filters": { + "string": [{"id": "vlen-utf8"}], + "bytes": [{"id": "vlen-bytes"}], }, } ): @@ -210,18 +210,31 @@ def test_default_compressor_deprecation_warning(): @pytest.mark.parametrize( "dtype_expected", - [["b", "zstd"], ["i", "zstd"], ["f", 
"zstd"], ["|S1", "vlen-bytes"], ["|U1", "vlen-utf8"]], + [ + ["b", "zstd", None], + ["i", "zstd", None], + ["f", "zstd", None], + ["|S1", "zstd", "vlen-bytes"], + ["|U1", "zstd", "vlen-utf8"], + ], ) def test_default_filters_and_compressor(dtype_expected: Any) -> None: with config.set( { "array.v2_default_compressor": { - "numeric": "zstd", - "string": "vlen-utf8", - "bytes": "vlen-bytes", + "numeric": {"id": "zstd", "level": "0"}, + "string": {"id": "zstd", "level": "0"}, + "bytes": {"id": "zstd", "level": "0"}, + }, + "array.v2_default_filters": { + "numeric": [], + "string": [{"id": "vlen-utf8"}], + "bytes": [{"id": "vlen-bytes"}], }, } ): - dtype, expected = dtype_expected + dtype, expected_compressor, expected_filter = dtype_expected arr = zarr.create(shape=(3,), path="foo", store={}, zarr_format=2, dtype=dtype) - assert arr.metadata.filters[0].codec_id == expected + assert arr.metadata.compressor.codec_id == expected_compressor + if expected_filter is not None: + assert arr.metadata.filters[0].codec_id == expected_filter diff --git a/tests/test_zarr.py b/tests/test_zarr.py new file mode 100644 index 0000000000..2aa62e4231 --- /dev/null +++ b/tests/test_zarr.py @@ -0,0 +1,11 @@ +import zarr + + +def test_exports() -> None: + """ + Ensure that everything in __all__ can be imported. + """ + from zarr import __all__ + + for export in __all__: + getattr(zarr, export)