diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 8b20676e8b..14078944d7 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -10,6 +10,7 @@ from typing_extensions import deprecated from zarr.core.array import Array, AsyncArray, get_array_metadata +from zarr.core.array_spec import ArrayConfig, ArrayConfigParams from zarr.core.buffer import NDArrayLike from zarr.core.common import ( JSON, @@ -17,6 +18,8 @@ ChunkCoords, MemoryOrder, ZarrFormat, + _warn_order_kwarg, + _warn_write_empty_chunks_kwarg, parse_dtype, ) from zarr.core.config import config @@ -794,7 +797,7 @@ async def create( read_only: bool | None = None, object_codec: Codec | None = None, # TODO: type has changed dimension_separator: Literal[".", "/"] | None = None, - write_empty_chunks: bool = False, # TODO: default has changed + write_empty_chunks: bool | None = None, zarr_version: ZarrFormat | None = None, # deprecated zarr_format: ZarrFormat | None = None, meta_array: Any | None = None, # TODO: need type @@ -810,6 +813,7 @@ async def create( codecs: Iterable[Codec | dict[str, JSON]] | None = None, dimension_names: Iterable[str] | None = None, storage_options: dict[str, Any] | None = None, + config: ArrayConfig | ArrayConfigParams | None = None, **kwargs: Any, ) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: """Create an array. @@ -856,8 +860,10 @@ async def create( These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`. fill_value : object Default value to use for uninitialized portions of the array. order : {'C', 'F'}, optional + Deprecated in favor of the ``config`` keyword argument. + Pass ``{'order': }`` to ``create`` instead of using this parameter. Memory layout to be used within each chunk. - If not specified, default is taken from the Zarr config ```array.order```. + If not specified, the ``array.order`` parameter in the global config will be used. store : Store or str Store or path to directory in file system or name of zip file. synchronizer : object, optional @@ -891,30 +897,26 @@ async def create( Separator placed between the dimensions of a chunk. V2 only. V3 arrays should use ``chunk_key_encoding`` instead. Default is ".". - .. versionadded:: 2.8 - write_empty_chunks : bool, optional - If True (default), all chunks will be stored regardless of their + Deprecated in favor of the ``config`` keyword argument. + Pass ``{'write_empty_chunks': }`` to ``create`` instead of using this parameter. + If True, all chunks will be stored regardless of their contents. If False, each chunk is compared to the array's fill value prior to storing. If a chunk is uniformly equal to the fill value, then that chunk is not be stored, and the store entry for that chunk's key - is deleted. This setting enables sparser storage, as only chunks with - non-fill-value data are stored, at the expense of overhead associated - with checking the data of each chunk. - - .. versionadded:: 2.11 - + is deleted. zarr_format : {2, 3, None}, optional The zarr format to use when saving. Default is 3. meta_array : array-like, optional An array instance to use for determining arrays to create and return to users. Use `numpy.empty(())` by default. - - .. versionadded:: 2.13 storage_options : dict If using an fsspec URL to create the store, these will be passed to the backend implementation. Ignored otherwise. + config : ArrayConfig or ArrayConfigParams, optional + Runtime configuration of the array. If provided, will override the + default values from `zarr.config.array`. Returns ------- @@ -951,19 +953,16 @@ async def create( warnings.warn("object_codec is not yet implemented", RuntimeWarning, stacklevel=2) if read_only is not None: warnings.warn("read_only is not yet implemented", RuntimeWarning, stacklevel=2) - if dimension_separator is not None: - if zarr_format == 3: - raise ValueError( - "dimension_separator is not supported for zarr format 3, use chunk_key_encoding instead" - ) - else: - warnings.warn( - "dimension_separator is not yet implemented", - RuntimeWarning, - stacklevel=2, - ) - if write_empty_chunks: - warnings.warn("write_empty_chunks is not yet implemented", RuntimeWarning, stacklevel=2) + if dimension_separator is not None and zarr_format == 3: + raise ValueError( + "dimension_separator is not supported for zarr format 3, use chunk_key_encoding instead" + ) + + if order is not None: + _warn_order_kwarg() + if write_empty_chunks is not None: + _warn_write_empty_chunks_kwarg() + if meta_array is not None: warnings.warn("meta_array is not yet implemented", RuntimeWarning, stacklevel=2) @@ -971,6 +970,30 @@ async def create( if mode is None: mode = "a" store_path = await make_store_path(store, path=path, mode=mode, storage_options=storage_options) + + config_dict: ArrayConfigParams = {} + + if write_empty_chunks is not None: + if config is not None: + msg = ( + "Both write_empty_chunks and config keyword arguments are set. " + "This is redundant. When both are set, write_empty_chunks will be ignored and " + "config will be used." + ) + warnings.warn(UserWarning(msg), stacklevel=1) + config_dict["write_empty_chunks"] = write_empty_chunks + if order is not None: + if config is not None: + msg = ( + "Both order and config keyword arguments are set. " + "This is redundant. When both are set, order will be ignored and " + "config will be used." + ) + warnings.warn(UserWarning(msg), stacklevel=1) + config_dict["order"] = order + + config_parsed = ArrayConfig.from_dict(config_dict) + return await AsyncArray.create( store_path, shape=shape, @@ -987,7 +1010,7 @@ async def create( codecs=codecs, dimension_names=dimension_names, attributes=attributes, - order=order, + config=config_parsed, **kwargs, ) @@ -1163,6 +1186,11 @@ async def open_array( zarr_format = _handle_zarr_version_or_format(zarr_version=zarr_version, zarr_format=zarr_format) + if "order" in kwargs: + _warn_order_kwarg() + if "write_empty_chunks" in kwargs: + _warn_write_empty_chunks_kwarg() + try: return await AsyncArray.open(store_path, zarr_format=zarr_format) except FileNotFoundError: diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index 6ae062865c..cd1ef8b38d 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -17,6 +17,7 @@ from zarr.abc.codec import Codec from zarr.api.asynchronous import ArrayLike, PathLike + from zarr.core.array_spec import ArrayConfig, ArrayConfigParams from zarr.core.buffer import NDArrayLike from zarr.core.chunk_key_encodings import ChunkKeyEncoding from zarr.core.common import JSON, AccessModeLiteral, ChunkCoords, MemoryOrder, ZarrFormat @@ -542,7 +543,7 @@ def create( read_only: bool | None = None, object_codec: Codec | None = None, # TODO: type has changed dimension_separator: Literal[".", "/"] | None = None, - write_empty_chunks: bool = False, # TODO: default has changed + write_empty_chunks: bool | None = None, # TODO: default has changed zarr_version: ZarrFormat | None = None, # deprecated zarr_format: ZarrFormat | None = None, meta_array: Any | None = None, # TODO: need type @@ -558,6 +559,7 @@ def create( codecs: Iterable[Codec | dict[str, JSON]] | None = None, dimension_names: Iterable[str] | None = None, storage_options: dict[str, Any] | None = None, + config: ArrayConfig | ArrayConfigParams | None = None, **kwargs: Any, ) -> Array: """Create an array. @@ -578,8 +580,10 @@ def create( fill_value : object Default value to use for uninitialized portions of the array. order : {'C', 'F'}, optional + Deprecated in favor of the ``config`` keyword argument. + Pass ``{'order': }`` to ``create`` instead of using this parameter. Memory layout to be used within each chunk. - Default is set in Zarr's config (`array.order`). + If not specified, the ``array.order`` parameter in the global config will be used. store : Store or str Store or path to directory in file system or name of zip file. synchronizer : object, optional @@ -609,30 +613,25 @@ def create( A codec to encode object arrays, only needed if dtype=object. dimension_separator : {'.', '/'}, optional Separator placed between the dimensions of a chunk. - - .. versionadded:: 2.8 - write_empty_chunks : bool, optional - If True (default), all chunks will be stored regardless of their + Deprecated in favor of the ``config`` keyword argument. + Pass ``{'write_empty_chunks': }`` to ``create`` instead of using this parameter. + If True, all chunks will be stored regardless of their contents. If False, each chunk is compared to the array's fill value prior to storing. If a chunk is uniformly equal to the fill value, then that chunk is not be stored, and the store entry for that chunk's key - is deleted. This setting enables sparser storage, as only chunks with - non-fill-value data are stored, at the expense of overhead associated - with checking the data of each chunk. - - .. versionadded:: 2.11 - + is deleted. zarr_format : {2, 3, None}, optional The zarr format to use when saving. meta_array : array-like, optional An array instance to use for determining arrays to create and return to users. Use `numpy.empty(())` by default. - - .. versionadded:: 2.13 storage_options : dict If using an fsspec URL to create the store, these will be passed to the backend implementation. Ignored otherwise. + config : ArrayConfig or ArrayConfigParams, optional + Runtime configuration of the array. If provided, will override the + default values from `zarr.config.array`. Returns ------- @@ -669,6 +668,7 @@ def create( codecs=codecs, dimension_names=dimension_names, storage_options=storage_options, + config=config, **kwargs, ) ) diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 5372d5ec50..a01145b3b2 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -20,7 +20,7 @@ from zarr.abc.store import ByteGetter, ByteRangeRequest, ByteSetter from zarr.codecs.bytes import BytesCodec from zarr.codecs.crc32c_ import Crc32cCodec -from zarr.core.array_spec import ArraySpec +from zarr.core.array_spec import ArrayConfig, ArraySpec from zarr.core.buffer import ( Buffer, BufferPrototype, @@ -665,7 +665,9 @@ def _get_index_chunk_spec(self, chunks_per_shard: ChunkCoords) -> ArraySpec: shape=chunks_per_shard + (2,), dtype=np.dtype(" ArraySpec: shape=self.chunk_shape, dtype=shard_spec.dtype, fill_value=shard_spec.fill_value, - order=shard_spec.order, + config=shard_spec.config, prototype=shard_spec.prototype, ) diff --git a/src/zarr/codecs/transpose.py b/src/zarr/codecs/transpose.py index 3a471beaf5..1aa1eb40e2 100644 --- a/src/zarr/codecs/transpose.py +++ b/src/zarr/codecs/transpose.py @@ -84,7 +84,7 @@ def resolve_metadata(self, chunk_spec: ArraySpec) -> ArraySpec: shape=tuple(chunk_spec.shape[self.order[i]] for i in range(chunk_spec.ndim)), dtype=chunk_spec.dtype, fill_value=chunk_spec.fill_value, - order=chunk_spec.order, + config=chunk_spec.config, prototype=chunk_spec.prototype, ) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 07ed0e5069..717eff36dc 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -15,6 +15,7 @@ from zarr.abc.store import Store, set_or_delete from zarr.codecs._v2 import V2Codec from zarr.core._info import ArrayInfo +from zarr.core.array_spec import ArrayConfig, ArrayConfigParams, normalize_array_config from zarr.core.attributes import Attributes from zarr.core.buffer import ( BufferPrototype, @@ -37,12 +38,14 @@ MemoryOrder, ShapeLike, ZarrFormat, + _warn_order_kwarg, concurrent_map, parse_dtype, + parse_order, parse_shapelike, product, ) -from zarr.core.config import config, parse_indexing_order +from zarr.core.config import config as zarr_config from zarr.core.indexing import ( BasicIndexer, BasicSelection, @@ -187,8 +190,8 @@ class AsyncArray(Generic[T_ArrayMetadata]): The metadata of the array. store_path : StorePath The path to the Zarr store. - order : {'C', 'F'}, optional - The order of the array data in memory, by default None. + config : ArrayConfig, optional + The runtime configuration of the array, by default None. Attributes ---------- @@ -198,21 +201,21 @@ class AsyncArray(Generic[T_ArrayMetadata]): The path to the Zarr store. codec_pipeline : CodecPipeline The codec pipeline used for encoding and decoding chunks. - order : {'C', 'F'} - The order of the array data in memory. + _config : ArrayConfig + The runtime configuration of the array. """ metadata: T_ArrayMetadata store_path: StorePath codec_pipeline: CodecPipeline = field(init=False) - order: MemoryOrder + _config: ArrayConfig @overload def __init__( self: AsyncArray[ArrayV2Metadata], metadata: ArrayV2Metadata | ArrayV2MetadataDict, store_path: StorePath, - order: MemoryOrder | None = None, + config: ArrayConfig | None = None, ) -> None: ... @overload @@ -220,14 +223,14 @@ def __init__( self: AsyncArray[ArrayV3Metadata], metadata: ArrayV3Metadata | ArrayV3MetadataDict, store_path: StorePath, - order: MemoryOrder | None = None, + config: ArrayConfig | None = None, ) -> None: ... def __init__( self, metadata: ArrayMetadata | ArrayMetadataDict, store_path: StorePath, - order: MemoryOrder | None = None, + config: ArrayConfig | None = None, ) -> None: if isinstance(metadata, dict): zarr_format = metadata["zarr_format"] @@ -241,11 +244,12 @@ def __init__( raise ValueError(f"Invalid zarr_format: {zarr_format}. Expected 2 or 3") metadata_parsed = parse_array_metadata(metadata) - order_parsed = parse_indexing_order(order or config.get("array.order")) + + config = ArrayConfig.from_dict({}) if config is None else config object.__setattr__(self, "metadata", metadata_parsed) object.__setattr__(self, "store_path", store_path) - object.__setattr__(self, "order", order_parsed) + object.__setattr__(self, "_config", config) object.__setattr__(self, "codec_pipeline", create_codec_pipeline(metadata=metadata_parsed)) # this overload defines the function signature when zarr_format is 2 @@ -269,6 +273,7 @@ async def create( # runtime overwrite: bool = False, data: npt.ArrayLike | None = None, + config: ArrayConfig | ArrayConfigParams | None = None, ) -> AsyncArray[ArrayV2Metadata]: ... # this overload defines the function signature when zarr_format is 3 @@ -297,9 +302,9 @@ async def create( # runtime overwrite: bool = False, data: npt.ArrayLike | None = None, + config: ArrayConfig | ArrayConfigParams | None = None, ) -> AsyncArray[ArrayV3Metadata]: ... - # this overload is necessary to handle the case where the `zarr_format` kwarg is unspecified @overload @classmethod async def create( @@ -325,8 +330,8 @@ async def create( # runtime overwrite: bool = False, data: npt.ArrayLike | None = None, + config: ArrayConfig | ArrayConfigParams | None = None, ) -> AsyncArray[ArrayV3Metadata]: ... - @overload @classmethod async def create( @@ -358,6 +363,7 @@ async def create( # runtime overwrite: bool = False, data: npt.ArrayLike | None = None, + config: ArrayConfig | ArrayConfigParams | None = None, ) -> AsyncArray[ArrayV3Metadata] | AsyncArray[ArrayV2Metadata]: ... @classmethod @@ -390,6 +396,7 @@ async def create( # runtime overwrite: bool = False, data: npt.ArrayLike | None = None, + config: ArrayConfig | ArrayConfigParams | None = None, ) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: """ Method to create a new asynchronous array instance. @@ -439,7 +446,11 @@ async def create( The dimension separator (default is "."). V2 only. V3 arrays should use ``chunk_key_encoding`` instead. order : Literal["C", "F"], optional - The order of the array (default is specified by ``array.order`` in :mod:`zarr.core.config`). + The memory of the array (default is "C"). + If ``zarr_format`` is 2, this parameter sets the memory order of the array. + If `zarr_format`` is 3, then this parameter is deprecated, because memory order + is a runtime parameter for Zarr 3 arrays. The recommended way to specify the memory + order for Zarr 3 arrays is via the ``config`` parameter, e.g. ``{'config': 'C'}``. filters : list[dict[str, JSON]], optional Sequence of filters to use to encode chunk data prior to compression. V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor`` @@ -491,6 +502,7 @@ async def create( _chunks = normalize_chunks(chunks, shape, dtype_parsed.itemsize) else: _chunks = normalize_chunks(chunk_shape, shape, dtype_parsed.itemsize) + config_parsed = normalize_array_config(config) result: AsyncArray[ArrayV3Metadata] | AsyncArray[ArrayV2Metadata] if zarr_format == 3: @@ -506,6 +518,10 @@ async def create( raise ValueError( "compressor cannot be used for arrays with version 3. Use bytes-to-bytes codecs instead." ) + + if order is not None: + _warn_order_kwarg() + result = await cls._create_v3( store_path, shape=shape, @@ -517,7 +533,7 @@ async def create( dimension_names=dimension_names, attributes=attributes, overwrite=overwrite, - order=order, + config=config_parsed, ) elif zarr_format == 2: if codecs is not None: @@ -530,6 +546,12 @@ async def create( ) if dimension_names is not None: raise ValueError("dimension_names cannot be used for arrays with version 2.") + + if order is None: + order_parsed = parse_order(zarr_config.get("array.order")) + else: + order_parsed = order + result = await cls._create_v2( store_path, shape=shape, @@ -537,7 +559,8 @@ async def create( chunks=_chunks, dimension_separator=dimension_separator, fill_value=fill_value, - order=order, + order=order_parsed, + config=config_parsed, filters=filters, compressor=compressor, attributes=attributes, @@ -560,8 +583,8 @@ async def _create_v3( shape: ShapeLike, dtype: np.dtype[Any], chunk_shape: ChunkCoords, + config: ArrayConfig, fill_value: Any | None = None, - order: MemoryOrder | None = None, chunk_key_encoding: ( ChunkKeyEncoding | tuple[Literal["default"], Literal[".", "/"]] @@ -614,7 +637,7 @@ async def _create_v3( attributes=attributes or {}, ) - array = cls(metadata=metadata, store_path=store_path, order=order) + array = cls(metadata=metadata, store_path=store_path, config=config) await array._save_metadata(metadata, ensure_parents=True) return array @@ -626,9 +649,10 @@ async def _create_v2( shape: ChunkCoords, dtype: np.dtype[Any], chunks: ChunkCoords, + order: MemoryOrder, + config: ArrayConfig, dimension_separator: Literal[".", "/"] | None = None, fill_value: float | None = None, - order: MemoryOrder | None = None, filters: list[dict[str, JSON]] | None = None, compressor: dict[str, JSON] | None = None, attributes: dict[str, JSON] | None = None, @@ -642,9 +666,6 @@ async def _create_v2( else: await ensure_no_existing_node(store_path, zarr_format=2) - if order is None: - order = parse_indexing_order(config.get("array.order")) - if dimension_separator is None: dimension_separator = "." @@ -667,7 +688,7 @@ async def _create_v2( filters=filters, attributes=attributes, ) - array = cls(metadata=metadata, store_path=store_path, order=order) + array = cls(metadata=metadata, store_path=store_path, config=config) await array._save_metadata(metadata, ensure_parents=True) return array @@ -806,6 +827,17 @@ def dtype(self) -> np.dtype[Any]: """ return self.metadata.dtype + @property + def order(self) -> MemoryOrder: + """Returns the memory order of the array. + + Returns + ------- + bool + Memory order of the array + """ + return self._config.order + @property def attrs(self) -> dict[str, JSON]: """Returns the attributes of the array. @@ -1036,7 +1068,7 @@ async def _get_selection( out_buffer = prototype.nd_buffer.create( shape=indexer.shape, dtype=out_dtype, - order=self.order, + order=self._config.order, fill_value=self.metadata.fill_value, ) if product(indexer.shape) > 0: @@ -1045,7 +1077,9 @@ async def _get_selection( [ ( self.store_path / self.metadata.encode_chunk_key(chunk_coords), - self.metadata.get_chunk_spec(chunk_coords, self.order, prototype=prototype), + self.metadata.get_chunk_spec( + chunk_coords, self._config, prototype=prototype + ), chunk_selection, out_selection, ) @@ -1167,7 +1201,7 @@ async def _set_selection( [ ( self.store_path / self.metadata.encode_chunk_key(chunk_coords), - self.metadata.get_chunk_spec(chunk_coords, self.order, prototype), + self.metadata.get_chunk_spec(chunk_coords, self._config, prototype), chunk_selection, out_selection, ) @@ -1270,7 +1304,7 @@ async def _delete_key(key: str) -> None: for chunk_coords in old_chunk_coords.difference(new_chunk_coords) ], _delete_key, - config.get("async.concurrency"), + zarr_config.get("async.concurrency"), ) # Write new metadata @@ -1503,6 +1537,7 @@ def create( compressor: dict[str, JSON] | None = None, # runtime overwrite: bool = False, + config: ArrayConfig | ArrayConfigParams | None = None, ) -> Array: """Creates a new Array instance from an initialized store. @@ -1545,7 +1580,11 @@ def create( The dimension separator (default is "."). V2 only. V3 arrays should use ``chunk_key_encoding`` instead. order : Literal["C", "F"], optional - The order of the array (default is specified by ``array.order`` in :mod:`zarr.core.config`). + The memory of the array (default is "C"). + If ``zarr_format`` is 2, this parameter sets the memory order of the array. + If `zarr_format`` is 3, then this parameter is deprecated, because memory order + is a runtime parameter for Zarr 3 arrays. The recommended way to specify the memory + order for Zarr 3 arrays is via the ``config`` parameter, e.g. ``{'order': 'C'}``. filters : list[dict[str, JSON]], optional Sequence of filters to use to encode chunk data prior to compression. V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor`` @@ -1588,6 +1627,7 @@ def create( filters=filters, compressor=compressor, overwrite=overwrite, + config=config, ), ) return cls(async_array) @@ -3399,7 +3439,7 @@ def _build_parents( def _get_default_codecs( np_dtype: np.dtype[Any], ) -> list[dict[str, JSON]]: - default_codecs = config.get("array.v3_default_codecs") + default_codecs = zarr_config.get("array.v3_default_codecs") dtype = DataType.from_numpy(np_dtype) if dtype == DataType.string: dtype_key = "string" diff --git a/src/zarr/core/array_spec.py b/src/zarr/core/array_spec.py index c4d9c363fa..ee6934d05f 100644 --- a/src/zarr/core/array_spec.py +++ b/src/zarr/core/array_spec.py @@ -1,23 +1,95 @@ from __future__ import annotations -from dataclasses import dataclass -from typing import TYPE_CHECKING, Any +from dataclasses import dataclass, fields +from typing import TYPE_CHECKING, Any, Literal, Self, TypedDict, cast import numpy as np -from zarr.core.common import MemoryOrder, parse_fill_value, parse_order, parse_shapelike +from zarr.core.common import ( + MemoryOrder, + parse_bool, + parse_fill_value, + parse_order, + parse_shapelike, +) +from zarr.core.config import config as zarr_config if TYPE_CHECKING: + from typing import NotRequired + from zarr.core.buffer import BufferPrototype from zarr.core.common import ChunkCoords +class ArrayConfigParams(TypedDict): + """ + A TypedDict model of the attributes of an ArrayConfig class, but with no required fields. + This allows for partial construction of an ArrayConfig, with the assumption that the unset + keys will be taken from a global configuration. + """ + + order: NotRequired[MemoryOrder] + write_empty_chunks: NotRequired[bool] + + +@dataclass(frozen=True) +class ArrayConfig: + """ + A model of the runtime configuration of an array. + + Parameters + ---------- + order : MemoryOrder + The memory layout of the arrays returned when reading data from the store. + write_empty_chunks : bool + If True, empty chunks will be written to the store. + """ + + order: MemoryOrder + write_empty_chunks: bool + + def __init__(self, order: MemoryOrder, write_empty_chunks: bool) -> None: + order_parsed = parse_order(order) + write_empty_chunks_parsed = parse_bool(write_empty_chunks) + + object.__setattr__(self, "order", order_parsed) + object.__setattr__(self, "write_empty_chunks", write_empty_chunks_parsed) + + @classmethod + def from_dict(cls, data: ArrayConfigParams) -> Self: + """ + Create an ArrayConfig from a dict. The keys of that dict are a subset of the + attributes of the ArrayConfig class. Any keys missing from that dict will be set to the + the values in the ``array`` namespace of ``zarr.config``. + """ + kwargs_out: ArrayConfigParams = {} + for f in fields(ArrayConfig): + field_name = cast(Literal["order", "write_empty_chunks"], f.name) + if field_name not in data: + kwargs_out[field_name] = zarr_config.get(f"array.{field_name}") + else: + kwargs_out[field_name] = data[field_name] + return cls(**kwargs_out) + + +def normalize_array_config(data: ArrayConfig | ArrayConfigParams | None) -> ArrayConfig: + """ + Convert various types of data to an ArrayConfig. + """ + if data is None: + return ArrayConfig.from_dict({}) + elif isinstance(data, ArrayConfig): + return data + else: + return ArrayConfig.from_dict(data) + + @dataclass(frozen=True) class ArraySpec: shape: ChunkCoords dtype: np.dtype[Any] fill_value: Any - order: MemoryOrder + config: ArrayConfig prototype: BufferPrototype def __init__( @@ -25,20 +97,23 @@ def __init__( shape: ChunkCoords, dtype: np.dtype[Any], fill_value: Any, - order: MemoryOrder, + config: ArrayConfig, prototype: BufferPrototype, ) -> None: shape_parsed = parse_shapelike(shape) dtype_parsed = np.dtype(dtype) fill_value_parsed = parse_fill_value(fill_value) - order_parsed = parse_order(order) object.__setattr__(self, "shape", shape_parsed) object.__setattr__(self, "dtype", dtype_parsed) object.__setattr__(self, "fill_value", fill_value_parsed) - object.__setattr__(self, "order", order_parsed) + object.__setattr__(self, "config", config) object.__setattr__(self, "prototype", prototype) @property def ndim(self) -> int: return len(self.shape) + + @property + def order(self) -> MemoryOrder: + return self.config.order diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py index 038a2eeac2..5a1f069823 100644 --- a/src/zarr/core/codec_pipeline.py +++ b/src/zarr/core/codec_pipeline.py @@ -360,7 +360,7 @@ async def _read_key( _read_key, config.get("async.concurrency"), ) - chunk_array_batch = await self.decode_batch( + chunk_array_decoded = await self.decode_batch( [ (chunk_bytes, chunk_spec) for chunk_bytes, (_, chunk_spec, _, _) in zip( @@ -369,23 +369,27 @@ async def _read_key( ], ) - chunk_array_batch = [ + chunk_array_merged = [ self._merge_chunk_array( chunk_array, value, out_selection, chunk_spec, chunk_selection, drop_axes ) for chunk_array, (_, chunk_spec, chunk_selection, out_selection) in zip( - chunk_array_batch, batch_info, strict=False - ) - ] - - chunk_array_batch = [ - None - if chunk_array is None or chunk_array.all_equal(chunk_spec.fill_value) - else chunk_array - for chunk_array, (_, chunk_spec, _, _) in zip( - chunk_array_batch, batch_info, strict=False + chunk_array_decoded, batch_info, strict=False ) ] + chunk_array_batch: list[NDBuffer | None] = [] + for chunk_array, (_, chunk_spec, _, _) in zip( + chunk_array_merged, batch_info, strict=False + ): + if chunk_array is None: + chunk_array_batch.append(None) # type: ignore[unreachable] + else: + if not chunk_spec.config.write_empty_chunks and chunk_array.all_equal( + chunk_spec.fill_value + ): + chunk_array_batch.append(None) + else: + chunk_array_batch.append(chunk_array) chunk_bytes_batch = await self.encode_batch( [ diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index a4bf33451c..3db00b1a06 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -3,6 +3,7 @@ import asyncio import functools import operator +import warnings from collections.abc import Iterable, Mapping from enum import Enum from itertools import starmap @@ -160,6 +161,12 @@ def parse_order(data: Any) -> Literal["C", "F"]: raise ValueError(f"Expected one of ('C', 'F'), got {data} instead.") +def parse_bool(data: Any) -> bool: + if isinstance(data, bool): + return data + raise ValueError(f"Expected bool, got {data} instead.") + + def parse_dtype(dtype: Any, zarr_format: ZarrFormat) -> np.dtype[Any]: if dtype is str or dtype == "str": if zarr_format == 2: @@ -168,3 +175,25 @@ def parse_dtype(dtype: Any, zarr_format: ZarrFormat) -> np.dtype[Any]: else: return _STRING_DTYPE return np.dtype(dtype) + + +def _warn_write_empty_chunks_kwarg() -> None: + # TODO: link to docs page on array configuration in this message + msg = ( + "The `write_empty_chunks` keyword argument is deprecated and will be removed in future versions. " + "To control whether empty chunks are written to storage, either use the `config` keyword " + "argument, as in `config={'write_empty_chunks: True}`," + "or change the global 'array.write_empty_chunks' configuration variable." + ) + warnings.warn(msg, RuntimeWarning, stacklevel=2) + + +def _warn_order_kwarg() -> None: + # TODO: link to docs page on array configuration in this message + msg = ( + "The `order` keyword argument has no effect for zarr v3 arrays. " + "To control the memory layout of the array, either use the `config` keyword " + "argument, as in `config={'order: 'C'}`," + "or change the global 'array.order' configuration variable." + ) + warnings.warn(msg, RuntimeWarning, stacklevel=2) diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index 1feb4a6c2f..a14305aef8 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -65,6 +65,7 @@ def reset(self) -> None: "default_zarr_version": 3, "array": { "order": "C", + "write_empty_chunks": False, "v2_default_compressor": { "numeric": "zstd", "string": "vlen-utf8", diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index bd0fbecf4a..bf6b576edd 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -22,7 +22,7 @@ import numcodecs import numpy as np -from zarr.core.array_spec import ArraySpec +from zarr.core.array_spec import ArrayConfig, ArraySpec from zarr.core.chunk_grids import RegularChunkGrid from zarr.core.chunk_key_encodings import parse_separator from zarr.core.common import ZARRAY_JSON, ZATTRS_JSON, MemoryOrder, parse_shapelike @@ -186,13 +186,13 @@ def to_dict(self) -> dict[str, JSON]: return zarray_dict def get_chunk_spec( - self, _chunk_coords: ChunkCoords, order: MemoryOrder, prototype: BufferPrototype + self, _chunk_coords: ChunkCoords, array_config: ArrayConfig, prototype: BufferPrototype ) -> ArraySpec: return ArraySpec( shape=self.chunks, dtype=self.dtype, fill_value=self.fill_value, - order=order, + config=array_config, prototype=prototype, ) diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 8dcceb7f31..4cf5860ffd 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -24,14 +24,13 @@ import numpy.typing as npt from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec -from zarr.core.array_spec import ArraySpec +from zarr.core.array_spec import ArrayConfig, ArraySpec from zarr.core.chunk_grids import ChunkGrid, RegularChunkGrid from zarr.core.chunk_key_encodings import ChunkKeyEncoding from zarr.core.common import ( JSON, ZARR_JSON, ChunkCoords, - MemoryOrder, parse_named_configuration, parse_shapelike, ) @@ -252,7 +251,7 @@ def __init__( shape=shape_parsed, dtype=data_type_parsed.to_numpy(), fill_value=fill_value_parsed, - order="C", # TODO: order is not needed here. + config=ArrayConfig.from_dict({}), # TODO: config is not needed here. prototype=default_buffer_prototype(), # TODO: prototype is not needed here. ) codecs_parsed = [c.evolve_from_array_spec(array_spec) for c in codecs_parsed_partial] @@ -298,7 +297,7 @@ def ndim(self) -> int: return len(self.shape) def get_chunk_spec( - self, _chunk_coords: ChunkCoords, order: MemoryOrder, prototype: BufferPrototype + self, _chunk_coords: ChunkCoords, array_config: ArrayConfig, prototype: BufferPrototype ) -> ArraySpec: assert isinstance( self.chunk_grid, RegularChunkGrid @@ -307,7 +306,7 @@ def get_chunk_spec( shape=self.chunk_grid.chunk_shape, dtype=self.dtype, fill_value=self.fill_value, - order=order, + config=array_config, prototype=prototype, ) diff --git a/tests/test_api.py b/tests/test_api.py index f98565ad68..d25ec54bfe 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -56,6 +56,21 @@ def test_create_array(memory_store: Store) -> None: z = create(shape=(400, 100), chunks=(16, 16.5), store=store, overwrite=True) # type: ignore [arg-type] +@pytest.mark.parametrize("write_empty_chunks", [True, False]) +def test_write_empty_chunks_warns(write_empty_chunks: bool) -> None: + """ + Test that using the `write_empty_chunks` kwarg on array access will raise a warning. + """ + match = "The `write_empty_chunks` keyword argument .*" + with pytest.warns(RuntimeWarning, match=match): + _ = zarr.array( + data=np.arange(10), shape=(10,), dtype="uint8", write_empty_chunks=write_empty_chunks + ) + + with pytest.warns(RuntimeWarning, match=match): + _ = zarr.create(shape=(10,), dtype="uint8", write_empty_chunks=write_empty_chunks) + + @pytest.mark.parametrize("path", ["foo", "/", "/foo", "///foo/bar"]) @pytest.mark.parametrize("node_type", ["array", "group"]) def test_open_normalized_path( @@ -245,10 +260,26 @@ def test_open_with_mode_w_minus(tmp_path: pathlib.Path) -> None: zarr.open(store=tmp_path, mode="w-") -@pytest.mark.parametrize("order", ["C", "F", None]) @pytest.mark.parametrize("zarr_format", [2, 3]) -def test_array_order(order: MemoryOrder | None, zarr_format: ZarrFormat) -> None: - arr = zarr.ones(shape=(2, 2), order=order, zarr_format=zarr_format) +def test_array_order(zarr_format: ZarrFormat) -> None: + arr = zarr.ones(shape=(2, 2), order=None, zarr_format=zarr_format) + expected = zarr.config.get("array.order") + assert arr.order == expected + + vals = np.asarray(arr) + if expected == "C": + assert vals.flags.c_contiguous + elif expected == "F": + assert vals.flags.f_contiguous + else: + raise AssertionError + + +@pytest.mark.parametrize("order", ["C", "F"]) +@pytest.mark.parametrize("zarr_format", [2, 3]) +def test_array_order_warns(order: MemoryOrder | None, zarr_format: ZarrFormat) -> None: + with pytest.warns(RuntimeWarning, match="The `order` keyword argument .*"): + arr = zarr.ones(shape=(2, 2), order=order, zarr_format=zarr_format) expected = order or zarr.config.get("array.order") assert arr.order == expected diff --git a/tests/test_array.py b/tests/test_array.py index c89b6187c3..1899e384dc 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -748,14 +748,42 @@ def test_append_bad_shape(store: MemoryStore, zarr_format: ZarrFormat) -> None: @pytest.mark.parametrize("order", ["C", "F", None]) -@pytest.mark.parametrize("zarr_format", [2, 3]) @pytest.mark.parametrize("store", ["memory"], indirect=True) -def test_array_create_order( +def test_array_create_metadata_order_v2( order: MemoryOrder | None, zarr_format: int, store: MemoryStore ) -> None: - arr = Array.create(store=store, shape=(2, 2), order=order, zarr_format=zarr_format, dtype="i4") + """ + Test that the ``order`` attribute in zarr v2 array metadata is set correctly via the ``order`` + keyword argument to ``Array.create``. When ``order`` is ``None``, the value of the + ``array.order`` config is used. + """ + arr = Array.create(store=store, shape=(2, 2), order=order, zarr_format=2, dtype="i4") + expected = order or zarr.config.get("array.order") - assert arr.order == expected + assert arr.metadata.order == expected # type: ignore[union-attr] + + +@pytest.mark.parametrize("order_config", ["C", "F", None]) +@pytest.mark.parametrize("store", ["memory"], indirect=True) +def test_array_create_order( + order_config: MemoryOrder | None, + zarr_format: int, + store: MemoryStore, +) -> None: + """ + Test that the arrays generated by array indexing have a memory order defined by the config order + value + """ + if order_config is None: + config = {} + expected = zarr.config.get("array.order") + else: + config = {"order": order_config} + expected = order_config + + arr = Array.create( + store=store, shape=(2, 2), zarr_format=zarr_format, dtype="i4", config=config + ) vals = np.asarray(arr) if expected == "C": @@ -766,6 +794,57 @@ def test_array_create_order( raise AssertionError +@pytest.mark.parametrize("write_empty_chunks", [True, False]) +def test_write_empty_chunks_config(write_empty_chunks: bool) -> None: + """ + Test that the value of write_empty_chunks is sensitive to the global config when not set + explicitly + """ + with zarr.config.set({"array.write_empty_chunks": write_empty_chunks}): + arr = Array.create({}, shape=(2, 2), dtype="i4") + assert arr._async_array._config.write_empty_chunks == write_empty_chunks + + +@pytest.mark.parametrize("store", ["memory"], indirect=True) +@pytest.mark.parametrize("write_empty_chunks", [True, False]) +@pytest.mark.parametrize("fill_value", [0, 5]) +def test_write_empty_chunks_behavior( + zarr_format: ZarrFormat, store: MemoryStore, write_empty_chunks: bool, fill_value: int +) -> None: + """ + Check that the write_empty_chunks value of the config is applied correctly. We expect that + when write_empty_chunks is True, writing chunks equal to the fill value will result in + those chunks appearing in the store. + + When write_empty_chunks is False, writing chunks that are equal to the fill value will result in + those chunks not being present in the store. In particular, they should be deleted if they were + already present. + """ + + arr = Array.create( + store=store, + shape=(2,), + zarr_format=zarr_format, + dtype="i4", + fill_value=fill_value, + chunk_shape=(1,), + config={"write_empty_chunks": write_empty_chunks}, + ) + + assert arr._async_array._config.write_empty_chunks == write_empty_chunks + + # initialize the store with some non-fill value chunks + arr[:] = fill_value + 1 + assert arr.nchunks_initialized == arr.nchunks + + arr[:] = fill_value + + if not write_empty_chunks: + assert arr.nchunks_initialized == 0 + else: + assert arr.nchunks_initialized == arr.nchunks + + @pytest.mark.parametrize( ("fill_value", "expected"), [ diff --git a/tests/test_config.py b/tests/test_config.py index 8dd15fb75b..ea8e70a994 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -52,6 +52,7 @@ def test_config_defaults_set() -> None: "default_zarr_version": 3, "array": { "order": "C", + "write_empty_chunks": False, "v2_default_compressor": { "numeric": "zstd", "string": "vlen-utf8", diff --git a/tests/test_v2.py b/tests/test_v2.py index ef06c13e26..80897db8e5 100644 --- a/tests/test_v2.py +++ b/tests/test_v2.py @@ -152,7 +152,8 @@ async def test_create_dtype_str(dtype: Any) -> None: @pytest.mark.parametrize("order", ["C", "F"]) def test_v2_filters_codecs(filters: Any, order: Literal["C", "F"]) -> None: array_fixture = [42] - arr = zarr.create(shape=1, dtype="