Skip to content

Commit 6dc6d07

Browse files
d-v-b and normanrz authored
Feat/write empty chunks (#2429)
* add write_empty_chunks to config.array namespace
* use write_empty_chunks from config in write_batch
* implement config-sensitive write_empty_chunks in write_batch, and add a test
* add literacy to test
* add warnings when write_empty_chunks is used as a kwarg
* init
* add ArrayConfig
* docstring
* ignore warning
* fix v2 test
* add test to ensure that write_empty_chunks can be set via the global config
* fix tests
* remove write_empty_chunks from Array.create; separate metadata order from config order
* remove missing overload
* Update src/zarr/core/array.py (Co-authored-by: Norman Rzepka <[email protected]>)
* Update src/zarr/core/array.py (Co-authored-by: Norman Rzepka <[email protected]>)
---------
Co-authored-by: Norman Rzepka <[email protected]>
1 parent 6930fe8 commit 6dc6d07

File tree

15 files changed

+399
-109
lines changed

15 files changed

+399
-109
lines changed

src/zarr/api/asynchronous.py

Lines changed: 55 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,16 @@
1010
from typing_extensions import deprecated
1111

1212
from zarr.core.array import Array, AsyncArray, get_array_metadata
13+
from zarr.core.array_spec import ArrayConfig, ArrayConfigParams
1314
from zarr.core.buffer import NDArrayLike
1415
from zarr.core.common import (
1516
JSON,
1617
AccessModeLiteral,
1718
ChunkCoords,
1819
MemoryOrder,
1920
ZarrFormat,
21+
_warn_order_kwarg,
22+
_warn_write_empty_chunks_kwarg,
2023
parse_dtype,
2124
)
2225
from zarr.core.config import config
@@ -794,7 +797,7 @@ async def create(
794797
read_only: bool | None = None,
795798
object_codec: Codec | None = None, # TODO: type has changed
796799
dimension_separator: Literal[".", "/"] | None = None,
797-
write_empty_chunks: bool = False, # TODO: default has changed
800+
write_empty_chunks: bool | None = None,
798801
zarr_version: ZarrFormat | None = None, # deprecated
799802
zarr_format: ZarrFormat | None = None,
800803
meta_array: Any | None = None, # TODO: need type
@@ -810,6 +813,7 @@ async def create(
810813
codecs: Iterable[Codec | dict[str, JSON]] | None = None,
811814
dimension_names: Iterable[str] | None = None,
812815
storage_options: dict[str, Any] | None = None,
816+
config: ArrayConfig | ArrayConfigParams | None = None,
813817
**kwargs: Any,
814818
) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]:
815819
"""Create an array.
@@ -856,8 +860,10 @@ async def create(
856860
These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`.
fill_value : object
857861
Default value to use for uninitialized portions of the array.
858862
order : {'C', 'F'}, optional
863+
Deprecated in favor of the ``config`` keyword argument.
864+
Pass ``{'order': <value>}`` to ``create`` instead of using this parameter.
859865
Memory layout to be used within each chunk.
860-
If not specified, default is taken from the Zarr config ```array.order```.
866+
If not specified, the ``array.order`` parameter in the global config will be used.
861867
store : Store or str
862868
Store or path to directory in file system or name of zip file.
863869
synchronizer : object, optional
@@ -891,30 +897,26 @@ async def create(
891897
Separator placed between the dimensions of a chunk.
892898
V2 only. V3 arrays should use ``chunk_key_encoding`` instead.
893899
Default is ".".
894-
.. versionadded:: 2.8
895-
896900
write_empty_chunks : bool, optional
897-
If True (default), all chunks will be stored regardless of their
901+
Deprecated in favor of the ``config`` keyword argument.
902+
Pass ``{'write_empty_chunks': <value>}`` to ``create`` instead of using this parameter.
903+
If True, all chunks will be stored regardless of their
898904
contents. If False, each chunk is compared to the array's fill value
899905
prior to storing. If a chunk is uniformly equal to the fill value, then
900906
that chunk is not stored, and the store entry for that chunk's key
901-
is deleted. This setting enables sparser storage, as only chunks with
902-
non-fill-value data are stored, at the expense of overhead associated
903-
with checking the data of each chunk.
904-
905-
.. versionadded:: 2.11
906-
907+
is deleted.
907908
zarr_format : {2, 3, None}, optional
908909
The zarr format to use when saving.
909910
Default is 3.
910911
meta_array : array-like, optional
911912
An array instance to use for determining arrays to create and return
912913
to users. Use `numpy.empty(())` by default.
913-
914-
.. versionadded:: 2.13
915914
storage_options : dict
916915
If using an fsspec URL to create the store, these will be passed to
917916
the backend implementation. Ignored otherwise.
917+
config : ArrayConfig or ArrayConfigParams, optional
918+
Runtime configuration of the array. If provided, will override the
919+
default values from `zarr.config.array`.
918920
919921
Returns
920922
-------
@@ -951,26 +953,47 @@ async def create(
951953
warnings.warn("object_codec is not yet implemented", RuntimeWarning, stacklevel=2)
952954
if read_only is not None:
953955
warnings.warn("read_only is not yet implemented", RuntimeWarning, stacklevel=2)
954-
if dimension_separator is not None:
955-
if zarr_format == 3:
956-
raise ValueError(
957-
"dimension_separator is not supported for zarr format 3, use chunk_key_encoding instead"
958-
)
959-
else:
960-
warnings.warn(
961-
"dimension_separator is not yet implemented",
962-
RuntimeWarning,
963-
stacklevel=2,
964-
)
965-
if write_empty_chunks:
966-
warnings.warn("write_empty_chunks is not yet implemented", RuntimeWarning, stacklevel=2)
956+
if dimension_separator is not None and zarr_format == 3:
957+
raise ValueError(
958+
"dimension_separator is not supported for zarr format 3, use chunk_key_encoding instead"
959+
)
960+
961+
if order is not None:
962+
_warn_order_kwarg()
963+
if write_empty_chunks is not None:
964+
_warn_write_empty_chunks_kwarg()
965+
967966
if meta_array is not None:
968967
warnings.warn("meta_array is not yet implemented", RuntimeWarning, stacklevel=2)
969968

970969
mode = kwargs.pop("mode", None)
971970
if mode is None:
972971
mode = "a"
973972
store_path = await make_store_path(store, path=path, mode=mode, storage_options=storage_options)
973+
974+
config_dict: ArrayConfigParams = {}
975+
976+
if write_empty_chunks is not None:
977+
if config is not None:
978+
msg = (
979+
"Both write_empty_chunks and config keyword arguments are set. "
980+
"This is redundant. When both are set, write_empty_chunks will be ignored and "
981+
"config will be used."
982+
)
983+
warnings.warn(UserWarning(msg), stacklevel=1)
984+
config_dict["write_empty_chunks"] = write_empty_chunks
985+
if order is not None:
986+
if config is not None:
987+
msg = (
988+
"Both order and config keyword arguments are set. "
989+
"This is redundant. When both are set, order will be ignored and "
990+
"config will be used."
991+
)
992+
warnings.warn(UserWarning(msg), stacklevel=1)
993+
config_dict["order"] = order
994+
995+
config_parsed = ArrayConfig.from_dict(config_dict)
996+
974997
return await AsyncArray.create(
975998
store_path,
976999
shape=shape,
@@ -987,7 +1010,7 @@ async def create(
9871010
codecs=codecs,
9881011
dimension_names=dimension_names,
9891012
attributes=attributes,
990-
order=order,
1013+
config=config_parsed,
9911014
**kwargs,
9921015
)
9931016

@@ -1163,6 +1186,11 @@ async def open_array(
11631186

11641187
zarr_format = _handle_zarr_version_or_format(zarr_version=zarr_version, zarr_format=zarr_format)
11651188

1189+
if "order" in kwargs:
1190+
_warn_order_kwarg()
1191+
if "write_empty_chunks" in kwargs:
1192+
_warn_write_empty_chunks_kwarg()
1193+
11661194
try:
11671195
return await AsyncArray.open(store_path, zarr_format=zarr_format)
11681196
except FileNotFoundError:

src/zarr/api/synchronous.py

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
from zarr.abc.codec import Codec
1919
from zarr.api.asynchronous import ArrayLike, PathLike
20+
from zarr.core.array_spec import ArrayConfig, ArrayConfigParams
2021
from zarr.core.buffer import NDArrayLike
2122
from zarr.core.chunk_key_encodings import ChunkKeyEncoding
2223
from zarr.core.common import JSON, AccessModeLiteral, ChunkCoords, MemoryOrder, ZarrFormat
@@ -542,7 +543,7 @@ def create(
542543
read_only: bool | None = None,
543544
object_codec: Codec | None = None, # TODO: type has changed
544545
dimension_separator: Literal[".", "/"] | None = None,
545-
write_empty_chunks: bool = False, # TODO: default has changed
546+
write_empty_chunks: bool | None = None, # TODO: default has changed
546547
zarr_version: ZarrFormat | None = None, # deprecated
547548
zarr_format: ZarrFormat | None = None,
548549
meta_array: Any | None = None, # TODO: need type
@@ -558,6 +559,7 @@ def create(
558559
codecs: Iterable[Codec | dict[str, JSON]] | None = None,
559560
dimension_names: Iterable[str] | None = None,
560561
storage_options: dict[str, Any] | None = None,
562+
config: ArrayConfig | ArrayConfigParams | None = None,
561563
**kwargs: Any,
562564
) -> Array:
563565
"""Create an array.
@@ -578,8 +580,10 @@ def create(
578580
fill_value : object
579581
Default value to use for uninitialized portions of the array.
580582
order : {'C', 'F'}, optional
583+
Deprecated in favor of the ``config`` keyword argument.
584+
Pass ``{'order': <value>}`` to ``create`` instead of using this parameter.
581585
Memory layout to be used within each chunk.
582-
Default is set in Zarr's config (`array.order`).
586+
If not specified, the ``array.order`` parameter in the global config will be used.
583587
store : Store or str
584588
Store or path to directory in file system or name of zip file.
585589
synchronizer : object, optional
@@ -609,30 +613,25 @@ def create(
609613
A codec to encode object arrays, only needed if dtype=object.
610614
dimension_separator : {'.', '/'}, optional
611615
Separator placed between the dimensions of a chunk.
612-
613-
.. versionadded:: 2.8
614-
615616
write_empty_chunks : bool, optional
616-
If True (default), all chunks will be stored regardless of their
617+
Deprecated in favor of the ``config`` keyword argument.
618+
Pass ``{'write_empty_chunks': <value>}`` to ``create`` instead of using this parameter.
619+
If True, all chunks will be stored regardless of their
617620
contents. If False, each chunk is compared to the array's fill value
618621
prior to storing. If a chunk is uniformly equal to the fill value, then
619622
that chunk is not stored, and the store entry for that chunk's key
620-
is deleted. This setting enables sparser storage, as only chunks with
621-
non-fill-value data are stored, at the expense of overhead associated
622-
with checking the data of each chunk.
623-
624-
.. versionadded:: 2.11
625-
623+
is deleted.
626624
zarr_format : {2, 3, None}, optional
627625
The zarr format to use when saving.
628626
meta_array : array-like, optional
629627
An array instance to use for determining arrays to create and return
630628
to users. Use `numpy.empty(())` by default.
631-
632-
.. versionadded:: 2.13
633629
storage_options : dict
634630
If using an fsspec URL to create the store, these will be passed to
635631
the backend implementation. Ignored otherwise.
632+
config : ArrayConfig or ArrayConfigParams, optional
633+
Runtime configuration of the array. If provided, will override the
634+
default values from `zarr.config.array`.
636635
637636
Returns
638637
-------
@@ -669,6 +668,7 @@ def create(
669668
codecs=codecs,
670669
dimension_names=dimension_names,
671670
storage_options=storage_options,
671+
config=config,
672672
**kwargs,
673673
)
674674
)

src/zarr/codecs/sharding.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
from zarr.abc.store import ByteGetter, ByteRangeRequest, ByteSetter
2121
from zarr.codecs.bytes import BytesCodec
2222
from zarr.codecs.crc32c_ import Crc32cCodec
23-
from zarr.core.array_spec import ArraySpec
23+
from zarr.core.array_spec import ArrayConfig, ArraySpec
2424
from zarr.core.buffer import (
2525
Buffer,
2626
BufferPrototype,
@@ -665,7 +665,9 @@ def _get_index_chunk_spec(self, chunks_per_shard: ChunkCoords) -> ArraySpec:
665665
shape=chunks_per_shard + (2,),
666666
dtype=np.dtype("<u8"),
667667
fill_value=MAX_UINT_64,
668-
order="C", # Note: this is hard-coded for simplicity -- it is not surfaced into user code
668+
config=ArrayConfig(
669+
order="C", write_empty_chunks=False
670+
), # Note: this is hard-coded for simplicity -- it is not surfaced into user code,
669671
prototype=numpy_buffer_prototype(),
670672
)
671673

@@ -674,7 +676,7 @@ def _get_chunk_spec(self, shard_spec: ArraySpec) -> ArraySpec:
674676
shape=self.chunk_shape,
675677
dtype=shard_spec.dtype,
676678
fill_value=shard_spec.fill_value,
677-
order=shard_spec.order,
679+
config=shard_spec.config,
678680
prototype=shard_spec.prototype,
679681
)
680682

src/zarr/codecs/transpose.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ def resolve_metadata(self, chunk_spec: ArraySpec) -> ArraySpec:
8484
shape=tuple(chunk_spec.shape[self.order[i]] for i in range(chunk_spec.ndim)),
8585
dtype=chunk_spec.dtype,
8686
fill_value=chunk_spec.fill_value,
87-
order=chunk_spec.order,
87+
config=chunk_spec.config,
8888
prototype=chunk_spec.prototype,
8989
)
9090

0 commit comments

Comments
 (0)