Commit 065376b

Merge branch 'main' into fix_2816
2 parents: 40f337f + feeb08f

File tree: 14 files changed, +183 -100 lines

14 files changed

+183
-100
lines changed

changes/2784.feature.rst

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+Avoid reading chunks during writes where possible. :issue:`757`
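A rough, standalone sketch of the idea behind this entry (not zarr's actual API; the helper name and arguments below are hypothetical): when a write covers an entire chunk, the new data can replace the chunk outright, so the existing chunk bytes never need to be read or decoded.

import numpy as np

# Hypothetical helper, for illustration only: skip reading a chunk when the
# incoming write covers it completely.
def write_chunk(read_existing, value, chunk_selection, chunk_shape, fill_value, is_complete_chunk):
    if is_complete_chunk and value.shape == tuple(chunk_shape):
        return value  # full overwrite: no read needed
    existing = read_existing()  # partial overwrite: fetch the stored chunk, if any
    if existing is None:
        # chunk does not exist yet: initialise it to the fill value
        chunk = np.full(chunk_shape, fill_value, dtype=value.dtype)
    else:
        chunk = existing.copy()
    chunk[chunk_selection] = value
    return chunk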

changes/2799.bugfix.rst

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+Initialise empty chunks to the default fill value during writing and add default fill values for datetime, timedelta, structured, and other (void* fixed size) data types
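A hedged sketch of what such dtype defaults could look like (the function name and exact choices below are illustrative assumptions, not zarr's `_default_fill_value` implementation): datetime and timedelta dtypes default to NaT, while structured/void and other fixed-size dtypes default to zeroed scalars.

import numpy as np

# Illustrative guess at per-dtype defaults; zarr's actual _default_fill_value
# may differ in detail.
def default_fill_value(dtype: np.dtype):
    if dtype.kind in ("M", "m"):              # datetime64 / timedelta64
        return dtype.type("NaT")              # "not a time"
    if dtype.kind in ("V", "S", "U"):         # structured/void, bytes, str
        return np.zeros((), dtype=dtype)[()]  # zeroed scalar of that dtype
    return dtype.type(0)

print(default_fill_value(np.dtype("datetime64[ns]")))            # NaT
print(default_fill_value(np.dtype([("a", "i4"), ("b", "f8")])))  # (0, 0.)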

src/zarr/abc/codec.py

Lines changed: 2 additions & 2 deletions
@@ -357,7 +357,7 @@ async def encode(
     @abstractmethod
     async def read(
         self,
-        batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple]],
+        batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool]],
         out: NDBuffer,
         drop_axes: tuple[int, ...] = (),
     ) -> None:
@@ -379,7 +379,7 @@ async def read(
     @abstractmethod
     async def write(
         self,
-        batch_info: Iterable[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple]],
+        batch_info: Iterable[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple, bool]],
         value: NDBuffer,
         drop_axes: tuple[int, ...] = (),
     ) -> None:
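To make the widened tuple concrete, here is a tiny, self-contained illustration (a hypothetical NamedTuple, not zarr's actual types, which are plain tuples): each batch entry now carries a trailing bool marking whether the selection covers the whole chunk, so implementations can decide up front whether reading the stored chunk is needed.

from typing import Any, NamedTuple

# Hypothetical stand-in for one batch_info entry, for illustration only.
class BatchEntry(NamedTuple):
    byte_getter: Any          # store access object (ByteGetter/ByteSetter in zarr)
    chunk_spec: Any           # per-chunk ArraySpec
    chunk_selection: tuple    # selection within the chunk
    out_selection: tuple      # selection within the output/value buffer
    is_complete_chunk: bool   # True -> the selection covers the entire chunk

entry = BatchEntry(None, None, (slice(0, 10),), (slice(0, 10),), True)
if entry.is_complete_chunk:
    print("full overwrite: existing chunk bytes need not be read")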

src/zarr/codecs/sharding.py

Lines changed: 9 additions & 5 deletions
@@ -455,8 +455,9 @@ async def _decode_single(
                     chunk_spec,
                     chunk_selection,
                     out_selection,
+                    is_complete_shard,
                 )
-                for chunk_coords, chunk_selection, out_selection in indexer
+                for chunk_coords, chunk_selection, out_selection, is_complete_shard in indexer
             ],
             out,
         )
@@ -486,7 +487,7 @@ async def _decode_partial_single(
         )
 
         indexed_chunks = list(indexer)
-        all_chunk_coords = {chunk_coords for chunk_coords, _, _ in indexed_chunks}
+        all_chunk_coords = {chunk_coords for chunk_coords, *_ in indexed_chunks}
 
         # reading bytes of all requested chunks
         shard_dict: ShardMapping = {}
@@ -524,8 +525,9 @@ async def _decode_partial_single(
                     chunk_spec,
                     chunk_selection,
                     out_selection,
+                    is_complete_shard,
                 )
-                for chunk_coords, chunk_selection, out_selection in indexer
+                for chunk_coords, chunk_selection, out_selection, is_complete_shard in indexer
             ],
             out,
         )
@@ -562,8 +564,9 @@ async def _encode_single(
                     chunk_spec,
                     chunk_selection,
                     out_selection,
+                    is_complete_shard,
                 )
-                for chunk_coords, chunk_selection, out_selection in indexer
+                for chunk_coords, chunk_selection, out_selection, is_complete_shard in indexer
             ],
             shard_array,
         )
@@ -605,8 +608,9 @@ async def _encode_partial_single(
                     chunk_spec,
                     chunk_selection,
                     out_selection,
+                    is_complete_shard,
                 )
-                for chunk_coords, chunk_selection, out_selection in indexer
+                for chunk_coords, chunk_selection, out_selection, is_complete_shard in indexer
             ],
             shard_array,
         )
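The `chunk_coords, *_` pattern used above is plain Python extended unpacking; a minimal, self-contained example (the data below is made up) shows how it keeps a set comprehension working when the indexer starts yielding an extra is_complete_shard element.

# Fabricated indexer output: (chunk_coords, chunk_selection, out_selection, is_complete_shard)
indexed_chunks = [
    ((0, 0), (slice(0, 4),), (slice(0, 4),), True),
    ((0, 1), (slice(0, 2),), (slice(4, 6),), False),
]

# `*_` swallows any number of trailing elements, so this line is unaffected
# by the tuple growing from three items to four.
all_chunk_coords = {chunk_coords for chunk_coords, *_ in indexed_chunks}
print(all_chunk_coords)  # {(0, 0), (0, 1)}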

src/zarr/core/array.py

Lines changed: 4 additions & 2 deletions
@@ -1290,8 +1290,9 @@ async def _get_selection(
                         self.metadata.get_chunk_spec(chunk_coords, _config, prototype=prototype),
                         chunk_selection,
                         out_selection,
+                        is_complete_chunk,
                     )
-                    for chunk_coords, chunk_selection, out_selection in indexer
+                    for chunk_coords, chunk_selection, out_selection, is_complete_chunk in indexer
                 ],
                 out_buffer,
                 drop_axes=indexer.drop_axes,
@@ -1417,8 +1418,9 @@ async def _set_selection(
                         self.metadata.get_chunk_spec(chunk_coords, _config, prototype),
                         chunk_selection,
                         out_selection,
+                        is_complete_chunk,
                     )
-                    for chunk_coords, chunk_selection, out_selection in indexer
+                    for chunk_coords, chunk_selection, out_selection, is_complete_chunk in indexer
                 ],
                 value_buffer,
                 drop_axes=indexer.drop_axes,
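For intuition about the new per-chunk flag, a hedged sketch follows (the `covers_whole_chunk` helper is hypothetical; zarr's indexer computes this while building the chunk projections): `is_complete_chunk` is true exactly when the selection touches every element of the chunk, which is the case where a write can overwrite the chunk without reading it first.

# Hypothetical check, for illustration only: does a per-chunk slice selection
# cover the whole chunk?
def covers_whole_chunk(chunk_selection: tuple, chunk_shape: tuple) -> bool:
    return all(
        (sel.start or 0) == 0 and sel.stop is not None and sel.stop >= size and sel.step in (None, 1)
        for sel, size in zip(chunk_selection, chunk_shape)
    )

print(covers_whole_chunk((slice(0, 10), slice(0, 10)), (10, 10)))  # True
print(covers_whole_chunk((slice(0, 5), slice(0, 10)), (10, 10)))   # False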

src/zarr/core/codec_pipeline.py

Lines changed: 51 additions & 43 deletions
@@ -16,7 +16,7 @@
 )
 from zarr.core.common import ChunkCoords, concurrent_map
 from zarr.core.config import config
-from zarr.core.indexing import SelectorTuple, is_scalar, is_total_slice
+from zarr.core.indexing import SelectorTuple, is_scalar
 from zarr.core.metadata.v2 import _default_fill_value
 from zarr.registry import register_pipeline
 
@@ -56,6 +56,19 @@ def resolve_batched(codec: Codec, chunk_specs: Iterable[ArraySpec]) -> Iterable[
     return [codec.resolve_metadata(chunk_spec) for chunk_spec in chunk_specs]
 
 
+def fill_value_or_default(chunk_spec: ArraySpec) -> Any:
+    fill_value = chunk_spec.fill_value
+    if fill_value is None:
+        # Zarr V2 allowed `fill_value` to be null in the metadata.
+        # Zarr V3 requires it to be set. This has already been
+        # validated when decoding the metadata, but we support reading
+        # Zarr V2 data and need to support the case where fill_value
+        # is None.
+        return _default_fill_value(dtype=chunk_spec.dtype)
+    else:
+        return fill_value
+
+
 @dataclass(frozen=True)
 class BatchedCodecPipeline(CodecPipeline):
     """Default codec pipeline.
@@ -230,52 +243,39 @@ async def encode_partial_batch(
 
     async def read_batch(
         self,
-        batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple]],
+        batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool]],
         out: NDBuffer,
         drop_axes: tuple[int, ...] = (),
     ) -> None:
         if self.supports_partial_decode:
             chunk_array_batch = await self.decode_partial_batch(
                 [
                     (byte_getter, chunk_selection, chunk_spec)
-                    for byte_getter, chunk_spec, chunk_selection, _ in batch_info
+                    for byte_getter, chunk_spec, chunk_selection, *_ in batch_info
                 ]
             )
-            for chunk_array, (_, chunk_spec, _, out_selection) in zip(
+            for chunk_array, (_, chunk_spec, _, out_selection, _) in zip(
                 chunk_array_batch, batch_info, strict=False
             ):
                 if chunk_array is not None:
                     out[out_selection] = chunk_array
                 else:
-                    fill_value = chunk_spec.fill_value
-
-                    if fill_value is None:
-                        # Zarr V2 allowed `fill_value` to be null in the metadata.
-                        # Zarr V3 requires it to be set. This has already been
-                        # validated when decoding the metadata, but we support reading
-                        # Zarr V2 data and need to support the case where fill_value
-                        # is None.
-                        fill_value = _default_fill_value(dtype=chunk_spec.dtype)
-
-                    out[out_selection] = fill_value
+                    out[out_selection] = fill_value_or_default(chunk_spec)
         else:
             chunk_bytes_batch = await concurrent_map(
-                [
-                    (byte_getter, array_spec.prototype)
-                    for byte_getter, array_spec, _, _ in batch_info
-                ],
+                [(byte_getter, array_spec.prototype) for byte_getter, array_spec, *_ in batch_info],
                 lambda byte_getter, prototype: byte_getter.get(prototype),
                 config.get("async.concurrency"),
             )
             chunk_array_batch = await self.decode_batch(
                 [
                     (chunk_bytes, chunk_spec)
-                    for chunk_bytes, (_, chunk_spec, _, _) in zip(
+                    for chunk_bytes, (_, chunk_spec, *_) in zip(
                         chunk_bytes_batch, batch_info, strict=False
                     )
                 ],
             )
-            for chunk_array, (_, chunk_spec, chunk_selection, out_selection) in zip(
+            for chunk_array, (_, chunk_spec, chunk_selection, out_selection, _) in zip(
                 chunk_array_batch, batch_info, strict=False
             ):
                 if chunk_array is not None:
@@ -284,10 +284,7 @@ async def read_batch(
                     tmp = tmp.squeeze(axis=drop_axes)
                     out[out_selection] = tmp
                 else:
-                    fill_value = chunk_spec.fill_value
-                    if fill_value is None:
-                        fill_value = _default_fill_value(dtype=chunk_spec.dtype)
-                    out[out_selection] = fill_value
+                    out[out_selection] = fill_value_or_default(chunk_spec)
 
     def _merge_chunk_array(
         self,
@@ -296,16 +293,17 @@ def _merge_chunk_array(
         out_selection: SelectorTuple,
         chunk_spec: ArraySpec,
         chunk_selection: SelectorTuple,
+        is_complete_chunk: bool,
         drop_axes: tuple[int, ...],
     ) -> NDBuffer:
-        if is_total_slice(chunk_selection, chunk_spec.shape) and value.shape == chunk_spec.shape:
+        if is_complete_chunk and value.shape == chunk_spec.shape:
             return value
         if existing_chunk_array is None:
             chunk_array = chunk_spec.prototype.nd_buffer.create(
                 shape=chunk_spec.shape,
                 dtype=chunk_spec.dtype,
                 order=chunk_spec.order,
-                fill_value=chunk_spec.fill_value,
+                fill_value=fill_value_or_default(chunk_spec),
             )
         else:
             chunk_array = existing_chunk_array.copy()  # make a writable copy
@@ -327,7 +325,7 @@
 
     async def write_batch(
         self,
-        batch_info: Iterable[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple]],
+        batch_info: Iterable[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple, bool]],
         value: NDBuffer,
         drop_axes: tuple[int, ...] = (),
     ) -> None:
@@ -337,14 +335,14 @@
                 await self.encode_partial_batch(
                     [
                         (byte_setter, value, chunk_selection, chunk_spec)
-                        for byte_setter, chunk_spec, chunk_selection, out_selection in batch_info
+                        for byte_setter, chunk_spec, chunk_selection, out_selection, _ in batch_info
                     ],
                 )
             else:
                 await self.encode_partial_batch(
                     [
                         (byte_setter, value[out_selection], chunk_selection, chunk_spec)
-                        for byte_setter, chunk_spec, chunk_selection, out_selection in batch_info
+                        for byte_setter, chunk_spec, chunk_selection, out_selection, _ in batch_info
                     ],
                 )
 
@@ -361,40 +359,50 @@ async def _read_key(
             chunk_bytes_batch = await concurrent_map(
                 [
                     (
-                        None if is_total_slice(chunk_selection, chunk_spec.shape) else byte_setter,
+                        None if is_complete_chunk else byte_setter,
                         chunk_spec.prototype,
                     )
-                    for byte_setter, chunk_spec, chunk_selection, _ in batch_info
+                    for byte_setter, chunk_spec, chunk_selection, _, is_complete_chunk in batch_info
                 ],
                 _read_key,
                 config.get("async.concurrency"),
             )
             chunk_array_decoded = await self.decode_batch(
                 [
                     (chunk_bytes, chunk_spec)
-                    for chunk_bytes, (_, chunk_spec, _, _) in zip(
+                    for chunk_bytes, (_, chunk_spec, *_) in zip(
                         chunk_bytes_batch, batch_info, strict=False
                    )
                 ],
             )
 
             chunk_array_merged = [
                 self._merge_chunk_array(
-                    chunk_array, value, out_selection, chunk_spec, chunk_selection, drop_axes
-                )
-                for chunk_array, (_, chunk_spec, chunk_selection, out_selection) in zip(
-                    chunk_array_decoded, batch_info, strict=False
+                    chunk_array,
+                    value,
+                    out_selection,
+                    chunk_spec,
+                    chunk_selection,
+                    is_complete_chunk,
+                    drop_axes,
                 )
+                for chunk_array, (
+                    _,
+                    chunk_spec,
+                    chunk_selection,
+                    out_selection,
+                    is_complete_chunk,
+                ) in zip(chunk_array_decoded, batch_info, strict=False)
             ]
             chunk_array_batch: list[NDBuffer | None] = []
-            for chunk_array, (_, chunk_spec, _, _) in zip(
+            for chunk_array, (_, chunk_spec, *_) in zip(
                 chunk_array_merged, batch_info, strict=False
            ):
                 if chunk_array is None:
                     chunk_array_batch.append(None)  # type: ignore[unreachable]
                 else:
                     if not chunk_spec.config.write_empty_chunks and chunk_array.all_equal(
-                        chunk_spec.fill_value
+                        fill_value_or_default(chunk_spec)
                     ):
                         chunk_array_batch.append(None)
                     else:
@@ -403,7 +411,7 @@ async def _read_key(
             chunk_bytes_batch = await self.encode_batch(
                 [
                     (chunk_array, chunk_spec)
-                    for chunk_array, (_, chunk_spec, _, _) in zip(
+                    for chunk_array, (_, chunk_spec, *_) in zip(
                         chunk_array_batch, batch_info, strict=False
                     )
                 ],
@@ -418,7 +426,7 @@ async def _write_key(byte_setter: ByteSetter, chunk_bytes: Buffer | None) -> Non
         await concurrent_map(
             [
                 (byte_setter, chunk_bytes)
-                for chunk_bytes, (byte_setter, _, _, _) in zip(
+                for chunk_bytes, (byte_setter, *_) in zip(
                     chunk_bytes_batch, batch_info, strict=False
                 )
             ],
@@ -446,7 +454,7 @@ async def encode(
 
     async def read(
         self,
-        batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple]],
+        batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool]],
         out: NDBuffer,
         drop_axes: tuple[int, ...] = (),
     ) -> None:
@@ -461,7 +469,7 @@ async def read(
 
     async def write(
        self,
-        batch_info: Iterable[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple]],
+        batch_info: Iterable[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple, bool]],
         value: NDBuffer,
         drop_axes: tuple[int, ...] = (),
     ) -> None:
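One of the paths above, `chunk_array.all_equal(fill_value_or_default(chunk_spec))`, decides whether a chunk is worth storing at all. A minimal sketch of that idea, assuming a NumPy chunk and a made-up `maybe_encode` helper rather than zarr's codec machinery:

import numpy as np

# Hypothetical illustration of the write_empty_chunks check: a chunk whose
# values all equal the fill value is dropped (None) instead of being encoded.
# (Real code also has to handle NaN-like fill values such as NaT.)
def maybe_encode(chunk: np.ndarray, fill_value, write_empty_chunks: bool):
    if not write_empty_chunks and np.all(chunk == fill_value):
        return None                  # skip storing / delete the chunk key
    return chunk.tobytes()           # stand-in for the real encode pipeline

print(maybe_encode(np.zeros((4, 4)), 0.0, write_empty_chunks=False))              # None
print(maybe_encode(np.ones((4, 4)), 0.0, write_empty_chunks=False) is not None)   # True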
