
Commit feeb08f

dcherian and d-v-b authored
Always skip reads when completely overwriting chunks (#2784)
* Skip reads when completely overwriting boundary chunks

  Uses `slice(..., None)` to indicate that a `chunk_selection` ends at the boundary of the current chunk. Also does so for a last chunk that is shorter than the chunk size. `is_total_slice` now understands this convention, and correctly detects boundary chunks as total slices.

  Closes #757

* normalize in codec_pipeline

* Revert "normalize in codec_pipeline"

  This reverts commit 234431cd6efb661c53e2a832a0e4ea4dca772c1b.

* Partially Revert "Skip reads when completely overwriting boundary chunks"

  This reverts commit edbba37.

* Different approach
* fix bug
* add oindex property test
* more complex oindex test
* cleanup
* more oindex
* Add changelog entry
* [revert] note
* fix for numpy 1.25

---------

Co-authored-by: Davis Bennett <[email protected]>
1 parent c66f32b · commit feeb08f
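
The squashed commit message above mentions oindex property tests; as a loose illustration of the kind of orthogonal-indexing write those tests exercise (the shapes, values, and in-memory array here are arbitrary choices, and this is not the actual property test):

import zarr

# Hypothetical example of an orthogonal-indexed write spanning several chunks.
z = zarr.zeros(shape=(6, 6), chunks=(3, 3), dtype="float64")

# oindex selects the outer product of the two index lists, touching all four
# chunks, each of them only partially, so those chunks are still read and
# merged rather than blindly overwritten.
z.oindex[[0, 5], [0, 5]] = 1.0
assert z[0, 0] == 1.0 and z[5, 5] == 1.0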

File tree: 11 files changed, +138 / -83 lines changed

changes/2784.feature.rst

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+Avoid reading chunks during writes where possible. :issue:`757`
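
As a rough illustration of what this changelog entry means in practice, the sketch below writes to chunk-aligned and non-chunk-aligned regions; the shapes, chunk size, and in-memory array are arbitrary choices for the example, not part of this commit.

import numpy as np
import zarr

# Hypothetical example: a small in-memory array with 10-element chunks.
z = zarr.zeros(shape=(100,), chunks=(10,), dtype="int32")

# This write covers chunks 0-4 completely, so after this change the codec
# pipeline can encode the new values directly, without first reading the
# existing chunk bytes from the store.
z[0:50] = np.arange(50, dtype="int32")

# This write only partially covers a chunk, so the existing chunk is still
# read (or initialised from the fill value) and merged before encoding.
z[55:58] = 1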

src/zarr/abc/codec.py

Lines changed: 2 additions & 2 deletions
@@ -357,7 +357,7 @@ async def encode(
     @abstractmethod
     async def read(
         self,
-        batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple]],
+        batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool]],
         out: NDBuffer,
         drop_axes: tuple[int, ...] = (),
     ) -> None:
@@ -379,7 +379,7 @@ async def read(
     @abstractmethod
     async def write(
         self,
-        batch_info: Iterable[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple]],
+        batch_info: Iterable[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple, bool]],
         value: NDBuffer,
         drop_axes: tuple[int, ...] = (),
     ) -> None:
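
The only change to these abstract signatures is the extra `bool` at the end of each `batch_info` tuple: the `is_complete_chunk` flag now produced by the indexer. The alias below is purely illustrative (it does not exist in the codebase) and just spells out what each position in the tuple holds.

from typing import TypeAlias

# Hypothetical alias for illustration only; the real signatures use plain tuples.
ReadBatchInfo: TypeAlias = tuple[
    "ByteGetter",     # source of the stored chunk bytes
    "ArraySpec",      # shape, dtype, and fill value of the chunk
    "SelectorTuple",  # selection within the chunk
    "SelectorTuple",  # selection within the output buffer
    bool,             # is_complete_chunk: True if the selection covers the whole chunk
]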

src/zarr/codecs/sharding.py

Lines changed: 9 additions & 5 deletions
@@ -455,8 +455,9 @@ async def _decode_single(
                     chunk_spec,
                     chunk_selection,
                     out_selection,
+                    is_complete_shard,
                 )
-                for chunk_coords, chunk_selection, out_selection in indexer
+                for chunk_coords, chunk_selection, out_selection, is_complete_shard in indexer
             ],
             out,
         )
@@ -486,7 +487,7 @@ async def _decode_partial_single(
         )

         indexed_chunks = list(indexer)
-        all_chunk_coords = {chunk_coords for chunk_coords, _, _ in indexed_chunks}
+        all_chunk_coords = {chunk_coords for chunk_coords, *_ in indexed_chunks}

         # reading bytes of all requested chunks
         shard_dict: ShardMapping = {}
@@ -524,8 +525,9 @@ async def _decode_partial_single(
                     chunk_spec,
                     chunk_selection,
                     out_selection,
+                    is_complete_shard,
                 )
-                for chunk_coords, chunk_selection, out_selection in indexer
+                for chunk_coords, chunk_selection, out_selection, is_complete_shard in indexer
             ],
             out,
         )
@@ -558,8 +560,9 @@ async def _encode_single(
                     chunk_spec,
                     chunk_selection,
                     out_selection,
+                    is_complete_shard,
                 )
-                for chunk_coords, chunk_selection, out_selection in indexer
+                for chunk_coords, chunk_selection, out_selection, is_complete_shard in indexer
             ],
             shard_array,
         )
@@ -601,8 +604,9 @@ async def _encode_partial_single(
                     chunk_spec,
                     chunk_selection,
                     out_selection,
+                    is_complete_shard,
                 )
-                for chunk_coords, chunk_selection, out_selection in indexer
+                for chunk_coords, chunk_selection, out_selection, is_complete_shard in indexer
             ],
             shard_array,
         )
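
A side effect of the wider tuples is the switch from fixed-arity unpacking (`_, _, _`) to star unpacking (`*_`) in several of the comprehensions above and below. A standalone illustration of the idiom, with placeholder strings standing in for the real objects:

# Star unpacking binds the leading name positionally and collects whatever
# remains into `_`, so the same comprehension works whether the tuples carry
# three, four, or five elements.
indexed_chunks = [
    ("coords-a", "chunk-sel-a", "out-sel-a", True),
    ("coords-b", "chunk-sel-b", "out-sel-b", False),
]
all_chunk_coords = {chunk_coords for chunk_coords, *_ in indexed_chunks}
assert all_chunk_coords == {"coords-a", "coords-b"}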

src/zarr/core/array.py

Lines changed: 4 additions & 2 deletions
@@ -1290,8 +1290,9 @@ async def _get_selection(
                     self.metadata.get_chunk_spec(chunk_coords, _config, prototype=prototype),
                     chunk_selection,
                     out_selection,
+                    is_complete_chunk,
                 )
-                for chunk_coords, chunk_selection, out_selection in indexer
+                for chunk_coords, chunk_selection, out_selection, is_complete_chunk in indexer
             ],
             out_buffer,
             drop_axes=indexer.drop_axes,
@@ -1417,8 +1418,9 @@ async def _set_selection(
                     self.metadata.get_chunk_spec(chunk_coords, _config, prototype),
                     chunk_selection,
                     out_selection,
+                    is_complete_chunk,
                 )
-                for chunk_coords, chunk_selection, out_selection in indexer
+                for chunk_coords, chunk_selection, out_selection, is_complete_chunk in indexer
             ],
             value_buffer,
             drop_axes=indexer.drop_axes,

src/zarr/core/codec_pipeline.py

Lines changed: 34 additions & 26 deletions
@@ -16,7 +16,7 @@
 )
 from zarr.core.common import ChunkCoords, concurrent_map
 from zarr.core.config import config
-from zarr.core.indexing import SelectorTuple, is_scalar, is_total_slice
+from zarr.core.indexing import SelectorTuple, is_scalar
 from zarr.core.metadata.v2 import _default_fill_value
 from zarr.registry import register_pipeline

@@ -243,18 +243,18 @@ async def encode_partial_batch(

     async def read_batch(
         self,
-        batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple]],
+        batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool]],
         out: NDBuffer,
         drop_axes: tuple[int, ...] = (),
     ) -> None:
         if self.supports_partial_decode:
             chunk_array_batch = await self.decode_partial_batch(
                 [
                     (byte_getter, chunk_selection, chunk_spec)
-                    for byte_getter, chunk_spec, chunk_selection, _ in batch_info
+                    for byte_getter, chunk_spec, chunk_selection, *_ in batch_info
                 ]
             )
-            for chunk_array, (_, chunk_spec, _, out_selection) in zip(
+            for chunk_array, (_, chunk_spec, _, out_selection, _) in zip(
                 chunk_array_batch, batch_info, strict=False
             ):
                 if chunk_array is not None:
@@ -263,22 +263,19 @@ async def read_batch(
                     out[out_selection] = fill_value_or_default(chunk_spec)
         else:
             chunk_bytes_batch = await concurrent_map(
-                [
-                    (byte_getter, array_spec.prototype)
-                    for byte_getter, array_spec, _, _ in batch_info
-                ],
+                [(byte_getter, array_spec.prototype) for byte_getter, array_spec, *_ in batch_info],
                 lambda byte_getter, prototype: byte_getter.get(prototype),
                 config.get("async.concurrency"),
             )
             chunk_array_batch = await self.decode_batch(
                 [
                     (chunk_bytes, chunk_spec)
-                    for chunk_bytes, (_, chunk_spec, _, _) in zip(
+                    for chunk_bytes, (_, chunk_spec, *_) in zip(
                         chunk_bytes_batch, batch_info, strict=False
                     )
                 ],
             )
-            for chunk_array, (_, chunk_spec, chunk_selection, out_selection) in zip(
+            for chunk_array, (_, chunk_spec, chunk_selection, out_selection, _) in zip(
                 chunk_array_batch, batch_info, strict=False
             ):
                 if chunk_array is not None:
@@ -296,9 +293,10 @@ def _merge_chunk_array(
         out_selection: SelectorTuple,
         chunk_spec: ArraySpec,
         chunk_selection: SelectorTuple,
+        is_complete_chunk: bool,
         drop_axes: tuple[int, ...],
     ) -> NDBuffer:
-        if is_total_slice(chunk_selection, chunk_spec.shape) and value.shape == chunk_spec.shape:
+        if is_complete_chunk and value.shape == chunk_spec.shape:
             return value
         if existing_chunk_array is None:
             chunk_array = chunk_spec.prototype.nd_buffer.create(
@@ -327,7 +325,7 @@

     async def write_batch(
         self,
-        batch_info: Iterable[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple]],
+        batch_info: Iterable[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple, bool]],
         value: NDBuffer,
         drop_axes: tuple[int, ...] = (),
     ) -> None:
@@ -337,14 +335,14 @@ async def write_batch(
                 await self.encode_partial_batch(
                     [
                         (byte_setter, value, chunk_selection, chunk_spec)
-                        for byte_setter, chunk_spec, chunk_selection, out_selection in batch_info
+                        for byte_setter, chunk_spec, chunk_selection, out_selection, _ in batch_info
                     ],
                 )
             else:
                 await self.encode_partial_batch(
                     [
                         (byte_setter, value[out_selection], chunk_selection, chunk_spec)
-                        for byte_setter, chunk_spec, chunk_selection, out_selection in batch_info
+                        for byte_setter, chunk_spec, chunk_selection, out_selection, _ in batch_info
                     ],
                 )

@@ -361,33 +359,43 @@ async def _read_key(
             chunk_bytes_batch = await concurrent_map(
                 [
                     (
-                        None if is_total_slice(chunk_selection, chunk_spec.shape) else byte_setter,
+                        None if is_complete_chunk else byte_setter,
                         chunk_spec.prototype,
                     )
-                    for byte_setter, chunk_spec, chunk_selection, _ in batch_info
+                    for byte_setter, chunk_spec, chunk_selection, _, is_complete_chunk in batch_info
                 ],
                 _read_key,
                 config.get("async.concurrency"),
             )
             chunk_array_decoded = await self.decode_batch(
                 [
                     (chunk_bytes, chunk_spec)
-                    for chunk_bytes, (_, chunk_spec, _, _) in zip(
+                    for chunk_bytes, (_, chunk_spec, *_) in zip(
                         chunk_bytes_batch, batch_info, strict=False
                     )
                 ],
             )

             chunk_array_merged = [
                 self._merge_chunk_array(
-                    chunk_array, value, out_selection, chunk_spec, chunk_selection, drop_axes
-                )
-                for chunk_array, (_, chunk_spec, chunk_selection, out_selection) in zip(
-                    chunk_array_decoded, batch_info, strict=False
+                    chunk_array,
+                    value,
+                    out_selection,
+                    chunk_spec,
+                    chunk_selection,
+                    is_complete_chunk,
+                    drop_axes,
                 )
+                for chunk_array, (
+                    _,
+                    chunk_spec,
+                    chunk_selection,
+                    out_selection,
+                    is_complete_chunk,
+                ) in zip(chunk_array_decoded, batch_info, strict=False)
             ]
             chunk_array_batch: list[NDBuffer | None] = []
-            for chunk_array, (_, chunk_spec, _, _) in zip(
+            for chunk_array, (_, chunk_spec, *_) in zip(
                 chunk_array_merged, batch_info, strict=False
             ):
                 if chunk_array is None:
@@ -403,7 +411,7 @@ async def _read_key(
             chunk_bytes_batch = await self.encode_batch(
                 [
                     (chunk_array, chunk_spec)
-                    for chunk_array, (_, chunk_spec, _, _) in zip(
+                    for chunk_array, (_, chunk_spec, *_) in zip(
                         chunk_array_batch, batch_info, strict=False
                     )
                 ],
@@ -418,7 +426,7 @@ async def _write_key(byte_setter: ByteSetter, chunk_bytes: Buffer | None) -> None:
             await concurrent_map(
                 [
                     (byte_setter, chunk_bytes)
-                    for chunk_bytes, (byte_setter, _, _, _) in zip(
+                    for chunk_bytes, (byte_setter, *_) in zip(
                         chunk_bytes_batch, batch_info, strict=False
                     )
                 ],
@@ -446,7 +454,7 @@ async def encode(

     async def read(
         self,
-        batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple]],
+        batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool]],
         out: NDBuffer,
         drop_axes: tuple[int, ...] = (),
     ) -> None:
@@ -461,7 +469,7 @@ async def read(

     async def write(
         self,
-        batch_info: Iterable[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple]],
+        batch_info: Iterable[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple, bool]],
         value: NDBuffer,
         drop_axes: tuple[int, ...] = (),
     ) -> None:
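
Taken together, the codec_pipeline.py hunks are where the read is actually skipped: `write_batch` passes `None` instead of the `ByteSetter` when `is_complete_chunk` is true, and `_merge_chunk_array` returns the incoming values untouched when they replace the whole chunk. A condensed, per-chunk restatement of that control flow is sketched below; it is not the real implementation, and `decode_or_fill` and `encode_chunk` are hypothetical stand-ins for the codec pipeline internals.

# Sketch only: a single-chunk view of the batched write path after this change.
async def write_one_chunk(byte_setter, chunk_spec, chunk_selection,
                          out_selection, is_complete_chunk, value):
    if is_complete_chunk and value[out_selection].shape == chunk_spec.shape:
        # Complete overwrite: skip the read entirely and encode the new
        # values as the whole chunk.
        chunk_array = value[out_selection]
    else:
        # Partial overwrite: fetch and decode the existing chunk (falling
        # back to the fill value if it does not exist yet), then merge.
        existing_bytes = await byte_setter.get(chunk_spec.prototype)
        chunk_array = decode_or_fill(existing_bytes, chunk_spec)
        chunk_array[chunk_selection] = value[out_selection]
    await byte_setter.set(encode_chunk(chunk_array, chunk_spec))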
