Skip to content

Commit 862481a

Browse files
committed
Merge branch 'refs/heads/main' into return-scalar-for-zero-dim-indexing
2 parents 98e13a8 + c66f32b commit 862481a

File tree

12 files changed

+171
-30
lines changed

12 files changed

+171
-30
lines changed

changes/2755.bugfix.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
The array returned by ``zarr.empty`` and an empty ``zarr.core.buffer.cpu.NDBuffer`` will now be filled with the
2+
specified fill value, or with zeros if no fill value is provided.
3+
This fixes a bug where Zarr format 2 data with no fill value was written with un-predictable chunk sizes.

changes/2799.bugfix.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Enitialise empty chunks to the default fill value during writing and add default fill values for datetime, timedelta, structured, and other (void* fixed size) data types

src/zarr/api/asynchronous.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1065,7 +1065,8 @@ async def create(
10651065
async def empty(
10661066
shape: ChunkCoords, **kwargs: Any
10671067
) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]:
1068-
"""Create an empty array.
1068+
"""Create an empty array with the specified shape. The contents will be filled with the
1069+
array's fill value or zeros if no fill value is provided.
10691070
10701071
Parameters
10711072
----------
@@ -1087,7 +1088,8 @@ async def empty(
10871088
async def empty_like(
10881089
a: ArrayLike, **kwargs: Any
10891090
) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]:
1090-
"""Create an empty array like `a`.
1091+
"""Create an empty array like `a`. The contents will be filled with the
1092+
array's fill value or zeros if no fill value is provided.
10911093
10921094
Parameters
10931095
----------
@@ -1100,6 +1102,12 @@ async def empty_like(
11001102
-------
11011103
Array
11021104
The new array.
1105+
1106+
Notes
1107+
-----
1108+
The contents of an empty Zarr array are not defined. On attempting to
1109+
retrieve data from an empty Zarr array, any values may be returned,
1110+
and these are not guaranteed to be stable from one access to the next.
11031111
"""
11041112
like_kwargs = _like_args(a, kwargs)
11051113
return await empty(**like_kwargs)

src/zarr/api/synchronous.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -902,7 +902,8 @@ def create_array(
902902

903903
# TODO: add type annotations for kwargs
904904
def empty(shape: ChunkCoords, **kwargs: Any) -> Array:
905-
"""Create an empty array.
905+
"""Create an empty array with the specified shape. The contents will be filled with the
906+
array's fill value or zeros if no fill value is provided.
906907
907908
Parameters
908909
----------
@@ -928,7 +929,8 @@ def empty(shape: ChunkCoords, **kwargs: Any) -> Array:
928929
# TODO: move ArrayLike to common module
929930
# TODO: add type annotations for kwargs
930931
def empty_like(a: ArrayLike, **kwargs: Any) -> Array:
931-
"""Create an empty array like another array.
932+
"""Create an empty array like another array. The contents will be filled with the
933+
array's fill value or zeros if no fill value is provided.
932934
933935
Parameters
934936
----------
@@ -941,6 +943,12 @@ def empty_like(a: ArrayLike, **kwargs: Any) -> Array:
941943
-------
942944
Array
943945
The new array.
946+
947+
Notes
948+
-----
949+
The contents of an empty Zarr array are not defined. On attempting to
950+
retrieve data from an empty Zarr array, any values may be returned,
951+
and these are not guaranteed to be stable from one access to the next.
944952
"""
945953
return Array(sync(async_api.empty_like(a, **kwargs)))
946954

src/zarr/core/buffer/cpu.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -154,10 +154,10 @@ def create(
154154
order: Literal["C", "F"] = "C",
155155
fill_value: Any | None = None,
156156
) -> Self:
157-
ret = cls(np.empty(shape=tuple(shape), dtype=dtype, order=order))
158-
if fill_value is not None:
159-
ret.fill(fill_value)
160-
return ret
157+
if fill_value is None:
158+
return cls(np.zeros(shape=tuple(shape), dtype=dtype, order=order))
159+
else:
160+
return cls(np.full(shape=tuple(shape), fill_value=fill_value, dtype=dtype, order=order))
161161

162162
@classmethod
163163
def from_numpy_array(cls, array_like: npt.ArrayLike) -> Self:

src/zarr/core/codec_pipeline.py

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,19 @@ def resolve_batched(codec: Codec, chunk_specs: Iterable[ArraySpec]) -> Iterable[
5656
return [codec.resolve_metadata(chunk_spec) for chunk_spec in chunk_specs]
5757

5858

59+
def fill_value_or_default(chunk_spec: ArraySpec) -> Any:
60+
fill_value = chunk_spec.fill_value
61+
if fill_value is None:
62+
# Zarr V2 allowed `fill_value` to be null in the metadata.
63+
# Zarr V3 requires it to be set. This has already been
64+
# validated when decoding the metadata, but we support reading
65+
# Zarr V2 data and need to support the case where fill_value
66+
# is None.
67+
return _default_fill_value(dtype=chunk_spec.dtype)
68+
else:
69+
return fill_value
70+
71+
5972
@dataclass(frozen=True)
6073
class BatchedCodecPipeline(CodecPipeline):
6174
"""Default codec pipeline.
@@ -247,17 +260,7 @@ async def read_batch(
247260
if chunk_array is not None:
248261
out[out_selection] = chunk_array
249262
else:
250-
fill_value = chunk_spec.fill_value
251-
252-
if fill_value is None:
253-
# Zarr V2 allowed `fill_value` to be null in the metadata.
254-
# Zarr V3 requires it to be set. This has already been
255-
# validated when decoding the metadata, but we support reading
256-
# Zarr V2 data and need to support the case where fill_value
257-
# is None.
258-
fill_value = _default_fill_value(dtype=chunk_spec.dtype)
259-
260-
out[out_selection] = fill_value
263+
out[out_selection] = fill_value_or_default(chunk_spec)
261264
else:
262265
chunk_bytes_batch = await concurrent_map(
263266
[
@@ -284,10 +287,7 @@ async def read_batch(
284287
tmp = tmp.squeeze(axis=drop_axes)
285288
out[out_selection] = tmp
286289
else:
287-
fill_value = chunk_spec.fill_value
288-
if fill_value is None:
289-
fill_value = _default_fill_value(dtype=chunk_spec.dtype)
290-
out[out_selection] = fill_value
290+
out[out_selection] = fill_value_or_default(chunk_spec)
291291

292292
def _merge_chunk_array(
293293
self,
@@ -305,7 +305,7 @@ def _merge_chunk_array(
305305
shape=chunk_spec.shape,
306306
dtype=chunk_spec.dtype,
307307
order=chunk_spec.order,
308-
fill_value=chunk_spec.fill_value,
308+
fill_value=fill_value_or_default(chunk_spec),
309309
)
310310
else:
311311
chunk_array = existing_chunk_array.copy() # make a writable copy
@@ -394,7 +394,7 @@ async def _read_key(
394394
chunk_array_batch.append(None) # type: ignore[unreachable]
395395
else:
396396
if not chunk_spec.config.write_empty_chunks and chunk_array.all_equal(
397-
chunk_spec.fill_value
397+
fill_value_or_default(chunk_spec)
398398
):
399399
chunk_array_batch.append(None)
400400
else:

src/zarr/core/group.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1498,7 +1498,8 @@ async def tree(self, expand: bool | None = None, level: int | None = None) -> An
14981498
async def empty(
14991499
self, *, name: str, shape: ChunkCoords, **kwargs: Any
15001500
) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]:
1501-
"""Create an empty array in this Group.
1501+
"""Create an empty array with the specified shape in this Group. The contents will
1502+
be filled with the array's fill value or zeros if no fill value is provided.
15021503
15031504
Parameters
15041505
----------
@@ -1515,7 +1516,6 @@ async def empty(
15151516
retrieve data from an empty Zarr array, any values may be returned,
15161517
and these are not guaranteed to be stable from one access to the next.
15171518
"""
1518-
15191519
return await async_api.empty(shape=shape, store=self.store_path, path=name, **kwargs)
15201520

15211521
async def zeros(
@@ -1592,7 +1592,8 @@ async def full(
15921592
async def empty_like(
15931593
self, *, name: str, data: async_api.ArrayLike, **kwargs: Any
15941594
) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]:
1595-
"""Create an empty sub-array like `data`.
1595+
"""Create an empty sub-array like `data`. The contents will be filled with
1596+
the array's fill value or zeros if no fill value is provided.
15961597
15971598
Parameters
15981599
----------
@@ -2442,7 +2443,8 @@ def require_array(self, name: str, *, shape: ShapeLike, **kwargs: Any) -> Array:
24422443

24432444
@_deprecate_positional_args
24442445
def empty(self, *, name: str, shape: ChunkCoords, **kwargs: Any) -> Array:
2445-
"""Create an empty array in this Group.
2446+
"""Create an empty array with the specified shape in this Group. The contents will be filled with
2447+
the array's fill value or zeros if no fill value is provided.
24462448
24472449
Parameters
24482450
----------
@@ -2531,7 +2533,8 @@ def full(
25312533

25322534
@_deprecate_positional_args
25332535
def empty_like(self, *, name: str, data: async_api.ArrayLike, **kwargs: Any) -> Array:
2534-
"""Create an empty sub-array like `data`.
2536+
"""Create an empty sub-array like `data`. The contents will be filled
2537+
with the array's fill value or zeros if no fill value is provided.
25352538
25362539
Parameters
25372540
----------
@@ -2546,6 +2549,12 @@ def empty_like(self, *, name: str, data: async_api.ArrayLike, **kwargs: Any) ->
25462549
-------
25472550
Array
25482551
The new array.
2552+
2553+
Notes
2554+
-----
2555+
The contents of an empty Zarr array are not defined. On attempting to
2556+
retrieve data from an empty Zarr array, any values may be returned,
2557+
and these are not guaranteed to be stable from one access to the next.
25492558
"""
25502559
return Array(self._sync(self._async_group.empty_like(name=name, data=data, **kwargs)))
25512560

src/zarr/core/metadata/v2.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -349,6 +349,14 @@ def _default_fill_value(dtype: np.dtype[Any]) -> Any:
349349
return b""
350350
elif dtype.kind in "UO":
351351
return ""
352+
elif dtype.kind in "Mm":
353+
return dtype.type("nat")
354+
elif dtype.kind == "V":
355+
if dtype.fields is not None:
356+
default = tuple([_default_fill_value(field[0]) for field in dtype.fields.values()])
357+
return np.array([default], dtype=dtype)
358+
else:
359+
return np.zeros(1, dtype=dtype)
352360
else:
353361
return dtype.type(0)
354362

src/zarr/core/sync.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import asyncio
44
import atexit
55
import logging
6+
import os
67
import threading
78
from concurrent.futures import ThreadPoolExecutor, wait
89
from typing import TYPE_CHECKING, TypeVar
@@ -89,6 +90,26 @@ def cleanup_resources() -> None:
8990
atexit.register(cleanup_resources)
9091

9192

93+
def reset_resources_after_fork() -> None:
94+
"""
95+
Ensure that global resources are reset after a fork. Without this function,
96+
forked processes will retain invalid references to the parent process's resources.
97+
"""
98+
global loop, iothread, _executor
99+
# These lines are excluded from coverage because this function only runs in a child process,
100+
# which is not observed by the test coverage instrumentation. Despite the apparent lack of
101+
# test coverage, this function should be adequately tested by any test that uses Zarr IO with
102+
# multiprocessing.
103+
loop[0] = None # pragma: no cover
104+
iothread[0] = None # pragma: no cover
105+
_executor = None # pragma: no cover
106+
107+
108+
# this is only available on certain operating systems
109+
if hasattr(os, "register_at_fork"):
110+
os.register_at_fork(after_in_child=reset_resources_after_fork)
111+
112+
92113
async def _runner(coro: Coroutine[Any, Any, T]) -> T | BaseException:
93114
"""
94115
Await a coroutine and return the result of running it. If awaiting the coroutine raises an

tests/test_array.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
import dataclasses
22
import json
33
import math
4+
import multiprocessing as mp
45
import pickle
56
import re
7+
import sys
68
from itertools import accumulate
79
from typing import TYPE_CHECKING, Any, Literal
810
from unittest import mock
@@ -1390,3 +1392,39 @@ def test_roundtrip_numcodecs() -> None:
13901392
metadata = root["test"].metadata.to_dict()
13911393
expected = (*filters, BYTES_CODEC, *compressors)
13921394
assert metadata["codecs"] == expected
1395+
1396+
1397+
def _index_array(arr: Array, index: Any) -> Any:
1398+
return arr[index]
1399+
1400+
1401+
@pytest.mark.parametrize(
1402+
"method",
1403+
[
1404+
pytest.param(
1405+
"fork",
1406+
marks=pytest.mark.skipif(
1407+
sys.platform in ("win32", "darwin"), reason="fork not supported on Windows or OSX"
1408+
),
1409+
),
1410+
"spawn",
1411+
pytest.param(
1412+
"forkserver",
1413+
marks=pytest.mark.skipif(
1414+
sys.platform == "win32", reason="forkserver not supported on Windows"
1415+
),
1416+
),
1417+
],
1418+
)
1419+
@pytest.mark.parametrize("store", ["local"], indirect=True)
1420+
def test_multiprocessing(store: Store, method: Literal["fork", "spawn", "forkserver"]) -> None:
1421+
"""
1422+
Test that arrays can be pickled and indexed in child processes
1423+
"""
1424+
data = np.arange(100)
1425+
arr = zarr.create_array(store=store, data=data)
1426+
ctx = mp.get_context(method)
1427+
pool = ctx.Pool()
1428+
1429+
results = pool.starmap(_index_array, [(arr, slice(len(data)))])
1430+
assert all(np.array_equal(r, data) for r in results)

0 commit comments

Comments
 (0)