Skip to content

Commit eb67e03

Browse files
authored
Merge branch 'main' into optimize-boundary-chunk-writes
2 parents d446918 + 2f8b88a commit eb67e03

File tree

8 files changed

+126
-13
lines changed

8 files changed

+126
-13
lines changed

changes/2755.bugfix.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
The array returned by ``zarr.empty`` and an empty ``zarr.core.buffer.cpu.NDBuffer`` will now be filled with the
2+
specified fill value, or with zeros if no fill value is provided.
3+
This fixes a bug where Zarr format 2 data with no fill value was written with un-predictable chunk sizes.

src/zarr/api/asynchronous.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1065,7 +1065,8 @@ async def create(
10651065
async def empty(
10661066
shape: ChunkCoords, **kwargs: Any
10671067
) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]:
1068-
"""Create an empty array.
1068+
"""Create an empty array with the specified shape. The contents will be filled with the
1069+
array's fill value or zeros if no fill value is provided.
10691070
10701071
Parameters
10711072
----------
@@ -1087,7 +1088,8 @@ async def empty(
10871088
async def empty_like(
10881089
a: ArrayLike, **kwargs: Any
10891090
) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]:
1090-
"""Create an empty array like `a`.
1091+
"""Create an empty array like `a`. The contents will be filled with the
1092+
array's fill value or zeros if no fill value is provided.
10911093
10921094
Parameters
10931095
----------
@@ -1100,6 +1102,12 @@ async def empty_like(
11001102
-------
11011103
Array
11021104
The new array.
1105+
1106+
Notes
1107+
-----
1108+
The contents of an empty Zarr array are not defined. On attempting to
1109+
retrieve data from an empty Zarr array, any values may be returned,
1110+
and these are not guaranteed to be stable from one access to the next.
11031111
"""
11041112
like_kwargs = _like_args(a, kwargs)
11051113
return await empty(**like_kwargs)

src/zarr/api/synchronous.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -902,7 +902,8 @@ def create_array(
902902

903903
# TODO: add type annotations for kwargs
904904
def empty(shape: ChunkCoords, **kwargs: Any) -> Array:
905-
"""Create an empty array.
905+
"""Create an empty array with the specified shape. The contents will be filled with the
906+
array's fill value or zeros if no fill value is provided.
906907
907908
Parameters
908909
----------
@@ -928,7 +929,8 @@ def empty(shape: ChunkCoords, **kwargs: Any) -> Array:
928929
# TODO: move ArrayLike to common module
929930
# TODO: add type annotations for kwargs
930931
def empty_like(a: ArrayLike, **kwargs: Any) -> Array:
931-
"""Create an empty array like another array.
932+
"""Create an empty array like another array. The contents will be filled with the
933+
array's fill value or zeros if no fill value is provided.
932934
933935
Parameters
934936
----------
@@ -941,6 +943,12 @@ def empty_like(a: ArrayLike, **kwargs: Any) -> Array:
941943
-------
942944
Array
943945
The new array.
946+
947+
Notes
948+
-----
949+
The contents of an empty Zarr array are not defined. On attempting to
950+
retrieve data from an empty Zarr array, any values may be returned,
951+
and these are not guaranteed to be stable from one access to the next.
944952
"""
945953
return Array(sync(async_api.empty_like(a, **kwargs)))
946954

src/zarr/core/buffer/cpu.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -154,10 +154,10 @@ def create(
154154
order: Literal["C", "F"] = "C",
155155
fill_value: Any | None = None,
156156
) -> Self:
157-
ret = cls(np.empty(shape=tuple(shape), dtype=dtype, order=order))
158-
if fill_value is not None:
159-
ret.fill(fill_value)
160-
return ret
157+
if fill_value is None:
158+
return cls(np.zeros(shape=tuple(shape), dtype=dtype, order=order))
159+
else:
160+
return cls(np.full(shape=tuple(shape), fill_value=fill_value, dtype=dtype, order=order))
161161

162162
@classmethod
163163
def from_numpy_array(cls, array_like: npt.ArrayLike) -> Self:

src/zarr/core/group.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1498,7 +1498,8 @@ async def tree(self, expand: bool | None = None, level: int | None = None) -> An
14981498
async def empty(
14991499
self, *, name: str, shape: ChunkCoords, **kwargs: Any
15001500
) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]:
1501-
"""Create an empty array in this Group.
1501+
"""Create an empty array with the specified shape in this Group. The contents will
1502+
be filled with the array's fill value or zeros if no fill value is provided.
15021503
15031504
Parameters
15041505
----------
@@ -1515,7 +1516,6 @@ async def empty(
15151516
retrieve data from an empty Zarr array, any values may be returned,
15161517
and these are not guaranteed to be stable from one access to the next.
15171518
"""
1518-
15191519
return await async_api.empty(shape=shape, store=self.store_path, path=name, **kwargs)
15201520

15211521
async def zeros(
@@ -1592,7 +1592,8 @@ async def full(
15921592
async def empty_like(
15931593
self, *, name: str, data: async_api.ArrayLike, **kwargs: Any
15941594
) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]:
1595-
"""Create an empty sub-array like `data`.
1595+
"""Create an empty sub-array like `data`. The contents will be filled with
1596+
the array's fill value or zeros if no fill value is provided.
15961597
15971598
Parameters
15981599
----------
@@ -2442,7 +2443,8 @@ def require_array(self, name: str, *, shape: ShapeLike, **kwargs: Any) -> Array:
24422443

24432444
@_deprecate_positional_args
24442445
def empty(self, *, name: str, shape: ChunkCoords, **kwargs: Any) -> Array:
2445-
"""Create an empty array in this Group.
2446+
"""Create an empty array with the specified shape in this Group. The contents will be filled with
2447+
the array's fill value or zeros if no fill value is provided.
24462448
24472449
Parameters
24482450
----------
@@ -2531,7 +2533,8 @@ def full(
25312533

25322534
@_deprecate_positional_args
25332535
def empty_like(self, *, name: str, data: async_api.ArrayLike, **kwargs: Any) -> Array:
2534-
"""Create an empty sub-array like `data`.
2536+
"""Create an empty sub-array like `data`. The contents will be filled
2537+
with the array's fill value or zeros if no fill value is provided.
25352538
25362539
Parameters
25372540
----------
@@ -2546,6 +2549,12 @@ def empty_like(self, *, name: str, data: async_api.ArrayLike, **kwargs: Any) ->
25462549
-------
25472550
Array
25482551
The new array.
2552+
2553+
Notes
2554+
-----
2555+
The contents of an empty Zarr array are not defined. On attempting to
2556+
retrieve data from an empty Zarr array, any values may be returned,
2557+
and these are not guaranteed to be stable from one access to the next.
25492558
"""
25502559
return Array(self._sync(self._async_group.empty_like(name=name, data=data, **kwargs)))
25512560

src/zarr/core/sync.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import asyncio
44
import atexit
55
import logging
6+
import os
67
import threading
78
from concurrent.futures import ThreadPoolExecutor, wait
89
from typing import TYPE_CHECKING, TypeVar
@@ -89,6 +90,26 @@ def cleanup_resources() -> None:
8990
atexit.register(cleanup_resources)
9091

9192

93+
def reset_resources_after_fork() -> None:
94+
"""
95+
Ensure that global resources are reset after a fork. Without this function,
96+
forked processes will retain invalid references to the parent process's resources.
97+
"""
98+
global loop, iothread, _executor
99+
# These lines are excluded from coverage because this function only runs in a child process,
100+
# which is not observed by the test coverage instrumentation. Despite the apparent lack of
101+
# test coverage, this function should be adequately tested by any test that uses Zarr IO with
102+
# multiprocessing.
103+
loop[0] = None # pragma: no cover
104+
iothread[0] = None # pragma: no cover
105+
_executor = None # pragma: no cover
106+
107+
108+
# this is only available on certain operating systems
109+
if hasattr(os, "register_at_fork"):
110+
os.register_at_fork(after_in_child=reset_resources_after_fork)
111+
112+
92113
async def _runner(coro: Coroutine[Any, Any, T]) -> T | BaseException:
93114
"""
94115
Await a coroutine and return the result of running it. If awaiting the coroutine raises an

tests/test_array.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
import dataclasses
22
import json
33
import math
4+
import multiprocessing as mp
45
import pickle
56
import re
7+
import sys
68
from itertools import accumulate
79
from typing import TYPE_CHECKING, Any, Literal
810
from unittest import mock
@@ -1391,3 +1393,39 @@ def test_roundtrip_numcodecs() -> None:
13911393
metadata = root["test"].metadata.to_dict()
13921394
expected = (*filters, BYTES_CODEC, *compressors)
13931395
assert metadata["codecs"] == expected
1396+
1397+
1398+
def _index_array(arr: Array, index: Any) -> Any:
1399+
return arr[index]
1400+
1401+
1402+
@pytest.mark.parametrize(
1403+
"method",
1404+
[
1405+
pytest.param(
1406+
"fork",
1407+
marks=pytest.mark.skipif(
1408+
sys.platform in ("win32", "darwin"), reason="fork not supported on Windows or OSX"
1409+
),
1410+
),
1411+
"spawn",
1412+
pytest.param(
1413+
"forkserver",
1414+
marks=pytest.mark.skipif(
1415+
sys.platform == "win32", reason="forkserver not supported on Windows"
1416+
),
1417+
),
1418+
],
1419+
)
1420+
@pytest.mark.parametrize("store", ["local"], indirect=True)
1421+
def test_multiprocessing(store: Store, method: Literal["fork", "spawn", "forkserver"]) -> None:
1422+
"""
1423+
Test that arrays can be pickled and indexed in child processes
1424+
"""
1425+
data = np.arange(100)
1426+
arr = zarr.create_array(store=store, data=data)
1427+
ctx = mp.get_context(method)
1428+
pool = ctx.Pool()
1429+
1430+
results = pool.starmap(_index_array, [(arr, slice(len(data)))])
1431+
assert all(np.array_equal(r, data) for r in results)

tests/test_store/test_memory.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,19 @@
11
from __future__ import annotations
22

3+
from typing import TYPE_CHECKING
4+
5+
import numpy as np
36
import pytest
47

8+
import zarr
59
from zarr.core.buffer import Buffer, cpu, gpu
610
from zarr.storage import GpuMemoryStore, MemoryStore
711
from zarr.testing.store import StoreTests
812
from zarr.testing.utils import gpu_test
913

14+
if TYPE_CHECKING:
15+
from zarr.core.common import ZarrFormat
16+
1017

1118
class TestMemoryStore(StoreTests[MemoryStore, cpu.Buffer]):
1219
store_cls = MemoryStore
@@ -46,6 +53,25 @@ def test_store_supports_partial_writes(self, store: MemoryStore) -> None:
4653
def test_list_prefix(self, store: MemoryStore) -> None:
4754
assert True
4855

56+
@pytest.mark.parametrize("dtype", ["uint8", "float32", "int64"])
57+
@pytest.mark.parametrize("zarr_format", [2, 3])
58+
async def test_deterministic_size(
59+
self, store: MemoryStore, dtype, zarr_format: ZarrFormat
60+
) -> None:
61+
a = zarr.empty(
62+
store=store,
63+
shape=(3,),
64+
chunks=(1000,),
65+
dtype=dtype,
66+
zarr_format=zarr_format,
67+
overwrite=True,
68+
)
69+
a[...] = 1
70+
a.resize((1000,))
71+
72+
np.testing.assert_array_equal(a[:3], 1)
73+
np.testing.assert_array_equal(a[3:], 0)
74+
4975

5076
@gpu_test
5177
class TestGpuMemoryStore(StoreTests[GpuMemoryStore, gpu.Buffer]):

0 commit comments

Comments
 (0)