From e61da90997a539950ea6a9c48946918e0fa0a399 Mon Sep 17 00:00:00 2001
From: Julia Signell
Date: Tue, 30 Dec 2025 10:54:18 -0500
Subject: [PATCH 01/10] Add an auto mechanism that doesn't split encoded
 chunks

---
 xarray/namedarray/daskmanager.py | 68 ++++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)

diff --git a/xarray/namedarray/daskmanager.py b/xarray/namedarray/daskmanager.py
index eb01a150c18..b5739635ce1 100644
--- a/xarray/namedarray/daskmanager.py
+++ b/xarray/namedarray/daskmanager.py
@@ -43,6 +43,66 @@ def is_chunked_array(self, data: duckarray[Any, Any]) -> bool:
     def chunks(self, data: Any) -> _NormalizedChunks:
         return data.chunks  # type: ignore[no-any-return]
 
+    def meta_chunks(chunks, shape, target, typesize, encoded_chunks):
+        """Determine meta chunks
+
+        This takes in a chunks value that contains ``"auto"`` values in certain
+        dimensions and replaces those values with concrete dimension sizes that try
+        to get chunks to be of a certain size in bytes, provided by the ``limit=``
+        keyword. If multiple dimensions are marked as ``"auto"`` then they will
+        all respond to get close to the byte target, while never splitting
+        ``encoded_chunks``.
+
+        Parameters
+        ----------
+        chunks: tuple
+            A tuple of either dimensions or tuples of explicit chunk dimensions.
+            Some entries should be "auto". Any explicit dimensions must match or
+            be a multiple of ``encoded_chunks``.
+        shape: tuple[int]
+            The shape of the array
+        target: int
+            The target size of the chunk in bytes.
+        typesize: int
+            The size, in bytes, of each element of the chunk.
+        encoded_chunks: tuple[int]
+        """
+        shape = np.array(shape)
+
+        # "auto" stays as "auto"
+        # empty tuple means match encoded chunks
+        # -1 means whole dim is in one chunk
+        desired_chunks = np.array(
+            [
+                c or encoded_chunks[i] if c != -1 else shape[i]
+                for i, c in enumerate(chunks)
+            ]
+        )
+
+        auto_chunks = desired_chunks == "auto"
+        chunks = np.where(auto_chunks, np.array(encoded_chunks), desired_chunks).astype(
+            int
+        )
+
+        while True:
+            # Repeatedly loop over the ``encoded_chunks``, multiplying them by 2.
+            # Stop when:
+            #  1a. we are larger than the target chunk size OR
+            #  1b. we are within 50% of the target chunk size
+
+            idx = np.argmax(shape / chunks * auto_chunks)
+            chunk_bytes = np.prod(chunks) * typesize
+
+            if chunk_bytes > target or abs(chunk_bytes - target) / target < 0.5:
+                break
+
+            if np.prod(chunks) == 1:
+                break  # Element size larger than max_bytes
+
+            chunks[idx] = chunks[idx] * 2
+
+        return tuple(int(x) for x in chunks)
+
     def normalize_chunks(
         self,
         chunks: T_Chunks | _NormalizedChunks,
@@ -54,6 +114,14 @@ def normalize_chunks(
         """Called by open_dataset"""
         from dask.array.core import normalize_chunks
 
+        chunks = self.meta_chunks(
+            chunks,
+            shape=shape,
+            target=128 * 1024 * 1024,
+            typesize=dtype.itemsize,
+            encoded_chunks=previous_chunks,
+        )  # type: ignore[no-untyped-call]
+
        return normalize_chunks(
            chunks,
            shape=shape,
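
To see what the doubling loop converges to, here is a minimal standalone sketch of
the same heuristic (an illustrative re-implementation, not code from the patch);
the values mirror the Zarr test that appears later in the series:

    import numpy as np

    # Start from the encoded (on-disk) chunks and double one dimension at a
    # time -- always the dimension that currently has the most chunks -- so
    # the encoded chunks are never split.
    shape = np.array([500, 500])
    chunks = np.array([10, 10])          # encoded chunks
    auto = np.array([True, True])        # both dims marked "auto"
    typesize, target = 8, 1024 * 1024    # float64 elements, 1 MiB target

    while True:
        idx = np.argmax(shape / chunks * auto)
        chunk_bytes = np.prod(chunks) * typesize
        if chunk_bytes > target or abs(chunk_bytes - target) / target < 0.5:
            break
        chunks[idx] *= 2

    print(tuple(int(c) for c in chunks))  # (320, 320): multiples of 10, ~0.78 MiB
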
From 75c3f51f80781bddc44ab4bc18d8248a0607d062 Mon Sep 17 00:00:00 2001
From: Julia Signell
Date: Tue, 30 Dec 2025 10:58:12 -0500
Subject: [PATCH 02/10] Forgot self

---
 xarray/namedarray/daskmanager.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xarray/namedarray/daskmanager.py b/xarray/namedarray/daskmanager.py
index b5739635ce1..d89e4a063ff 100644
--- a/xarray/namedarray/daskmanager.py
+++ b/xarray/namedarray/daskmanager.py
@@ -43,7 +43,7 @@ def is_chunked_array(self, data: duckarray[Any, Any]) -> bool:
     def chunks(self, data: Any) -> _NormalizedChunks:
         return data.chunks  # type: ignore[no-any-return]
 
-    def meta_chunks(chunks, shape, target, typesize, encoded_chunks):
+    def meta_chunks(self, chunks, shape, target, typesize, encoded_chunks):
         """Determine meta chunks
 
         This takes in a chunks value that contains ``"auto"`` values in certain
From eecb8c5c027fe9575a08cc225ea611315aaf583d Mon Sep 17 00:00:00 2001
From: Julia Signell
Date: Tue, 30 Dec 2025 15:31:59 -0500
Subject: [PATCH 03/10] Change from 'auto' to 'preserve'

---
 xarray/namedarray/daskmanager.py | 37 ++++++++++++++++++--------------
 xarray/namedarray/utils.py       |  2 +-
 2 files changed, 22 insertions(+), 17 deletions(-)

diff --git a/xarray/namedarray/daskmanager.py b/xarray/namedarray/daskmanager.py
index d89e4a063ff..23673067882 100644
--- a/xarray/namedarray/daskmanager.py
+++ b/xarray/namedarray/daskmanager.py
@@ -49,9 +49,8 @@ def meta_chunks(self, chunks, shape, target, typesize, encoded_chunks):
         This takes in a chunks value that contains ``"auto"`` values in certain
         dimensions and replaces those values with concrete dimension sizes that try
         to get chunks to be of a certain size in bytes, provided by the ``limit=``
-        keyword. If multiple dimensions are marked as ``"auto"`` then they will
-        all respond to get close to the byte target, while never splitting
-        ``encoded_chunks``.
+        keyword. Any dimensions marked as ``"auto"`` will potentially be multiplied
+        to get close to the byte target, while never splitting ``encoded_chunks``.
 
         Parameters
         ----------
@@ -79,7 +78,7 @@ def meta_chunks(self, chunks, shape, target, typesize, encoded_chunks):
             ]
         )
 
-        auto_chunks = desired_chunks == "auto"
+        auto_chunks = desired_chunks == "preserve"
         chunks = np.where(auto_chunks, np.array(encoded_chunks), desired_chunks).astype(
             int
         )
@@ -88,18 +87,20 @@ def meta_chunks(self, chunks, shape, target, typesize, encoded_chunks):
             # Repeatedly loop over the ``encoded_chunks``, multiplying them by 2.
             # Stop when:
             #  1a. we are larger than the target chunk size OR
-            #  1b. we are within 50% of the target chunk size
+            #  1b. we are within 50% of the target chunk size OR
+            #  2. the size of the auto chunks matches the shape of the array
 
-            idx = np.argmax(shape / chunks * auto_chunks)
+            num_chunks = shape / chunks * auto_chunks
+            idx = np.argmax(num_chunks)
             chunk_bytes = np.prod(chunks) * typesize
 
             if chunk_bytes > target or abs(chunk_bytes - target) / target < 0.5:
                 break
 
-            if np.prod(chunks) == 1:
-                break  # Element size larger than max_bytes
+            if (num_chunks <= 1).all():
+                break
 
-            chunks[idx] = chunks[idx] * 2
+            chunks[idx] = min(chunks[idx] * 2, shape[idx])
 
         return tuple(int(x) for x in chunks)
 
@@ -114,13 +115,17 @@ def normalize_chunks(
         """Called by open_dataset"""
         from dask.array.core import normalize_chunks
 
-        chunks = self.meta_chunks(
-            chunks,
-            shape=shape,
-            target=128 * 1024 * 1024,
-            typesize=dtype.itemsize,
-            encoded_chunks=previous_chunks,
-        )  # type: ignore[no-untyped-call]
+        if any(c == "preserve" for c in chunks) and any(c == "auto" for c in chunks):
+            raise ValueError('chunks cannot use a combination of "auto" and "preserve"')
+
+        if previous_chunks and any(c == "preserve" for c in chunks):
+            chunks = self.meta_chunks(
+                chunks,
+                shape=shape,
+                target=96 * 1024 * 1024,
+                typesize=dtype.itemsize,
+                encoded_chunks=previous_chunks,
+            )  # type: ignore[no-untyped-call]
 
         return normalize_chunks(
             chunks,
diff --git a/xarray/namedarray/utils.py b/xarray/namedarray/utils.py
index 3490a76aa8d..26d99d7d40a 100644
--- a/xarray/namedarray/utils.py
+++ b/xarray/namedarray/utils.py
@@ -222,7 +222,7 @@ def _get_chunk(  # type: ignore[no-untyped-def]
     preferred_chunk_shape = tuple(
         itertools.starmap(preferred_chunks.get, zip(dims, shape, strict=True))
     )
-    if isinstance(chunks, Number) or (chunks == "auto"):
+    if isinstance(chunks, (Number, str)):
        chunks = dict.fromkeys(dims, chunks)
    chunk_shape = tuple(
        chunks.get(dim, None) or preferred_chunk_sizes
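
At this point the intended user-facing contract looks roughly like the sketch
below ("store.zarr" is a hypothetical path; the remaining wiring lands in the
later patches of the series):

    import xarray as xr

    # "preserve" grows dask chunks in whole multiples of the encoded chunks;
    # "auto" may split them. The two modes cannot be mixed.
    ds = xr.open_dataset("store.zarr", engine="zarr", chunks="preserve")

    xr.open_dataset("store.zarr", engine="zarr", chunks={"x": "auto", "y": "preserve"})
    # ValueError: chunks cannot use a combination of "auto" and "preserve"
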
From 4461f87e4c631df8a28d644ad84be000a0baceb1 Mon Sep 17 00:00:00 2001
From: Julia Signell
Date: Tue, 30 Dec 2025 15:34:01 -0500
Subject: [PATCH 04/10] Make sure api allows chunks='preserve'

---
 xarray/backends/api.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index 5cb879620cb..504c3f31e5a 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -291,9 +291,9 @@ def _dataset_from_backend_dataset(
     create_default_indexes,
     **extra_tokens,
 ):
-    if not isinstance(chunks, int | dict) and chunks not in {None, "auto"}:
+    if not isinstance(chunks, int | dict) and chunks not in {None, "auto", "preserve"}:
         raise ValueError(
-            f"chunks must be an int, dict, 'auto', or None. Instead found {chunks}."
+            f"chunks must be an int, dict, 'auto', 'preserve', or None. Instead found {chunks}."
         )
 
     _protect_dataset_variables_inplace(backend_ds, cache)
From c58093fc6249903d37825423b3d0a9ab3759c4a4 Mon Sep 17 00:00:00 2001
From: Julia Signell
Date: Tue, 30 Dec 2025 15:59:54 -0500
Subject: [PATCH 05/10] Add types

---
 xarray/namedarray/_typing.py     |  2 +-
 xarray/namedarray/daskmanager.py | 52 ++++++++++++++++++--------------
 2 files changed, 31 insertions(+), 23 deletions(-)

diff --git a/xarray/namedarray/_typing.py b/xarray/namedarray/_typing.py
index 9610b96d4f9..f9227f7796a 100644
--- a/xarray/namedarray/_typing.py
+++ b/xarray/namedarray/_typing.py
@@ -74,7 +74,7 @@ def dtype(self) -> _DType_co: ...
 _NormalizedChunks = tuple[tuple[int, ...], ...]
 # FYI in some cases we don't allow `None`, which this doesn't take account of.
 # # FYI the `str` is for a size string, e.g. "16MB", supported by dask.
-T_ChunkDim: TypeAlias = str | int | Literal["auto"] | tuple[int, ...] | None  # noqa: PYI051
+T_ChunkDim: TypeAlias = str | int | Literal["auto", "preserve"] | tuple[int, ...] | None  # noqa: PYI051
 # We allow the tuple form of this (though arguably we could transition to named dims only)
 T_Chunks: TypeAlias = T_ChunkDim | Mapping[Any, T_ChunkDim]
diff --git a/xarray/namedarray/daskmanager.py b/xarray/namedarray/daskmanager.py
index 23673067882..c7bf1230003 100644
--- a/xarray/namedarray/daskmanager.py
+++ b/xarray/namedarray/daskmanager.py
@@ -43,54 +43,62 @@ def is_chunked_array(self, data: duckarray[Any, Any]) -> bool:
     def chunks(self, data: Any) -> _NormalizedChunks:
         return data.chunks  # type: ignore[no-any-return]
 
-    def meta_chunks(self, chunks, shape, target, typesize, encoded_chunks):
+    def preserve_chunks(
+        self,
+        chunks: T_Chunks,
+        shape: tuple[int, ...],
+        target: int,
+        typesize: int,
+        previous_chunks: tuple[int],
+    ) -> tuple[int]:
         """Determine meta chunks
 
-        This takes in a chunks value that contains ``"auto"`` values in certain
+        This takes in a chunks value that contains ``"preserve"`` values in certain
         dimensions and replaces those values with concrete dimension sizes that try
         to get chunks to be of a certain size in bytes, provided by the ``limit=``
-        keyword. Any dimensions marked as ``"auto"`` will potentially be multiplied
-        to get close to the byte target, while never splitting ``encoded_chunks``.
+        keyword. Any dimensions marked as ``"preserve"`` will potentially be multiplied
+        to get close to the byte target, while never splitting ``previous_chunks``.
 
         Parameters
         ----------
-        chunks: tuple
+        chunks: tuple[int | str | tuple, ...]
             A tuple of either dimensions or tuples of explicit chunk dimensions.
-            Some entries should be "auto". Any explicit dimensions must match or
-            be a multiple of ``encoded_chunks``.
+            Some entries should be "preserve". Any explicit dimensions must match or
+            be a multiple of ``previous_chunks``.
         shape: tuple[int]
             The shape of the array
         target: int
             The target size of the chunk in bytes.
         typesize: int
             The size, in bytes, of each element of the chunk.
-        encoded_chunks: tuple[int]
+        previous_chunks: tuple[int]
+            Size of chunks being preserved. Expressed as a tuple of ints which matches
+            the way chunks are encoded in Zarr.
         """
         shape = np.array(shape)
+        previous_chunks = np.array(previous_chunks)
 
-        # "auto" stays as "auto"
-        # empty tuple means match encoded chunks
+        # "preserve" stays as "preserve"
+        # empty tuple means match previous chunks
         # -1 means whole dim is in one chunk
         desired_chunks = np.array(
             [
-                c or encoded_chunks[i] if c != -1 else shape[i]
+                c or previous_chunks[i] if c != -1 else shape[i]
                 for i, c in enumerate(chunks)
             ]
         )
 
-        auto_chunks = desired_chunks == "preserve"
-        chunks = np.where(auto_chunks, np.array(encoded_chunks), desired_chunks).astype(
-            int
-        )
+        preserve_chunks = desired_chunks == "preserve"
+        chunks = np.where(preserve_chunks, previous_chunks, desired_chunks).astype(int)
 
         while True:
-            # Repeatedly loop over the ``encoded_chunks``, multiplying them by 2.
+            # Repeatedly loop over the ``previous_chunks``, multiplying them by 2.
             # Stop when:
             #  1a. we are larger than the target chunk size OR
             #  1b. we are within 50% of the target chunk size OR
-            #  2. the size of the auto chunks matches the shape of the array
+            #  2. the chunk covers the entire array
 
-            num_chunks = shape / chunks * auto_chunks
+            num_chunks = shape / chunks * preserve_chunks
             idx = np.argmax(num_chunks)
             chunk_bytes = np.prod(chunks) * typesize
 
@@ -119,13 +127,13 @@ def normalize_chunks(
             raise ValueError('chunks cannot use a combination of "auto" and "preserve"')
 
         if previous_chunks and any(c == "preserve" for c in chunks):
-            chunks = self.meta_chunks(
+            chunks = self.preserve_chunks(
                 chunks,
                 shape=shape,
                 target=96 * 1024 * 1024,
                 typesize=dtype.itemsize,
-                encoded_chunks=previous_chunks,
-            )  # type: ignore[no-untyped-call]
+                previous_chunks=previous_chunks,
+            )
 
        return normalize_chunks(
            chunks,
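
The widened alias is easiest to read off with a couple of annotated values
(a sketch; it assumes nothing beyond the two aliases shown above):

    from xarray.namedarray._typing import T_ChunkDim, T_Chunks

    per_dim: T_ChunkDim = "preserve"                # new literal, alongside "auto"
    explicit: T_ChunkDim = (100, 100)               # tuple form is still allowed
    mapping: T_Chunks = {"x": "preserve", "y": -1}  # per-dimension mapping form
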
From 1e85c59bc0a3a9f7681bef1125b35b42ae47520a Mon Sep 17 00:00:00 2001
From: Julia Signell
Date: Wed, 31 Dec 2025 06:40:55 -0500
Subject: [PATCH 06/10] Refactor and add test

* Move ``preserve_chunks`` to base ChunkManager class
* Get target size from dask config options for DaskManager
* Add test for open_zarr

---
 xarray/namedarray/daskmanager.py    | 71 +----------------------------
 xarray/namedarray/parallelcompat.py | 69 ++++++++++++++++++++++++++++
 xarray/tests/test_backends.py       | 38 +++++++++++++++
 3 files changed, 108 insertions(+), 70 deletions(-)

diff --git a/xarray/namedarray/daskmanager.py b/xarray/namedarray/daskmanager.py
index c7bf1230003..955a58b8a69 100644
--- a/xarray/namedarray/daskmanager.py
+++ b/xarray/namedarray/daskmanager.py
@@ -43,75 +43,6 @@ def is_chunked_array(self, data: duckarray[Any, Any]) -> bool:
     def chunks(self, data: Any) -> _NormalizedChunks:
         return data.chunks  # type: ignore[no-any-return]
 
-    def preserve_chunks(
-        self,
-        chunks: T_Chunks,
-        shape: tuple[int, ...],
-        target: int,
-        typesize: int,
-        previous_chunks: tuple[int],
-    ) -> tuple[int]:
-        """Determine meta chunks
-
-        This takes in a chunks value that contains ``"preserve"`` values in certain
-        dimensions and replaces those values with concrete dimension sizes that try
-        to get chunks to be of a certain size in bytes, provided by the ``limit=``
-        keyword. Any dimensions marked as ``"preserve"`` will potentially be multiplied
-        to get close to the byte target, while never splitting ``previous_chunks``.
-
-        Parameters
-        ----------
-        chunks: tuple[int | str | tuple, ...]
-            A tuple of either dimensions or tuples of explicit chunk dimensions.
-            Some entries should be "preserve". Any explicit dimensions must match or
-            be a multiple of ``previous_chunks``.
-        shape: tuple[int]
-            The shape of the array
-        target: int
-            The target size of the chunk in bytes.
-        typesize: int
-            The size, in bytes, of each element of the chunk.
-        previous_chunks: tuple[int]
-            Size of chunks being preserved. Expressed as a tuple of ints which matches
-            the way chunks are encoded in Zarr.
-        """
-        shape = np.array(shape)
-        previous_chunks = np.array(previous_chunks)
-
-        # "preserve" stays as "preserve"
-        # empty tuple means match previous chunks
-        # -1 means whole dim is in one chunk
-        desired_chunks = np.array(
-            [
-                c or previous_chunks[i] if c != -1 else shape[i]
-                for i, c in enumerate(chunks)
-            ]
-        )
-
-        preserve_chunks = desired_chunks == "preserve"
-        chunks = np.where(preserve_chunks, previous_chunks, desired_chunks).astype(int)
-
-        while True:
-            # Repeatedly loop over the ``previous_chunks``, multiplying them by 2.
-            # Stop when:
-            #  1a. we are larger than the target chunk size OR
-            #  1b. we are within 50% of the target chunk size OR
-            #  2. the chunk covers the entire array
-
-            num_chunks = shape / chunks * preserve_chunks
-            idx = np.argmax(num_chunks)
-            chunk_bytes = np.prod(chunks) * typesize
-
-            if chunk_bytes > target or abs(chunk_bytes - target) / target < 0.5:
-                break
-
-            if (num_chunks <= 1).all():
-                break
-
-            chunks[idx] = min(chunks[idx] * 2, shape[idx])
-
-        return tuple(int(x) for x in chunks)
-
     def normalize_chunks(
         self,
         chunks: T_Chunks | _NormalizedChunks,
@@ -130,7 +61,7 @@ def normalize_chunks(
             chunks = self.preserve_chunks(
                 chunks,
                 shape=shape,
-                target=96 * 1024 * 1024,
+                target=self.get_auto_chunk_size(),
                 typesize=dtype.itemsize,
                 previous_chunks=previous_chunks,
             )
diff --git a/xarray/namedarray/parallelcompat.py b/xarray/namedarray/parallelcompat.py
index cd57a8d4487..f34fea44b5b 100644
--- a/xarray/namedarray/parallelcompat.py
+++ b/xarray/namedarray/parallelcompat.py
@@ -777,3 +777,72 @@ def get_auto_chunk_size(
         raise NotImplementedError(
             "For 'auto' rechunking of cftime arrays, get_auto_chunk_size must be implemented by the chunk manager"
         )
+
+    @staticmethod
+    def preserve_chunks(
+        chunks: T_Chunks,
+        shape: tuple[int, ...],
+        target: int,
+        typesize: int,
+        previous_chunks: tuple[int],
+    ) -> tuple[int]:
+        """Determine meta chunks
+
+        This takes in a chunks value that contains ``"preserve"`` values in certain
+        dimensions and replaces those values with concrete dimension sizes that try
+        to get chunks to be of a certain size in bytes, provided by the ``limit=``
+        keyword. Any dimensions marked as ``"preserve"`` will potentially be multiplied
+        to get close to the byte target, while never splitting ``previous_chunks``.
+
+        Parameters
+        ----------
+        chunks: tuple[int | str | tuple, ...]
+            A tuple of either dimensions or tuples of explicit chunk dimensions.
+            Some entries should be "preserve". Any explicit dimensions must match or
+            be a multiple of ``previous_chunks``.
+        shape: tuple[int]
+            The shape of the array
+        target: int
+            The target size of the chunk in bytes.
+        typesize: int
+            The size, in bytes, of each element of the chunk.
+        previous_chunks: tuple[int]
+            Size of chunks being preserved. Expressed as a tuple of ints which matches
+            the way chunks are encoded in Zarr.
+        """
+        shape = np.array(shape)
+        previous_chunks = np.array(previous_chunks)
+
+        # "preserve" stays as "preserve"
+        # empty tuple means match previous chunks
+        # -1 means whole dim is in one chunk
+        desired_chunks = np.array(
+            [
+                c or previous_chunks[i] if c != -1 else shape[i]
+                for i, c in enumerate(chunks)
+            ]
+        )
+
+        preserve_chunks = desired_chunks == "preserve"
+        chunks = np.where(preserve_chunks, previous_chunks, desired_chunks).astype(int)
+
+        while True:
+            # Repeatedly loop over the ``previous_chunks``, multiplying them by 2.
+            # Stop when:
+            #  1a. we are larger than the target chunk size OR
+            #  1b. we are within 50% of the target chunk size OR
+            #  2. the chunk covers the entire array
+
+            num_chunks = shape / chunks * preserve_chunks
+            idx = np.argmax(num_chunks)
+            chunk_bytes = np.prod(chunks) * typesize
+
+            if chunk_bytes > target or abs(chunk_bytes - target) / target < 0.5:
+                break
+
+            if (num_chunks <= 1).all():
+                break
+
+            chunks[idx] = min(chunks[idx] * 2, shape[idx])
+
+        return tuple(int(x) for x in chunks)
diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py
index fdc7fdc8edb..7e3290e60a9 100644
--- a/xarray/tests/test_backends.py
+++ b/xarray/tests/test_backends.py
@@ -7284,6 +7284,44 @@ def test_chunking_consintency(chunks, tmp_path: Path) -> None:
         xr.testing.assert_chunks_equal(actual, expected)
 
 
+@requires_zarr
+@requires_dask
+@pytest.mark.parametrize(
+    "chunks,expected",
+    [
+        ("preserve", (320, 320)),
+        (-1, (500, 500)),
+        ({}, (10, 10)),
+        ({"x": "preserve"}, (500, 10)),
+        ({"x": -1}, (500, 10)),
+        ({"x": "preserve", "y": -1}, (160, 500)),
+    ],
+)
+def test_open_dataset_chunking_zarr_with_preserve(
+    chunks, expected, tmp_path: Path
+) -> None:
+    encoded_chunks = 10
+    dask_arr = da.from_array(
+        np.ones((500, 500), dtype="float64"), chunks=encoded_chunks
+    )
+    ds = xr.Dataset(
+        {
+            "test": xr.DataArray(
+                dask_arr,
+                dims=("x", "y"),
+            )
+        }
+    )
+    ds["test"].encoding["chunks"] = encoded_chunks
+    ds.to_zarr(tmp_path / "test.zarr")
+
+    with dask.config.set({"array.chunk-size": "1MiB"}):
+        with open_dataset(
+            tmp_path / "test.zarr", engine="zarr", chunks=chunks
+        ) as actual:
+            assert (actual.chunks["x"][0], actual.chunks["y"][0]) == expected
+
+
 def _check_guess_can_open_and_open(entrypoint, obj, engine, expected):
    assert entrypoint.guess_can_open(obj)
    with open_dataset(obj, engine=engine) as actual:
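
One parametrized case is worth checking by hand. For
({"x": "preserve", "y": -1}, (160, 500)): y is pinned to a single 500-element
chunk, so only x doubles from the encoded 10. A back-of-envelope sketch of the
stopping rule (standalone arithmetic, not code from the patch):

    target = 1024 * 1024         # the test sets dask's "array.chunk-size" to 1MiB
    x, y, typesize = 10, 500, 8  # x starts at the encoded chunk; y = -1 is one chunk

    while True:
        chunk_bytes = x * y * typesize
        if chunk_bytes > target or abs(chunk_bytes - target) / target < 0.5:
            break
        x = min(x * 2, 500)

    print((x, y))  # (160, 500): 160 * 500 * 8 bytes is ~625 KiB, within 50% of 1 MiB
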
From fccd2634aa58c810c5703f17e326420a8d0fd40d Mon Sep 17 00:00:00 2001
From: Julia Signell
Date: Wed, 31 Dec 2025 08:49:47 -0500
Subject: [PATCH 07/10] Add hypothesis testing

---
 properties/test_parallelcompat.py | 71 +++++++++++++++++++++++++++++++
 xarray/testing/strategies.py      | 68 +++++++++++++++++++++++++++++
 2 files changed, 139 insertions(+)
 create mode 100644 properties/test_parallelcompat.py

diff --git a/properties/test_parallelcompat.py b/properties/test_parallelcompat.py
new file mode 100644
index 00000000000..91325578619
--- /dev/null
+++ b/properties/test_parallelcompat.py
@@ -0,0 +1,71 @@
+import numpy as np
+import pytest
+
+pytest.importorskip("hypothesis")
+# isort: split
+
+from hypothesis import given
+
+import xarray.testing.strategies as xrst
+from xarray.namedarray.parallelcompat import ChunkManagerEntrypoint
+
+
+class TestPreserveChunks:
+    @given(xrst.shape_and_chunks())
+    def test_preserve_all_chunks(
+        self, shape_and_chunks: tuple[tuple[int, ...], tuple[int, ...]]
+    ) -> None:
+        shape, previous_chunks = shape_and_chunks
+        typesize = 8
+        target = 1024 * 1024
+
+        actual = ChunkManagerEntrypoint.preserve_chunks(
+            chunks=("preserve",) * len(shape),
+            shape=shape,
+            target=target,
+            typesize=typesize,
+            previous_chunks=previous_chunks,
+        )
+        for i, chunk in enumerate(actual):
+            if chunk != shape[i]:
+                assert chunk >= previous_chunks[i]
+                assert chunk % previous_chunks[i] == 0
+            assert chunk <= shape[i]
+
+        if actual != shape:
+            assert np.prod(actual) * typesize >= 0.5 * target
+
+    @pytest.mark.parametrize("first_chunk", [-1, (), 1])
+    @given(xrst.shape_and_chunks(min_dims=2))
+    def test_preserve_some_chunks(
+        self,
+        first_chunk: int | tuple[int, ...],
+        shape_and_chunks: tuple[tuple[int, ...], tuple[int, ...]],
+    ) -> None:
+        shape, previous_chunks = shape_and_chunks
+        typesize = 4
+        target = 2 * 1024 * 1024
+
+        actual = ChunkManagerEntrypoint.preserve_chunks(
+            chunks=(first_chunk, *["preserve" for _ in range(len(shape) - 1)]),
+            shape=shape,
+            target=target,
+            typesize=typesize,
+            previous_chunks=previous_chunks,
+        )
+        for i, chunk in enumerate(actual):
+            if i == 0:
+                if first_chunk == 1:
+                    assert chunk == 1
+                elif first_chunk == -1:
+                    assert chunk == shape[i]
+                elif first_chunk == ():
+                    assert chunk == previous_chunks[i]
+            elif chunk != shape[i]:
+                assert chunk >= previous_chunks[i]
+                assert chunk % previous_chunks[i] == 0
+            assert chunk <= shape[i]
+
+        # if we have more than one chunk, make sure the chunks are big enough
+        if actual[1:] != shape[1:]:
+            assert np.prod(actual) * typesize >= 0.5 * target
diff --git a/xarray/testing/strategies.py b/xarray/testing/strategies.py
index 9f6bb8110e8..af974a144be 100644
--- a/xarray/testing/strategies.py
+++ b/xarray/testing/strategies.py
@@ -31,6 +31,7 @@
     "names",
     "outer_array_indexers",
     "pandas_index_dtypes",
+    "shape_and_chunks",
     "supported_dtypes",
     "unique_subset_of",
     "variables",
@@ -210,6 +211,73 @@ def dimension_sizes(
     )
 
 
+@st.composite
+def shape_and_chunks(
+    draw: st.DrawFn,
+    *,
+    min_dims: int = 1,
+    max_dims: int = 4,
+    min_size: int = 1,
+    max_size: int = 900,
+) -> tuple[tuple[int, ...], tuple[int, ...]]:
+    """
+    Generate a shape tuple and corresponding chunks tuple.
+
+    Each element in the chunks tuple is smaller than or equal to the
+    corresponding element in the shape tuple.
+
+    Requires the hypothesis package to be installed.
+
+    Parameters
+    ----------
+    min_dims : int, optional
+        Minimum number of dimensions. Default is 1.
+    max_dims : int, optional
+        Maximum number of dimensions. Default is 4.
+    min_size : int, optional
+        Minimum size for each dimension. Default is 1.
+    max_size : int, optional
+        Maximum size for each dimension. Default is 900.
+
+    Returns
+    -------
+    tuple[tuple[int, ...], tuple[int, ...]]
+        A tuple containing (shape, chunks) where:
+        - shape is a tuple of positive integers
+        - chunks is a tuple where each element is an integer <= corresponding shape element
+
+    Examples
+    --------
+    >>> shape_and_chunks().example()  # doctest: +SKIP
+    ((5, 3, 8), (2, 3, 4))
+    >>> shape_and_chunks().example()  # doctest: +SKIP
+    ((10, 7), (10, 3))
+    >>> shape_and_chunks(min_dims=2, max_dims=3).example()  # doctest: +SKIP
+    ((4, 6, 2), (2, 3, 1))
+
+    See Also
+    --------
+    :ref:`testing.hypothesis`_
+    """
+    # Generate the shape tuple
+    ndim = draw(st.integers(min_value=min_dims, max_value=max_dims))
+    shape = draw(
+        st.tuples(
+            *[st.integers(min_value=min_size, max_value=max_size) for _ in range(ndim)]
+        )
+    )
+
+    # Generate chunks tuple with each element <= corresponding shape element
+    chunks_elements = []
+    for size in shape:
+        # Each chunk is an integer between 1 and the size of that dimension
+        chunk_element = draw(st.integers(min_value=1, max_value=size))
+        chunks_elements.append(chunk_element)
+
+    chunks = tuple(chunks_elements)
+    return shape, chunks
+
+
 _readable_strings = st.text(
    _readable_characters,
    max_size=5,
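
A minimal illustration of how the new strategy composes with hypothesis (a
sketch; the property checked here is just the strategy's own invariant):

    from hypothesis import given

    import xarray.testing.strategies as xrst


    @given(xrst.shape_and_chunks(min_dims=2, max_dims=3))
    def test_chunks_never_exceed_shape(shape_and_chunks):
        shape, chunks = shape_and_chunks
        assert len(chunks) == len(shape)
        assert all(1 <= c <= s for c, s in zip(chunks, shape))
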
From 478af0eef7b91c51a7b8fde3e20e71469d7dab67 Mon Sep 17 00:00:00 2001
From: Julia Signell
Date: Wed, 31 Dec 2025 09:04:39 -0500
Subject: [PATCH 08/10] Tidy up strategy

---
 xarray/testing/strategies.py | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/xarray/testing/strategies.py b/xarray/testing/strategies.py
index af974a144be..9f0a7080936 100644
--- a/xarray/testing/strategies.py
+++ b/xarray/testing/strategies.py
@@ -266,15 +266,10 @@ def shape_and_chunks(
             *[st.integers(min_value=min_size, max_value=max_size) for _ in range(ndim)]
         )
     )
-
-    # Generate chunks tuple with each element <= corresponding shape element
-    chunks_elements = []
-    for size in shape:
-        # Each chunk is an integer between 1 and the size of that dimension
-        chunk_element = draw(st.integers(min_value=1, max_value=size))
-        chunks_elements.append(chunk_element)
-
-    chunks = tuple(chunks_elements)
+    # Generate chunks tuple with each element <= corresponding shape element
+    chunks = draw(
+        st.tuples(*[st.integers(min_value=1, max_value=size) for size in shape])
+    )
     return shape, chunks

From a468c4b79eff9b09e193b9e861e3f26c31804d53 Mon Sep 17 00:00:00 2001
From: Julia Signell
Date: Wed, 31 Dec 2025 13:17:18 -0500
Subject: [PATCH 09/10] Fix up typing

---
 xarray/namedarray/daskmanager.py    |  9 ++++----
 xarray/namedarray/parallelcompat.py | 34 ++++++++++++++++++-----------
 2 files changed, 26 insertions(+), 17 deletions(-)

diff --git a/xarray/namedarray/daskmanager.py b/xarray/namedarray/daskmanager.py
index 955a58b8a69..6f6a046b684 100644
--- a/xarray/namedarray/daskmanager.py
+++ b/xarray/namedarray/daskmanager.py
@@ -11,6 +11,7 @@
 
 if TYPE_CHECKING:
     from xarray.namedarray._typing import (
+        T_ChunkDim,
         T_Chunks,
         _DType_co,
         _NormalizedChunks,
@@ -45,11 +46,11 @@ def chunks(self, data: Any) -> _NormalizedChunks:
 
     def normalize_chunks(
         self,
-        chunks: T_Chunks | _NormalizedChunks,
+        chunks: tuple[T_ChunkDim, ...] | _NormalizedChunks,
         shape: tuple[int, ...] | None = None,
         limit: int | None = None,
         dtype: _DType_co | None = None,
-        previous_chunks: _NormalizedChunks | None = None,
+        previous_chunks: tuple[int, ...] | _NormalizedChunks | None = None,
     ) -> Any:
         """Called by open_dataset"""
         from dask.array.core import normalize_chunks
@@ -57,12 +58,12 @@ def normalize_chunks(
         if any(c == "preserve" for c in chunks) and any(c == "auto" for c in chunks):
             raise ValueError('chunks cannot use a combination of "auto" and "preserve"')
 
-        if previous_chunks and any(c == "preserve" for c in chunks):
+        if shape and previous_chunks and any(c == "preserve" for c in chunks):
             chunks = self.preserve_chunks(
                 chunks,
                 shape=shape,
                 target=self.get_auto_chunk_size(),
-                typesize=dtype.itemsize,
+                typesize=getattr(dtype, "itemsize", 8),
                previous_chunks=previous_chunks,
            )
diff --git a/xarray/namedarray/parallelcompat.py b/xarray/namedarray/parallelcompat.py
index f34fea44b5b..b2069f60a80 100644
--- a/xarray/namedarray/parallelcompat.py
+++ b/xarray/namedarray/parallelcompat.py
@@ -20,6 +20,7 @@
 
 if TYPE_CHECKING:
     from xarray.namedarray._typing import (
+        T_ChunkDim,
         T_Chunks,
         _Chunks,
         _DType,
@@ -780,12 +781,12 @@ def get_auto_chunk_size(
 
     @staticmethod
     def preserve_chunks(
-        chunks: T_Chunks,
+        chunks: tuple[T_ChunkDim, ...],
         shape: tuple[int, ...],
         target: int,
         typesize: int,
-        previous_chunks: tuple[int],
-    ) -> tuple[int]:
+        previous_chunks: tuple[int, ...] | _NormalizedChunks,
+    ) -> tuple[T_ChunkDim, ...]:
         """Determine meta chunks
 
         This takes in a chunks value that contains ``"preserve"`` values in certain
@@ -810,21 +811,28 @@ def preserve_chunks(
             Size of chunks being preserved. Expressed as a tuple of ints which matches
             the way chunks are encoded in Zarr.
         """
+        # pop the first item off in case it's a tuple of tuples
+        preferred_chunks = np.array(
+            [c if isinstance(c, int) else c[0] for c in previous_chunks]
+        )
 
         # "preserve" stays as "preserve"
-        # empty tuple means match previous chunks
+        # None or empty tuple means match previous chunks
         # -1 means whole dim is in one chunk
         desired_chunks = np.array(
             [
-                c or previous_chunks[i] if c != -1 else shape[i]
+                c or preferred_chunks[i] if c != -1 else shape[i]
                 for i, c in enumerate(chunks)
             ]
         )
+
         preserve_chunks = desired_chunks == "preserve"
-        chunks = np.where(preserve_chunks, previous_chunks, desired_chunks).astype(int)
+
+        if not preserve_chunks.any():
+            return chunks
+
+        new_chunks = np.where(preserve_chunks, preferred_chunks, desired_chunks).astype(
+            int
+        )
 
         while True:
             # Repeatedly loop over the ``previous_chunks``, multiplying them by 2.
             # Stop when:
             #  1a. we are larger than the target chunk size OR
             #  1b. we are within 50% of the target chunk size OR
             #  2. the chunk covers the entire array
 
-            num_chunks = shape / chunks * preserve_chunks
+            num_chunks = np.array(shape) / new_chunks * preserve_chunks
             idx = np.argmax(num_chunks)
-            chunk_bytes = np.prod(chunks) * typesize
+            chunk_bytes = np.prod(new_chunks) * typesize
 
             if chunk_bytes > target or abs(chunk_bytes - target) / target < 0.5:
                 break
 
             if (num_chunks <= 1).all():
                 break
 
-            chunks[idx] = min(chunks[idx] * 2, shape[idx])
+            new_chunks[idx] = min(new_chunks[idx] * 2, shape[idx])
 
-        return tuple(int(x) for x in chunks)
+        return tuple(int(x) for x in new_chunks)
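
The new ``preferred_chunks`` handling is easiest to see with a concrete value.
Normalized chunks arrive as a tuple of per-dimension tuples, where a ragged
last entry is common; taking ``c[0]`` recovers the nominal encoded chunk size.
A standalone sketch of the comprehension above:

    # e.g. a (25, 20) array encoded with (10, 10) chunks normalizes to this:
    previous_chunks = ((10, 10, 5), (10, 10))

    preferred_chunks = [c if isinstance(c, int) else c[0] for c in previous_chunks]
    print(preferred_chunks)  # [10, 10]
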
From 20934cd260c1084f077a316b620ee9f99fb06cf6 Mon Sep 17 00:00:00 2001
From: Julia Signell
Date: Wed, 31 Dec 2025 13:45:52 -0500
Subject: [PATCH 10/10] Move `preserve_chunks` call out of `normalize_chunks`

---
 xarray/namedarray/daskmanager.py | 12 ------------
 xarray/namedarray/utils.py       | 14 ++++++++++++++
 2 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/xarray/namedarray/daskmanager.py b/xarray/namedarray/daskmanager.py
index 6f6a046b684..323beb6a37e 100644
--- a/xarray/namedarray/daskmanager.py
+++ b/xarray/namedarray/daskmanager.py
@@ -55,18 +55,6 @@ def normalize_chunks(
         """Called by open_dataset"""
         from dask.array.core import normalize_chunks
 
-        if any(c == "preserve" for c in chunks) and any(c == "auto" for c in chunks):
-            raise ValueError('chunks cannot use a combination of "auto" and "preserve"')
-
-        if shape and previous_chunks and any(c == "preserve" for c in chunks):
-            chunks = self.preserve_chunks(
-                chunks,
-                shape=shape,
-                target=self.get_auto_chunk_size(),
-                typesize=getattr(dtype, "itemsize", 8),
-                previous_chunks=previous_chunks,
-            )
-
         return normalize_chunks(
             chunks,
             shape=shape,
diff --git a/xarray/namedarray/utils.py b/xarray/namedarray/utils.py
index 26d99d7d40a..2a997b4a831 100644
--- a/xarray/namedarray/utils.py
+++ b/xarray/namedarray/utils.py
@@ -236,6 +236,20 @@ def _get_chunk(  # type: ignore[no-untyped-def]
         limit = None
     dtype = data.dtype
 
+    if any(c == "preserve" for c in chunk_shape) and any(
+        c == "auto" for c in chunk_shape
+    ):
+        raise ValueError('chunks cannot use a combination of "auto" and "preserve"')
+
+    if shape and preferred_chunk_shape and any(c == "preserve" for c in chunk_shape):
+        chunk_shape = chunkmanager.preserve_chunks(
+            chunk_shape,
+            shape=shape,
+            target=chunkmanager.get_auto_chunk_size(),
+            typesize=getattr(dtype, "itemsize", 8),
+            previous_chunks=preferred_chunk_shape,
+        )
+
    chunk_shape = chunkmanager.normalize_chunks(
        chunk_shape,
        shape=shape,
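
Taken together, the series ends with ``"preserve"`` handled in ``_get_chunk``
rather than inside the chunk manager's ``normalize_chunks``. An end-to-end
sketch of the resulting behavior (hypothetical store path; the concrete chunk
sizes depend on dask's ``array.chunk-size`` setting):

    import dask.config
    import xarray as xr

    with dask.config.set({"array.chunk-size": "128MiB"}):
        ds = xr.open_dataset("store.zarr", engine="zarr", chunks="preserve")

    # Every dask chunk is now a whole multiple of the encoded Zarr chunks,
    # grown toward (within 50% of) the 128 MiB target without splitting them.
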