Skip to content
Open
Show file tree
Hide file tree
Changes from 25 commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
eb1a967
All works, just need to satisfy mypy and whatnot now
charles-turner-1 Jul 13, 2025
852476d
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 13, 2025
c921c59
Merge branch 'main' into autochunk-cftime
charles-turner-1 Jul 13, 2025
1aba531
Fix moving import to be optional
charles-turner-1 Jul 13, 2025
9429c3d
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 13, 2025
3c9d27e
Make mypy happy
charles-turner-1 Jul 13, 2025
5153d2d
Add some clarifying comments about what we need to do to optimise this
charles-turner-1 Jul 13, 2025
62e71e6
Merge branch 'autochunk-cftime' of https://github.com/charles-turner-…
charles-turner-1 Jul 14, 2025
cfdc31b
@dcherian's suggestions. Just need to update chunking strategy to res…
charles-turner-1 Jul 14, 2025
2f16bc7
Merge branch 'main' of https://github.com/charles-turner-1/xarray
charles-turner-1 Jul 14, 2025
ce720fa
Merge branch 'main' into autochunk-cftime
charles-turner-1 Jul 14, 2025
4fa58c1
Merge branch 'main' into autochunk-cftime
charles-turner-1 Jul 23, 2025
e58d6d7
Can now load cftime arrays with auto-chunking. Implementation still k…
charles-turner-1 Jul 23, 2025
590e503
Merge branch 'autochunk-cftime' of https://github.com/charles-turner-…
charles-turner-1 Jul 23, 2025
f953976
Test for autochunking when reading from disk
charles-turner-1 Jul 25, 2025
6706524
replace `build_chunkspec` with faking the dtype of a cftime array & a…
charles-turner-1 Jul 25, 2025
4e56acd
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 25, 2025
0d008cd
Merge branch 'autochunk-cftime' of https://github.com/charles-turner-…
charles-turner-1 Jul 25, 2025
49c4e9c
Merge branch 'main' into autochunk-cftime
charles-turner-1 Jul 25, 2025
4594099
Merge branch 'autochunk-cftime' of https://github.com/charles-turner-…
charles-turner-1 Jul 25, 2025
5d00b0a
Remove redundant comments, rename things to make them clearer, add mo…
charles-turner-1 Jul 25, 2025
80421ef
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 25, 2025
d1f7ad3
Refactor to move most of the changes into the DaskManager
charles-turner-1 Jul 25, 2025
1b7de62
Merge branch 'autochunk-cftime' of https://github.com/charles-turner-…
charles-turner-1 Jul 25, 2025
4407185
bare-min tests should pass now?
charles-turner-1 Jul 25, 2025
d8f45b2
Deepak's suggestions (think mypy is still going to be angry for now)
charles-turner-1 Jul 28, 2025
20226c1
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 28, 2025
11ac9f0
Merge branch 'autochunk-cftime' of https://github.com/charles-turner-…
charles-turner-1 Jul 28, 2025
8485df5
Fix errant line
charles-turner-1 Jul 28, 2025
2c27877
Clean up `DaskManager.rechunk` a bit - maybe possible to remove more …
charles-turner-1 Jul 28, 2025
0983261
Remove unused import
charles-turner-1 Jul 28, 2025
c4ec31f
Merge branch 'main' into autochunk-cftime
charles-turner-1 Aug 8, 2025
adbf5b2
Merge branch 'autochunk-cftime' of https://github.com/charles-turner-…
charles-turner-1 Aug 8, 2025
6c93bc4
Fix a couple of type errors
charles-turner-1 Aug 8, 2025
74bc0ea
Mypy & tests passing locally
charles-turner-1 Aug 8, 2025
0b9bbd0
Merge branch 'main' into autochunk-cftime
charles-turner-1 Aug 12, 2025
e58322f
Merge branch 'main' into autochunk-cftime
charles-turner-1 Aug 20, 2025
dbc6ebd
Merge branch 'main' into autochunk-cftime
charles-turner-1 Aug 24, 2025
5680663
Merge branch 'autochunk-cftime' of https://github.com/charles-turner-…
charles-turner-1 Aug 24, 2025
b5933ed
Deepak's comments
charles-turner-1 Aug 25, 2025
5db9225
Merge branch 'main' into autochunk-cftime
charles-turner-1 Aug 26, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 64 additions & 0 deletions xarray/namedarray/daskmanager.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,21 @@

import numpy as np

from xarray.core.common import _contains_cftime_datetimes
from xarray.core.indexing import ImplicitToExplicitIndexingAdapter
from xarray.namedarray.parallelcompat import ChunkManagerEntrypoint, T_ChunkedArray
from xarray.namedarray.utils import is_duck_dask_array, module_available

if TYPE_CHECKING:
from xarray.core.variable import Variable
from xarray.namedarray._typing import (
T_Chunks,
_DType,
_DType_co,
_NormalizedChunks,
duckarray,
)
from xarray.namedarray.parallelcompat import _Chunks

try:
from dask.array import Array as DaskArray
Expand Down Expand Up @@ -264,3 +268,63 @@ def shuffle(
if chunks != "auto":
raise NotImplementedError("Only chunks='auto' is supported at present.")
return dask.array.shuffle(x, indexer, axis, chunks="auto")

def rechunk( # type: ignore[override]
self,
data: T_ChunkedArray,
chunks: _NormalizedChunks | tuple[int, ...] | _Chunks,
**kwargs: Any,
) -> Any:
"""
Changes the chunking pattern of the given array.

Called when the .chunk method is called on an xarray object that is already chunked.

Parameters
----------
data : dask array
Array to be rechunked.
chunks : int, tuple, dict or str, optional
The new block dimensions to create. -1 indicates the full size of the
corresponding dimension. Default is "auto" which automatically
determines chunk sizes.

Returns
-------
chunked array

See Also
--------
dask.array.Array.rechunk
cubed.Array.rechunk
"""

if _contains_cftime_datetimes(data):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess this can be deleted

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Had a play and I don't think I can fully get rid of it, I've reused as much of the abstracted logic as possible though.

from dask import config as dask_config
from dask.array.core import normalize_chunks
from dask.utils import parse_bytes

from xarray.namedarray.utils import fake_target_chunksize

target_chunksize = parse_bytes(dask_config.get("array.chunk-size"))
limit, var_dtype = fake_target_chunksize( # type: ignore[var-annotated]
data, target_chunksize=target_chunksize
)

chunks = normalize_chunks(
chunks,
shape=data.shape, # type: ignore[attr-defined]
dtype=var_dtype,
limit=limit,
) # type: ignore[no-untyped-call]

return data.rechunk(chunks, **kwargs)

def get_auto_chunk_size(self, var: Variable) -> tuple[int, _DType]:
from dask import config as dask_config
from dask.utils import parse_bytes

from xarray.namedarray.utils import fake_target_chunksize

target_chunksize = parse_bytes(dask_config.get("array.chunk-size"))
return fake_target_chunksize(var, target_chunksize=target_chunksize)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
def get_auto_chunk_size(self, var: Variable) -> tuple[int, _DType]:
from dask import config as dask_config
from dask.utils import parse_bytes
from xarray.namedarray.utils import fake_target_chunksize
target_chunksize = parse_bytes(dask_config.get("array.chunk-size"))
return fake_target_chunksize(var, target_chunksize=target_chunksize)
def get_auto_chunk_size(self) -> int:
from dask import config as dask_config
from dask.utils import parse_bytes
return parse_bytes(dask_config.get("array.chunk-size"))

Only this much is dask-specific, so that's what the DaskManager should be responsible for.

29 changes: 29 additions & 0 deletions xarray/namedarray/parallelcompat.py
Original file line number Diff line number Diff line change
Expand Up @@ -746,3 +746,32 @@ def store(
cubed.store
"""
raise NotImplementedError()

def get_auto_chunk_size(
self,
var,
) -> tuple[int, _DType]:
"""
Get the default chunk size for a variable.

This is used to determine the chunk size when opening a dataset with
``chunks="auto"`` or when rechunking an array with ``chunks="auto"``.

Parameters
----------
var : xarray.Variable
The variable for which to get the chunk size.
target_chunksize : int, optional
The target chunk size in bytes. If not provided, a default value is used.

Returns
-------
chunk_size : int
The chunk size in bytes.
dtype : np.dtype
The data type of the variable.
"""

raise NotImplementedError(
"get_auto_chunk_size must be implemented by the chunk manager."
)
36 changes: 35 additions & 1 deletion xarray/namedarray/utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations

import importlib
import sys
import warnings
from collections.abc import Hashable, Iterable, Iterator, Mapping
from functools import lru_cache
Expand All @@ -23,7 +24,9 @@
DaskArray = NDArray # type: ignore[assignment, misc]
DaskCollection: Any = NDArray # type: ignore[no-redef]

from xarray.namedarray._typing import _Dim, duckarray
from xarray.core.variable import Variable
from xarray.namedarray._typing import DuckArray, _Dim, _DType, duckarray
from xarray.namedarray.parallelcompat import T_ChunkedArray


K = TypeVar("K")
Expand Down Expand Up @@ -195,6 +198,37 @@ def either_dict_or_kwargs(
return pos_kwargs


def fake_target_chunksize(
data: DuckArray[Any] | T_ChunkedArray | Variable,
target_chunksize: int,
) -> tuple[int, _DType]:
"""
Naughty trick - let's get the ratio of our cftime_nbytes, and then compute
the ratio of that size to a np.float64. Then we can just adjust our target_chunksize
and use the default dask chunking algorithm to get a reasonable chunk size.

? I don't think T_chunkedArray or Variable should be necessary, but the calls
? to this in daskmanager.py requires it to be that. I still need to wrap my head
? around the typing here a bit more.
"""
import numpy as np
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

let's move imports to the top if we can; and remove the no_op bit

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can only move numpy to the top - moving from xarray.core.formatting import first_n_items creates a ciruclar import

Copy link
Author

@charles-turner-1 charles-turner-1 Aug 25, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've removed the no_op stuff - this has the effect of assuming uniform dtypes across all arrays going into dask now. All tests pass (locally) so it's probably not a big deal - I'm not even sure that numpy would allow mixed dtypes, it doesn't feel like it should, but it might be worth noting?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What do you mean?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looked this up subsequently and I think I'm talking waffle - the if no_op was just in there so that the logic for getting the array item size (in bytes) from the first item was skipped if we didn't find a cftime dtype in the array and a request for auto chunking.

Since arrays can only contain a single dtype, this shouldn't make any difference.

TLDR; ignore my previous comment, it was nonsense


from xarray.core.formatting import first_n_items

output_dtype: _DType = np.dtype(np.float64) # type: ignore[assignment]

if data.dtype == object:
nbytes_approx: int = sys.getsizeof(first_n_items(data, 1)) # type: ignore[no-untyped-call]
else:
nbytes_approx = data[0].itemsize

f64_nbytes = output_dtype.itemsize # Should be 8 bytes

target_chunksize = int(target_chunksize * (f64_nbytes / nbytes_approx))

return target_chunksize, output_dtype


class ReprObject:
"""Object that prints as the given value, for use with sentinel values."""

Expand Down
20 changes: 18 additions & 2 deletions xarray/structure/chunks.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@
from typing import TYPE_CHECKING, Any, Literal, TypeVar, Union, overload

from xarray.core import utils
from xarray.core.utils import emit_user_level_warning
from xarray.core.common import _contains_cftime_datetimes
from xarray.core.utils import emit_user_level_warning, is_dict_like
from xarray.core.variable import IndexVariable, Variable
from xarray.namedarray.parallelcompat import (
ChunkManagerEntrypoint,
Expand Down Expand Up @@ -83,8 +84,23 @@ def _get_chunk(var: Variable, chunks, chunkmanager: ChunkManagerEntrypoint):
for dim, preferred_chunk_sizes in zip(dims, preferred_chunk_shape, strict=True)
)

# Chunks can be either dict-like or tuple-like (according to type annotations)
# at this point, so check for # this before we manually construct our chunk
# spec- if we've set chunks to auto
_chunks = list(chunks.values()) if is_dict_like(chunks) else chunks
auto_chunks = all(_chunk == "auto" for _chunk in _chunks)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think technically a subset of this tuple can be "auto" but we can ignore this wrinkle for now.


if _contains_cftime_datetimes(var) and auto_chunks:
limit, var_dtype = chunkmanager.get_auto_chunk_size(var)
else:
limit, var_dtype = None, var.dtype
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This logic would change to use fake_target_chunksize


chunk_shape = chunkmanager.normalize_chunks(
chunk_shape, shape=shape, dtype=var.dtype, previous_chunks=preferred_chunk_shape
chunk_shape,
shape=shape,
dtype=var_dtype,
limit=limit,
previous_chunks=preferred_chunk_shape,
)

# Warn where requested chunks break preferred chunks, provided that the variable
Expand Down
29 changes: 29 additions & 0 deletions xarray/tests/test_backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -5427,6 +5427,35 @@ def test_open_multi_dataset(self) -> None:
) as actual:
assert_identical(expected, actual)

def test_open_dataset_cftime_autochunk(self) -> None:
"""Create a dataset with cftime datetime objects and
ensure that auto-chunking works correctly."""
import cftime

from xarray.core.common import _contains_cftime_datetimes

original = xr.Dataset(
{
"foo": ("time", [0.0]),
"time_bnds": (
("time", "bnds"),
[
[
cftime.Datetime360Day(2005, 12, 1, 0, 0, 0, 0),
cftime.Datetime360Day(2005, 12, 2, 0, 0, 0, 0),
]
],
),
},
{"time": [cftime.Datetime360Day(2005, 12, 1, 12, 0, 0, 0)]},
)
with create_tmp_file() as tmp:
original.to_netcdf(tmp)
with open_dataset(tmp, chunks="auto") as actual:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
with create_tmp_file() as tmp:
original.to_netcdf(tmp)
with open_dataset(tmp, chunks="auto") as actual:
with self.roundtrip(original, open_kwargs={"chunks": "auto"}) as actual:

assert isinstance(actual.time_bnds.variable.data, da.Array)
assert _contains_cftime_datetimes(actual.time)
assert_identical(original, actual)

# Flaky test. Very open to contributions on fixing this
@pytest.mark.flaky
def test_dask_roundtrip(self) -> None:
Expand Down
31 changes: 31 additions & 0 deletions xarray/tests/test_dask.py
Original file line number Diff line number Diff line change
Expand Up @@ -1060,6 +1060,29 @@ def make_da():
return da


def make_da_cftime():
yrs = np.arange(2000, 2120)
cftime_dates = xr.date_range(
start=f"{yrs[0]}-01-01",
end=f"{yrs[-1]}-12-31",
freq="1YE",
use_cftime=True,
)
yr_array = np.tile(cftime_dates.values, (10, 1))
da = xr.DataArray(
yr_array,
dims=["x", "t"],
coords={"x": np.arange(10), "t": cftime_dates},
name="a",
).chunk({"x": 4, "t": 5})
da.x.attrs["long_name"] = "x"
da.attrs["test"] = "test"
da.coords["c2"] = 0.5
da.coords["ndcoord"] = da.x * 2

return da


def make_ds():
map_ds = xr.Dataset()
map_ds["a"] = make_da()
Expand Down Expand Up @@ -1141,6 +1164,14 @@ def test_auto_chunk_da(obj):
assert actual.chunks == expected.chunks


@pytest.mark.parametrize("obj", [make_da_cftime()])
def test_auto_chunk_da_cftime(obj):
actual = obj.chunk("auto").data
expected = obj.data.rechunk({0: 10, 1: 120})
np.testing.assert_array_equal(actual, expected)
assert actual.chunks == expected.chunks


def test_map_blocks_error(map_da, map_ds):
def bad_func(darray):
return (darray * darray.x + 5 * darray.y)[:1, :1]
Expand Down
52 changes: 52 additions & 0 deletions xarray/tests/test_namedarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
_ShapeType_co,
)
from xarray.namedarray.core import NamedArray, from_array
from xarray.namedarray.utils import fake_target_chunksize

if TYPE_CHECKING:
from types import ModuleType
Expand All @@ -26,6 +27,7 @@

from xarray.namedarray._typing import (
Default,
DuckArray,
_AttrsLike,
_Dim,
_DimsLike,
Expand All @@ -37,6 +39,13 @@
duckarray,
)

try:
import cftime

cftime_available = True
except ModuleNotFoundError:
cftime_available = False


class CustomArrayBase(Generic[_ShapeType_co, _DType_co]):
def __init__(self, array: duckarray[Any, _DType_co]) -> None:
Expand Down Expand Up @@ -609,3 +618,46 @@ def test_repr() -> None:

# Basic comparison:
assert r == "<xarray.NamedArray (x: 1)> Size: 8B\narray([0], dtype=uint64)"


@pytest.mark.parametrize(
"input_array, expected_chunksize_faked",
[
(np.arange(100).reshape(10, 10), 1024),
(np.arange(100).reshape(10, 10).astype(np.float32), 2048),
(
pytest.param(
np.array(
[
cftime.Datetime360Day(2000, month, day, 0, 0, 0, 0)
for month in range(1, 11)
for day in range(1, 11)
],
dtype=object,
).reshape(10, 10),
73,
marks=pytest.mark.xfail(
not cftime_available,
reason="cftime not available, cannot test object dtype with cftime dates",
),
)
),
],
)
def test_fake_target_chunksize(
input_array: DuckArray[Any], expected_chunksize_faked: int
) -> None:
"""
Check that `fake_target_chunksize` returns the expected chunksize and dtype.
- It pretends to dask we are chunking an array with an 8-byte dtype, ie. a float64.
As such, it will *double* the amount of memory a 4-byte dtype (like float32) would try to use,
fooling it into actually using the correct amount of memory. For object dtypes, which are
generally larger, it will reduce the effective dask configuration chunksize, reducing the size of
the arrays per chunk such that we get the same amount of memory used.
"""
target_chunksize = 1024

faked_chunksize, dtype = fake_target_chunksize(input_array, target_chunksize) # type: ignore[var-annotated]

assert faked_chunksize == expected_chunksize_faked
assert dtype == np.float64
Loading