Skip to content

Commit 69316e5

Browse files
authored
Sanitize unlimited_dims when writing to_netcdf (#10608)
* Sanitize unlimited_dims when writing `to_netcdf`; raise ValueError when the encoding is inconsistent with the dataset.
* Update doc/whats-new.rst
1 parent d1873f9 commit 69316e5

File tree

4 files changed

+72
-28
lines changed

4 files changed

+72
-28
lines changed

doc/whats-new.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,9 @@ Bug fixes
5050
By `Deepak Cherian <https://github.com/dcherian>`_.
5151
- Fix detection of the ``h5netcdf`` backend. Xarray now selects ``h5netcdf`` if the default ``netCDF4`` engine is not available (:issue:`10401`, :pull:`10557`).
5252
By `Scott Staniewicz <https://github.com/scottstanie>`_.
53+
- Ensure ``unlimited_dims`` passed to :py:meth:`xarray.DataArray.to_netcdf`, :py:meth:`xarray.Dataset.to_netcdf` or :py:meth:`xarray.DataTree.to_netcdf` only contains dimensions present in the object; raise ``ValueError`` otherwise (:issue:`10549`, :pull:`10608`).
54+
By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.
55+
5356

5457
Documentation
5558
~~~~~~~~~~~~~

xarray/backends/api.py

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -244,6 +244,28 @@ def check_attr(name, value, valid_types):
244244
check_attr(k, v, valid_types)
245245

246246

247+
def _sanitize_unlimited_dims(dataset, unlimited_dims):
    """Normalize and validate the unlimited dimensions used for writing.

    Parameters
    ----------
    dataset
        Object exposing ``encoding`` (queried with ``.get("unlimited_dims")``)
        and ``dims`` (iterated to obtain the dimension names).
    unlimited_dims
        ``None``, a single dimension name, or an iterable of dimension names.
        When ``None``, the value stored under ``"unlimited_dims"`` in
        ``dataset.encoding`` is used instead, if there is one.

    Returns
    -------
    set or None
        The requested unlimited dimensions as a set, or ``None`` when neither
        the argument nor the encoding supplies any.

    Raises
    ------
    ValueError
        If any requested dimension is not among ``dataset.dims``. The message
        names the origin of the request (keyword argument or encoding).
    """
    # Remember where the request came from so the error can point at it.
    if unlimited_dims is not None:
        origin = "unlimited_dims-kwarg"
    else:
        unlimited_dims = dataset.encoding.get("unlimited_dims", None)
        origin = "dataset.encoding"

    if unlimited_dims is None:
        return None

    # A bare string is iterable, but it names exactly one dimension.
    is_single_name = isinstance(unlimited_dims, str) or not isinstance(
        unlimited_dims, Iterable
    )
    requested = {unlimited_dims} if is_single_name else set(unlimited_dims)

    unknown = requested - set(dataset.dims)
    if unknown:
        raise ValueError(
            f"Unlimited dimension(s) {unknown!r} declared in {origin!r}, "
            "but not part of current dataset dimensions. "
            f"Consider removing {unknown!r} from {origin!r}."
        )
    return requested
267+
268+
247269
def _resolve_decoders_kwargs(decode_cf, open_backend_dataset_parameters, **decoders):
248270
for d in list(decoders):
249271
if decode_cf is False and d in open_backend_dataset_parameters:
@@ -2007,6 +2029,8 @@ def to_netcdf(
20072029
# validate Dataset keys, DataArray names, and attr keys/values
20082030
_validate_dataset_names(dataset)
20092031
_validate_attrs(dataset, engine, invalid_netcdf)
2032+
# sanitize unlimited_dims
2033+
unlimited_dims = _sanitize_unlimited_dims(dataset, unlimited_dims)
20102034

20112035
try:
20122036
store_open = WRITEABLE_STORES[engine]
@@ -2045,14 +2069,6 @@ def to_netcdf(
20452069

20462070
store = store_open(target, mode, format, group, **kwargs)
20472071

2048-
if unlimited_dims is None:
2049-
unlimited_dims = dataset.encoding.get("unlimited_dims", None)
2050-
if unlimited_dims is not None:
2051-
if isinstance(unlimited_dims, str) or not isinstance(unlimited_dims, Iterable):
2052-
unlimited_dims = [unlimited_dims]
2053-
else:
2054-
unlimited_dims = list(unlimited_dims)
2055-
20562072
writer = ArrayWriter()
20572073

20582074
# TODO: figure out how to refactor this logic (here and in save_mfdataset)

xarray/tests/test_backends.py

Lines changed: 41 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1324,6 +1324,47 @@ def test_encoding_kwarg(self) -> None:
13241324
with self.roundtrip(ds, save_kwargs=kwargs) as actual:
13251325
pass
13261326

1327+
def test_encoding_unlimited_dims(self) -> None:
    """Check round-tripping and validation of ``unlimited_dims``.

    The unlimited dimension is supplied both through the ``unlimited_dims``
    save keyword and through ``ds.encoding``, each time as a list and as a
    bare string. Names that are not dimensions of the dataset must raise
    ``ValueError``. The test is skipped for ``ZarrBase`` instances.
    """
    if isinstance(self, ZarrBase):
        pytest.skip("No unlimited_dims handled in zarr.")

    ds = Dataset({"x": ("y", np.arange(10.0))})
    expected = {"y"}

    # Keyword argument: list form first, then the bare-string form
    # (regression test for https://github.com/pydata/xarray/issues/2134).
    for requested in (["y"], "y"):
        save_kwargs = dict(unlimited_dims=requested)
        with self.roundtrip(ds, save_kwargs=save_kwargs) as actual:
            assert actual.encoding["unlimited_dims"] == expected
            assert_equal(ds, actual)

    # Dataset encoding: the same two spellings, same regression (GH2134).
    for requested in (["y"], "y"):
        ds.encoding = {"unlimited_dims": requested}
        with self.roundtrip(ds) as actual:
            assert actual.encoding["unlimited_dims"] == expected
            assert_equal(ds, actual)

    # test unlimited_dims validation
    # https://github.com/pydata/xarray/issues/10549
    ds.encoding = {"unlimited_dims": "z"}
    encoding_error = pytest.raises(
        ValueError,
        match=r"Unlimited dimension\(s\) .* declared in 'dataset.encoding'",
    )
    with encoding_error, self.roundtrip(ds) as _:
        pass

    ds.encoding = {}
    kwarg_error = pytest.raises(
        ValueError,
        match=r"Unlimited dimension\(s\) .* declared in 'unlimited_dims-kwarg'",
    )
    with kwarg_error, self.roundtrip(
        ds, save_kwargs=dict(unlimited_dims=["z"])
    ) as _:
        pass
1367+
13271368
def test_encoding_kwarg_dates(self) -> None:
13281369
ds = Dataset({"t": pd.date_range("2000-01-01", periods=3)})
13291370
units = "days since 1900-01-01"
@@ -1918,16 +1959,6 @@ def test_read_variable_len_strings(self) -> None:
19181959
with open_dataset(tmp_file, **cast(dict, kwargs)) as actual:
19191960
assert_identical(expected, actual)
19201961

1921-
def test_encoding_unlimited_dims(self) -> None:
1922-
ds = Dataset({"x": ("y", np.arange(10.0))})
1923-
with self.roundtrip(ds, save_kwargs=dict(unlimited_dims=["y"])) as actual:
1924-
assert actual.encoding["unlimited_dims"] == set("y")
1925-
assert_equal(ds, actual)
1926-
ds.encoding = {"unlimited_dims": ["y"]}
1927-
with self.roundtrip(ds) as actual:
1928-
assert actual.encoding["unlimited_dims"] == set("y")
1929-
assert_equal(ds, actual)
1930-
19311962
def test_raise_on_forward_slashes_in_names(self) -> None:
19321963
# test for forward slash in variable names and dimensions
19331964
# see GH 7943
@@ -4393,16 +4424,6 @@ def test_read_byte_attrs_as_unicode(self) -> None:
43934424
expected = Dataset(attrs={"foo": "bar"})
43944425
assert_identical(expected, actual)
43954426

4396-
def test_encoding_unlimited_dims(self) -> None:
4397-
ds = Dataset({"x": ("y", np.arange(10.0))})
4398-
with self.roundtrip(ds, save_kwargs=dict(unlimited_dims=["y"])) as actual:
4399-
assert actual.encoding["unlimited_dims"] == set("y")
4400-
assert_equal(ds, actual)
4401-
ds.encoding = {"unlimited_dims": ["y"]}
4402-
with self.roundtrip(ds) as actual:
4403-
assert actual.encoding["unlimited_dims"] == set("y")
4404-
assert_equal(ds, actual)
4405-
44064427
def test_compression_encoding_h5py(self) -> None:
44074428
ENCODINGS: tuple[tuple[dict[str, Any], dict[str, Any]], ...] = (
44084429
# h5py style compression with gzip codec will be converted to

xarray/tests/test_conventions.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -606,6 +606,10 @@ def test_encoding_kwarg_fixed_width_string(self) -> None:
606606
# CFEncodedInMemoryStore doesn't support explicit string encodings.
607607
pass
608608

609+
def test_encoding_unlimited_dims(self) -> None:
    # Intentionally a no-op: CFEncodedInMemoryStore doesn't support
    # unlimited_dims, so there is nothing to round-trip or validate here.
    # NOTE(review): presumably this overrides a test of the same name from a
    # base class that is not visible in this view; confirm.
    pass
612+
609613

610614
class TestDecodeCFVariableWithArrayUnits:
611615
def test_decode_cf_variable_with_array_units(self) -> None:

0 commit comments

Comments
 (0)