Skip to content

Commit 6762c2e

Browse files
authored
Avoid redefining parent dimensions in NetCDF files (#10623)
* Avoid redefining parent dimensions in NetCDF files This changes the way DataTree objects are written to disk in a single call to `DataTree.to_netcdf()`, as well as netCDF groups written by passing an explicit `group` to `Dataset.to_netcdf()`. Conceivably we could only adjust the behavior for `DataTree`, but doing so for `Dataset` as well felt more consistent to me (and was also easier to implement). Fixes: GH10241 * Typo * doc failure * Create collect_ancestor_dimensions() helper function
1 parent b96d607 commit 6762c2e

File tree

6 files changed

+63
-2
lines changed

6 files changed

+63
-2
lines changed

doc/whats-new.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,13 @@ New Features
2121
Breaking changes
2222
~~~~~~~~~~~~~~~~
2323

24+
- When writing to NetCDF files with groups, Xarray no longer redefines dimensions
25+
that have the same size in parent groups (:issue:`10241`). This conforms with
26+
`CF Conventions for group scope <https://cfconventions.org/cf-conventions/cf-conventions.html#_scope>`_
27+
but may require adjustments for code that consumes NetCDF files produced by
28+
Xarray.
29+
By `Stephan Hoyer <https://github.com/shoyer>`_.
30+
2431

2532
Deprecations
2633
~~~~~~~~~~~~

xarray/backends/common.py

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,20 @@ def find_root_and_group(ds):
256256
return ds, group
257257

258258

259+
def collect_ancestor_dimensions(group) -> dict[str, int]:
    """Return the dimensions defined in the ancestors of ``group``.

    The walk starts at ``group.parent`` and follows ``parent`` links until
    one is ``None``; dimensions defined on ``group`` itself are not included.

    Parameters
    ----------
    group
        Object exposing a ``parent`` attribute (``None`` at the root). Every
        ancestor must expose a ``dimensions`` mapping from dimension name to
        an object supporting ``len()``.

    Returns
    -------
    dict[str, int]
        Mapping from dimension name to length. If a dimension is defined in
        multiple ancestors, the size from the closest ancestor is used.
    """
    dims: dict[str, int] = {}
    ancestor = group.parent
    while ancestor is not None:
        for name, dim in ancestor.dimensions.items():
            # Ancestors are visited nearest-first, so the first definition
            # seen for a name is the one that takes precedence.
            if name not in dims:
                dims[name] = len(dim)
        ancestor = ancestor.parent
    return dims
271+
272+
259273
def datatree_from_dict_with_io_cleanup(groups_dict: Mapping[str, Dataset]) -> DataTree:
260274
"""DataTree.from_dict with file clean-up."""
261275
try:
@@ -308,6 +322,9 @@ class AbstractDataStore:
308322
def get_dimensions(self): # pragma: no cover
309323
raise NotImplementedError()
310324

325+
def get_parent_dimensions(self): # pragma: no cover
326+
return {}
327+
311328
def get_attrs(self): # pragma: no cover
312329
raise NotImplementedError()
313330

@@ -563,21 +580,22 @@ def set_dimensions(self, variables, unlimited_dims=None):
563580
if unlimited_dims is None:
564581
unlimited_dims = set()
565582

583+
parent_dims = self.get_parent_dimensions()
566584
existing_dims = self.get_dimensions()
567585

568586
dims = {}
569587
for v in unlimited_dims: # put unlimited_dims first
570588
dims[v] = None
571589
for v in variables.values():
572-
dims.update(dict(zip(v.dims, v.shape, strict=True)))
590+
dims |= v.sizes
573591

574592
for dim, length in dims.items():
575593
if dim in existing_dims and length != existing_dims[dim]:
576594
raise ValueError(
577595
"Unable to update size for existing dimension"
578596
f"{dim!r} ({length} != {existing_dims[dim]})"
579597
)
580-
elif dim not in existing_dims:
598+
elif dim not in existing_dims and length != parent_dims.get(dim):
581599
is_unlimited = dim in unlimited_dims
582600
self.set_dimension(dim, length, is_unlimited)
583601

xarray/backends/h5netcdf_.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
WritableCFDataStore,
1717
_normalize_path,
1818
_open_remote_file,
19+
collect_ancestor_dimensions,
1920
datatree_from_dict_with_io_cleanup,
2021
find_root_and_group,
2122
)
@@ -287,6 +288,9 @@ def get_attrs(self):
287288
def get_dimensions(self):
288289
return FrozenDict((k, len(v)) for k, v in self.ds.dimensions.items())
289290

291+
def get_parent_dimensions(self):
292+
return FrozenDict(collect_ancestor_dimensions(self.ds))
293+
290294
def get_encoding(self):
291295
return {
292296
"unlimited_dims": {

xarray/backends/netCDF4_.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
T_PathFileOrDataStore,
1717
WritableCFDataStore,
1818
_normalize_path,
19+
collect_ancestor_dimensions,
1920
datatree_from_dict_with_io_cleanup,
2021
find_root_and_group,
2122
robust_getitem,
@@ -518,6 +519,9 @@ def get_attrs(self):
518519
def get_dimensions(self):
519520
return FrozenDict((k, len(v)) for k, v in self.ds.dimensions.items())
520521

522+
def get_parent_dimensions(self):
523+
return FrozenDict(collect_ancestor_dimensions(self.ds))
524+
521525
def get_encoding(self):
522526
return {
523527
"unlimited_dims": {

xarray/tests/test_backends.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1651,6 +1651,17 @@ def test_write_groups(self) -> None:
16511651
with self.open(tmp_file, group="data/2") as actual2:
16521652
assert_identical(data2, actual2)
16531653

1654+
def test_child_group_with_inconsistent_dimensions(self) -> None:
1655+
base = Dataset(coords={"x": [1, 2]})
1656+
child = Dataset(coords={"x": [1, 2, 3]})
1657+
with create_tmp_file() as tmp_file:
1658+
self.save(base, tmp_file)
1659+
self.save(child, tmp_file, group="child", mode="a")
1660+
with self.open(tmp_file) as actual_base:
1661+
assert_identical(base, actual_base)
1662+
with self.open(tmp_file, group="child") as actual_child:
1663+
assert_identical(child, actual_child)
1664+
16541665
@pytest.mark.parametrize(
16551666
"input_strings, is_bytes",
16561667
[

xarray/tests/test_backends_datatree.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -265,6 +265,23 @@ def test_write_subgroup(self, tmpdir):
265265
assert_equal(original_dt, roundtrip_dt)
266266
assert_identical(expected_dt, roundtrip_dt)
267267

268+
@requires_netCDF4
def test_no_redundant_dimensions(self, tmpdir):
    """Check that a child group does not redefine a dimension of its parent."""
    # regression test for https://github.com/pydata/xarray/issues/10241
    original_dt = DataTree.from_dict(
        {
            "/": xr.Dataset(coords={"x": [1, 2, 3]}),
            "/child": xr.Dataset({"foo": ("x", [4, 5, 6])}),
        }
    )
    filepath = tmpdir / "test.nc"
    original_dt.to_netcdf(filepath, engine=self.engine)

    # Inspect the file with netCDF4 directly so the assertions see the
    # dimensions stored in each group. The context manager closes the
    # handle even if an assertion fails.
    with nc4.Dataset(str(filepath)) as root:
        child = root.groups["child"]
        assert list(root.dimensions) == ["x"]
        # "x" has the same size (3) in the root, so the child defines nothing.
        assert list(child.dimensions) == []
284+
268285

269286
@requires_netCDF4
270287
class TestNetCDF4DatatreeIO(DatatreeIOBase):

0 commit comments

Comments
 (0)