Skip to content

Commit ea426d3

Browse files
authored
Warn if serializing non-virtual data (#657)
* test warning raised if writing non-virtual dataset to icechunk * warn for Dataset * generalize decorator to work for DataTree too * test warned on datatree to icechunk * lint * test to_kerchunk * release note * avoid writing to same store twice in test * remove decorator type hints
1 parent fa92cb6 commit ea426d3

File tree

4 files changed

+79
-1
lines changed

4 files changed

+79
-1
lines changed

docs/releases.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121
`xarray.DataTree.virtualize.to_icechunk()` for writing an `xarray.DataTree`
2222
to an Icechunk store ([#244](https://github.com/zarr-developers/VirtualiZarr/issues/244)). By
2323
[Chuck Daniels](https://github.com/chuckwondo).
24+
- Now throws a warning if you attempt to write an entirely non-virtual dataset to a virtual references format ([#657](https://github.com/zarr-developers/VirtualiZarr/pull/657)).
25+
By [Tom Nicholas](https://github.com/TomNicholas).
2426

2527
### Breaking changes
2628

virtualizarr/accessor.py

Lines changed: 52 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,16 @@
1+
import warnings
2+
from collections import deque
13
from datetime import datetime
4+
from functools import wraps
25
from pathlib import Path
3-
from typing import TYPE_CHECKING, Callable, Literal, overload
6+
from typing import (
7+
TYPE_CHECKING,
8+
Callable,
9+
Generator,
10+
Iterable,
11+
Literal,
12+
overload,
13+
)
414

515
import xarray as xr
616

@@ -12,6 +22,44 @@
1222
from icechunk import IcechunkStore # type: ignore[import-not-found]
1323

1424

25+
def warn_if_not_virtual(cls_name: Literal["Dataset", "DataTree"]):
26+
"""Decorator for methods which only make sense for fully virtual xarray objects."""
27+
28+
def decorator(func):
29+
@wraps(func)
30+
def wrapper(self, *args, **kwargs):
31+
all_vars: Iterable[xr.Variable]
32+
match cls_name:
33+
case "Dataset":
34+
all_vars = self.ds.variables.values()
35+
case "DataTree":
36+
all_vars = all_datatree_variables(self.dt)
37+
38+
if not any(isinstance(var.data, ManifestArray) for var in all_vars):
39+
warnings.warn(
40+
f"Attempting to write an entirely non-virtual {cls_name} to a virtual references format - i.e. your `xarray.{cls_name}` contains zero `ManifestArray` objects. "
41+
"This is almost certainly not intended, as the entire data contents will be duplicated rather than referenced. "
42+
f"This may have happened because you used `xarray.open_{cls_name}` instead of `virtualizarr.open_virtual_{cls_name}`, or you set all variables to be `loadable_variables`."
43+
"Please read the usage docs.",
44+
UserWarning,
45+
)
46+
47+
return func(self, *args, **kwargs)
48+
49+
return wrapper
50+
51+
return decorator
52+
53+
54+
def all_datatree_variables(root: xr.DataTree) -> Generator[xr.Variable, None, None]:
55+
"""Flat iterable over all variables in a DataTree"""
56+
queue = deque([root])
57+
while queue:
58+
node = queue.popleft()
59+
yield from node.variables.values()
60+
queue.extend(node.children.values())
61+
62+
1563
@xr.register_dataset_accessor("virtualize")
1664
class VirtualiZarrDatasetAccessor:
1765
"""
@@ -23,6 +71,7 @@ class VirtualiZarrDatasetAccessor:
2371
def __init__(self, ds: xr.Dataset):
2472
self.ds: xr.Dataset = ds
2573

74+
@warn_if_not_virtual("Dataset")
2675
def to_icechunk(
2776
self,
2877
store: "IcechunkStore",
@@ -93,6 +142,7 @@ def to_kerchunk(
93142
categorical_threshold: int = 10,
94143
) -> None: ...
95144

145+
@warn_if_not_virtual("Dataset")
96146
def to_kerchunk(
97147
self,
98148
filepath: str | Path | None = None,
@@ -235,6 +285,7 @@ class VirtualiZarrDataTreeAccessor:
235285
def __init__(self, dt: xr.DataTree):
236286
self.dt = dt
237287

288+
@warn_if_not_virtual("DataTree")
238289
def to_icechunk(
239290
self,
240291
store: "IcechunkStore",

virtualizarr/tests/test_writers/test_icechunk.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -473,6 +473,23 @@ def test_roundtrip_coords(
473473
assert set(roundtrip.coords) == set(vds.coords)
474474

475475

476+
class TestWarnIfNotVirtual:
477+
def test_warn_if_no_virtual_vars_dataset(self, icechunk_filestore: "IcechunkStore"):
478+
non_virtual_ds = xr.Dataset({"foo": ("x", [10, 20, 30]), "x": ("x", [1, 2, 3])})
479+
with pytest.warns(UserWarning, match="non-virtual"):
480+
non_virtual_ds.virtualize.to_icechunk(icechunk_filestore)
481+
482+
def test_warn_if_no_virtual_vars_datatree(
483+
self, icechunk_filestore: "IcechunkStore"
484+
):
485+
non_virtual_ds = xr.Dataset({"foo": ("x", [10, 20, 30]), "x": ("x", [1, 2, 3])})
486+
non_virtual_dt = xr.DataTree.from_dict(
487+
{"/": non_virtual_ds, "/group": non_virtual_ds}
488+
)
489+
with pytest.warns(UserWarning, match="non-virtual"):
490+
non_virtual_dt.virtualize.to_icechunk(icechunk_filestore)
491+
492+
476493
class TestAppend:
477494
"""
478495
Tests for appending to existing icechunk store.

virtualizarr/tests/test_writers/test_kerchunk.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
import numpy as np
22
import pandas as pd
3+
import pytest
4+
import xarray as xr
35
from xarray import Dataset
46
from zarr.core.metadata.v2 import ArrayV2Metadata
57

@@ -179,3 +181,9 @@ def testconvert_v3_to_v2_metadata(array_v3_metadata):
179181
assert filters_config["dtype"] == "<i8"
180182
assert filters_config["astype"] == "<i8"
181183
assert v2_metadata.attributes == {}
184+
185+
186+
def test_warn_if_no_virtual_vars():
187+
non_virtual_ds = xr.Dataset({"foo": ("x", [10, 20, 30]), "x": ("x", [1, 2, 3])})
188+
with pytest.warns(UserWarning, match="non-virtual"):
189+
non_virtual_ds.virtualize.to_kerchunk()

0 commit comments

Comments
 (0)