Skip to content
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,17 @@ def netcdf4_file_with_data_in_multiple_groups(tmp_path: Path) -> str:
return str(filepath)


@pytest.fixture
def netcdf4_file_with_data_in_sibling_groups(tmp_path: Path) -> str:
    """Create a NetCDF4 file holding one variable in each of two sibling groups."""
    path = tmp_path / "test.nc"
    # First write creates the file; mode="a" appends the second group to it.
    xr.DataArray([1, 2, 3], name="foo").to_dataset().to_netcdf(path, group="subgroup1")
    xr.DataArray([4, 5], name="bar").to_dataset().to_netcdf(
        path, group="subgroup2", mode="a"
    )
    return str(path)


@pytest.fixture
def netcdf4_files_factory(tmp_path: Path) -> Callable[[], tuple[str, str]]:
"""Factory fixture to create multiple NetCDF4 files."""
Expand Down
3 changes: 3 additions & 0 deletions docs/api/virtualizarr.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@ Users can use xarray for every step apart from reading and serializing virtual r

::: virtualizarr.open_virtual_mfdataset

::: virtualizarr.open_virtual_datatree


## Information

::: virtualizarr.accessor.VirtualiZarrDatasetAccessor.nbytes
Expand Down
1 change: 1 addition & 0 deletions mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@ plugins:
- https://icechunk.io/en/stable/objects.inv
- https://lithops-cloud.github.io/docs/objects.inv
- https://docs.dask.org/en/stable/objects.inv
- https://virtual-tiff.readthedocs.io/en/latest/objects.inv
# https://github.com/developmentseed/titiler/blob/50934c929cca2fa8d3c408d239015f8da429c6a8/docs/mkdocs.yml#L115-L140
markdown_extensions:
- admonition
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ upstream = [
's3fs @ git+https://github.com/fsspec/s3fs',
'kerchunk @ git+https://github.com/fsspec/kerchunk',
'icechunk @ git+https://github.com/earth-mover/icechunk#subdirectory=icechunk-python',
'virtual_tiff @ git+https://github.com/virtual-zarr/virtual-tiff',
]
docs = [
"mkdocs-material[imaging]>=9.6.14",
Expand Down
7 changes: 6 additions & 1 deletion virtualizarr/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,11 @@
VirtualiZarrDatasetAccessor,
VirtualiZarrDataTreeAccessor,
)
from virtualizarr.xarray import open_virtual_dataset, open_virtual_mfdataset
from virtualizarr.xarray import (
open_virtual_dataset,
open_virtual_datatree,
open_virtual_mfdataset,
)

try:
__version__ = _version("virtualizarr")
Expand All @@ -18,4 +22,5 @@
"VirtualiZarrDataTreeAccessor",
"open_virtual_dataset",
"open_virtual_mfdataset",
"open_virtual_datatree",
]
33 changes: 33 additions & 0 deletions virtualizarr/manifests/group.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,3 +125,36 @@ def to_virtual_dataset(self) -> xr.Dataset:
coord_names=coord_names,
attrs=attributes,
)

def to_virtual_datasets(self) -> Mapping[str, xr.Dataset]:
"""
Create a dictionary containing virtual datasets for all the sub-groups of a ManifestGroup. All the
variables in the datasets will be "virtual", i.e., they will wrap ManifestArray objects.

It is convenient to have a separate `to_virtual_datasets` function from `to_virtual_datatree` so that
it can be called recursively without needing to use `DataTree.to_dict() and `.from_dict()` repeatedly.
"""
result = {"": self.to_virtual_dataset()}

# Recursively process all subgroups
for group_name, subgroup in self.groups.items():
subgroup_datasets = subgroup.to_virtual_datasets()

# Add the subgroup's datasets with proper path prefixes
for subpath, dataset in subgroup_datasets.items():
if subpath == "":
# Direct child group
full_path = group_name
else:
# Nested subgroup
full_path = f"{group_name}/{subpath}"
result[full_path] = dataset
return result

def to_virtual_datatree(self) -> xr.DataTree:
"""
Create a "virtual" [xarray.DataTree][] containing the contents of one zarr group.

All variables in the returned DataTree will be "virtual", i.e. they will wrap ManifestArray objects.
"""
return xr.DataTree.from_dict(self.to_virtual_datasets())
40 changes: 40 additions & 0 deletions virtualizarr/manifests/store.py
Original file line number Diff line number Diff line change
Expand Up @@ -332,6 +332,46 @@ def to_virtual_dataset(
decode_times=decode_times,
)

    def to_virtual_datatree(
        self,
        group: str = "",
        *,
        drop_variables: Iterable[str] | None = None,
        loadable_variables: Iterable[str] | None = None,
        decode_times: bool | None = None,
    ) -> "xr.DataTree":
        """
        Create a "virtual" [xarray.DataTree][] containing the contents of a zarr group. Default is the root group and all sub-groups.

        Will ignore the contents of any other groups in the store.

        Requires xarray.

        Parameters
        ----------
        group
            Path of the group to convert to a virtual DataTree ("" means the root group).
        drop_variables
            Variables in the data source to drop before returning.
        loadable_variables
            Variables in the data source to load as Dask/NumPy arrays instead of as virtual arrays.
        decode_times
            Bool that is passed into [xarray.open_dataset][]. Allows time to be decoded into a datetime object.

        Returns
        -------
        vdt : xarray.DataTree
        """

        # NOTE(review): function-scope import — presumably avoids a circular
        # import between virtualizarr.manifests.store and virtualizarr.xarray;
        # confirm before hoisting to module level.
        from virtualizarr.xarray import construct_virtual_datatree

        return construct_virtual_datatree(
            manifest_store=self,
            group=group,
            loadable_variables=loadable_variables,
            decode_times=decode_times,
            drop_variables=drop_variables,
        )


def _transform_byte_range(
byte_range: ByteRequest | None, *, chunk_start: int, chunk_end_exclusive: int
Expand Down
24 changes: 21 additions & 3 deletions virtualizarr/tests/test_parsers/test_tiff.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import pytest
from obstore.store import S3Store
from xarray import Dataset
from xarray import Dataset, DataTree

from virtualizarr import open_virtual_dataset
from virtualizarr import open_virtual_dataset, open_virtual_datatree
from virtualizarr.registry import ObjectStoreRegistry
from virtualizarr.tests import requires_network, requires_tiff

Expand All @@ -11,7 +11,25 @@

@requires_tiff
@requires_network
def test_virtual_tiff_datatree() -> None:
    """Open a public Sentinel-2 COG as a virtual DataTree and inspect two IFD levels."""
    bucket = S3Store("sentinel-cogs", region="us-west-2", skip_signature=True)
    registry = ObjectStoreRegistry({"s3://sentinel-cogs/": bucket})
    url = "s3://sentinel-cogs/sentinel-s2-l2a-cogs/12/S/UF/2022/6/S2B_12SUF_20220609_0_L2A/B04.tif"
    parser = virtual_tiff.VirtualTIFF(ifd_layout="nested")
    with open_virtual_datatree(url=url, parser=parser, registry=registry) as vdt:
        assert isinstance(vdt, DataTree)
        assert list(vdt["0"].ds.variables) == ["0"]
        full_res = vdt["0"].ds["0"].variable
        assert full_res.sizes == {"y": 10980, "x": 10980}
        assert full_res.dtype == "<u2"
        overview = vdt["1"].ds["1"].variable
        assert overview.sizes == {"y": 5490, "x": 5490}
        assert overview.dtype == "<u2"


@requires_tiff
@requires_network
def test_virtual_tiff_dataset() -> None:
store = S3Store("sentinel-cogs", region="us-west-2", skip_signature=True)
registry = ObjectStoreRegistry({"s3://sentinel-cogs/": store})
url = "s3://sentinel-cogs/sentinel-s2-l2a-cogs/12/S/UF/2022/6/S2B_12SUF_20220609_0_L2A/B04.tif"
Expand Down
58 changes: 56 additions & 2 deletions virtualizarr/tests/test_xarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,14 @@
import pytest
import xarray as xr
import xarray.testing as xrt
from xarray import Dataset, open_dataset
from xarray import Dataset, open_dataset, open_datatree
from xarray.core.indexes import Index

from virtualizarr import open_virtual_dataset, open_virtual_mfdataset
from virtualizarr import (
open_virtual_dataset,
open_virtual_datatree,
open_virtual_mfdataset,
)
from virtualizarr.manifests import ChunkManifest, ManifestArray
from virtualizarr.parsers import HDFParser
from virtualizarr.registry import ObjectStoreRegistry
Expand Down Expand Up @@ -655,6 +659,56 @@ def test_open_subgroup(
assert isinstance(vds["bar"].data, ManifestArray)
assert vds["bar"].shape == (2,)

def test_open_virtual_datatree_raises(
self, netcdf4_file_with_data_in_multiple_groups, local_registry
):
parser = HDFParser()
with pytest.raises(
ValueError, match="group '/subgroup' is not aligned with its parents"
):
open_virtual_datatree(
url=netcdf4_file_with_data_in_multiple_groups,
registry=local_registry,
parser=parser,
)

def test_open_virtual_datatree(
self, netcdf4_file_with_data_in_sibling_groups, local_registry
):
with (
open_virtual_datatree(
url=netcdf4_file_with_data_in_sibling_groups,
registry=local_registry,
parser=HDFParser(),
) as vdt,
open_datatree(
netcdf4_file_with_data_in_sibling_groups, engine="h5netcdf"
) as dt,
):
vdt.isomorphic(dt)
assert list(vdt["/subgroup1"].variables) == ["foo"]
assert isinstance(vdt["/subgroup1"]["foo"].data, ManifestArray)
assert vdt["/subgroup1"]["foo"].shape == (3,)
assert list(vdt["/subgroup2"].variables) == ["bar"]
assert isinstance(vdt["/subgroup2"]["bar"].data, ManifestArray)
assert vdt["/subgroup2"]["bar"].shape == (2,)

def test_open_virtual_datatree_all_vars_loaded(
self, netcdf4_file_with_data_in_sibling_groups, local_registry
):
with (
open_virtual_datatree(
url=netcdf4_file_with_data_in_sibling_groups,
registry=local_registry,
parser=HDFParser(),
loadable_variables=["foo", "bar"],
) as vdt,
open_datatree(
netcdf4_file_with_data_in_sibling_groups, engine="h5netcdf"
) as dt,
):
xr.testing.assert_allclose(vdt, dt)

@pytest.mark.parametrize("group", ["", None])
def test_open_root_group(
self, netcdf4_file_with_data_in_multiple_groups, group, local_registry
Expand Down
104 changes: 101 additions & 3 deletions virtualizarr/xarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from xarray.core.types import NestedSequence
from xarray.structure.combine import _infer_concat_order_from_positions, _nested_combine

from virtualizarr.manifests import ManifestStore
from virtualizarr.manifests import ManifestArray, ManifestGroup, ManifestStore
from virtualizarr.manifests.manifest import validate_and_normalize_path_to_uri
from virtualizarr.parallel import get_executor
from virtualizarr.parsers.typing import Parser
Expand All @@ -35,6 +35,68 @@
)


def open_virtual_datatree(
    url: str,
    registry: ObjectStoreRegistry,
    parser: Parser,
    *,
    drop_variables: Iterable[str] | None = None,
    loadable_variables: Iterable[str] | None = None,
    decode_times: bool | None = None,
) -> xr.DataTree:
    """
    Open an archival data source as an [xarray.DataTree][] wrapping virtualized zarr arrays.

    No data variables will be loaded unless specified in the ``loadable_variables`` kwarg (in which case they will open as lazily indexed arrays using xarray's standard lazy indexing classes).

    Parameters
    ----------
    url
        The url of the data source to virtualize. The URL should include a scheme. For example:

        - `url="file:///Users/my-name/Documents/my-project/my-data.nc"` for a local data source.
        - `url="s3://my-bucket/my-project/my-data.nc"` for a remote data source on an S3 compatible cloud.

    registry
        An [ObjectStoreRegistry][virtualizarr.registry.ObjectStoreRegistry] for resolving urls and reading data.
    parser
        A parser to use for the given data source. For example:

        - [virtualizarr.parsers.HDFParser][] for virtualizing NetCDF4 or HDF5 files.
        - [virtualizarr.parsers.FITSParser][] for virtualizing FITS files.
        - [virtualizarr.parsers.NetCDF3Parser][] for virtualizing NetCDF3 files.
        - [virtualizarr.parsers.KerchunkJSONParser][] for re-opening Kerchunk JSONs.
        - [virtualizarr.parsers.KerchunkParquetParser][] for re-opening Kerchunk Parquets.
        - [virtualizarr.parsers.ZarrParser][] for virtualizing Zarr stores.
        - [virtual_tiff.VirtualTIFF][] for virtualizing TIFFs.

    drop_variables
        Variables in the data source to drop before returning.
    loadable_variables
        Variables in the data source to load as Dask/NumPy arrays instead of as virtual arrays.
    decode_times
        Bool that is passed into [xarray.open_dataset][]. Allows time to be decoded into a datetime object.

    Returns
    -------
    vdt
        An [xarray.DataTree][] containing virtual chunk references for all variables.
    """
    # Normalize relative/local paths into a fully-qualified URI before handing
    # off to the parser.
    filepath = validate_and_normalize_path_to_uri(url, fs_root=Path.cwd().as_uri())

    manifest_store = parser(
        url=filepath,
        registry=registry,
    )

    return manifest_store.to_virtual_datatree(
        loadable_variables=loadable_variables,
        decode_times=decode_times,
        drop_variables=drop_variables,
    )


def open_virtual_dataset(
url: str,
registry: ObjectStoreRegistry,
Expand All @@ -48,8 +110,6 @@ def open_virtual_dataset(

No data variables will be loaded unless specified in the ``loadable_variables`` kwarg (in which case they will open as lazily indexed arrays using xarray's standard lazy indexing classes).

Xarray indexes can optionally be created (the default behaviour is to create indexes for any 1D coordinate variables). To avoid creating any xarray indexes pass ``indexes={}``.

Parameters
----------
url
Expand All @@ -69,6 +129,7 @@ def open_virtual_dataset(
- [virtualizarr.parsers.KerchunkJSONParser][] for re-opening Kerchunk JSONs.
- [virtualizarr.parsers.KerchunkParquetParser][] for re-opening Kerchunk Parquets.
- [virtualizarr.parsers.ZarrParser][] for virtualizing Zarr stores.
- [virtual_tiff.VirtualTIFF][] for virtualizing TIFFs.

drop_variables
Variables in the data source to drop before returning.
Expand Down Expand Up @@ -354,6 +415,43 @@ def construct_virtual_dataset(
)


def construct_virtual_datatree(
    manifest_store: ManifestStore,
    group: str = "",
    *,
    drop_variables: Iterable[str] | None = None,
    loadable_variables: Iterable[str] | None = None,
    decode_times: bool | None = None,
) -> xr.DataTree:
    """
    Construct a fully or partly virtual datatree from a ManifestStore.

    Parameters
    ----------
    manifest_store
        The ManifestStore whose group hierarchy is converted into a DataTree.
    group
        Path of the group to convert ("" selects the store's root group).
    drop_variables
        Variables to drop from each node's dataset before returning.
    loadable_variables
        Variables to materialize as loadable arrays instead of virtual ones;
        applied per-node via ``replace_virtual_with_loadable_vars``.
    decode_times
        Passed through to [xarray.open_datatree][] for time decoding.

    Returns
    -------
    xr.DataTree
        Tree whose nodes mix virtual (ManifestArray-backed) and loadable variables.
    """
    # Resolve the requested node within the store's group hierarchy.
    node = manifest_store._group[group] if group else manifest_store._group

    # If the path points at a single array, wrap it in a synthetic group so the
    # rest of the function can treat it uniformly.
    if isinstance(node, ManifestArray):
        node = ManifestGroup(arrays={group: node}, attributes={})

    # Open the same store through the zarr engine to obtain loadable (lazily
    # indexed) counterparts for every variable.
    fully_loadable_datatree = xr.open_datatree(
        manifest_store,  # type: ignore[arg-type]
        group=group,
        engine="zarr",
        consolidated=False,
        zarr_format=3,
        decode_times=decode_times,
    )

    # For each node, swap the requested loadable_variables into the otherwise
    # fully-virtual dataset, then drop any unwanted variables.
    # NOTE(review): `name` keys from subtree_with_keys are assumed to match the
    # paths accepted by fully_loadable_datatree[...] — confirm for deep nesting.
    partially_loaded_datasets = {
        name: replace_virtual_with_loadable_vars(
            virtual_node.to_dataset(),
            fully_loadable_datatree[name].to_dataset(),
            loadable_variables,
        ).drop_vars(list(drop_variables or ()))
        for name, virtual_node in node.to_virtual_datatree().subtree_with_keys
    }

    return xr.DataTree.from_dict(partially_loaded_datasets)


def replace_virtual_with_loadable_vars(
fully_virtual_ds: xr.Dataset,
loadable_ds: xr.Dataset,
Expand Down
Loading