Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ upstream = [
's3fs @ git+https://github.com/fsspec/s3fs',
'kerchunk @ git+https://github.com/fsspec/kerchunk',
'icechunk @ git+https://github.com/earth-mover/icechunk#subdirectory=icechunk-python',
'virtual_tiff @ git+https://github.com/virtual-zarr/virtual-tiff',
]
docs = [
"mkdocs-material[imaging]>=9.6.14",
Expand Down
7 changes: 6 additions & 1 deletion virtualizarr/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,11 @@
VirtualiZarrDatasetAccessor,
VirtualiZarrDataTreeAccessor,
)
from virtualizarr.xarray import open_virtual_dataset, open_virtual_mfdataset
from virtualizarr.xarray import (
open_virtual_dataset,
open_virtual_datatree,
open_virtual_mfdataset,
)

try:
__version__ = _version("virtualizarr")
Expand All @@ -18,4 +22,5 @@
"VirtualiZarrDataTreeAccessor",
"open_virtual_dataset",
"open_virtual_mfdataset",
"open_virtual_datatree",
]
34 changes: 34 additions & 0 deletions virtualizarr/manifests/group.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,3 +125,37 @@ def to_virtual_dataset(self) -> xr.Dataset:
coord_names=coord_names,
attrs=attributes,
)

def to_virtual_datasets(self) -> dict[str, xr.Dataset]:
    """
    Recursively build a mapping from group path to virtual [xarray.Dataset][].

    The root group maps from the empty string ``""``; every descendant group
    maps from its slash-separated path relative to this group. All variables
    in the returned datasets are "virtual", i.e. they wrap ManifestArray
    objects.

    Keeping this separate from `to_virtual_datatree` lets the recursion work
    on plain dictionaries instead of repeatedly round-tripping through
    `DataTree.to_dict()` and `DataTree.from_dict()`.
    """
    datasets: dict[str, xr.Dataset] = {"": self.to_virtual_dataset()}

    for child_name, child_group in self.groups.items():
        # Prefix each descendant's relative path with this child's name;
        # the child's own root dataset (key "") becomes the child name itself.
        for relative_path, ds in child_group.to_virtual_datasets().items():
            key = child_name if not relative_path else f"{child_name}/{relative_path}"
            datasets[key] = ds

    return datasets

def to_virtual_datatree(self) -> xr.DataTree:
    """
    Create a "virtual" [xarray.DataTree][] containing the contents of one zarr group.

    All variables in the returned DataTree will be "virtual", i.e. they will
    wrap ManifestArray objects rather than load any chunk data.
    """
    # Build one virtual dataset per (sub)group, keyed by relative path, and
    # let xarray assemble the tree structure from that mapping.
    return xr.DataTree.from_dict(self.to_virtual_datasets())
40 changes: 40 additions & 0 deletions virtualizarr/manifests/store.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,6 +327,46 @@ def to_virtual_dataset(
decode_times=decode_times,
)

def to_virtual_datatree(
    self,
    group: str = "",
    *,
    drop_variables: Iterable[str] | None = None,
    loadable_variables: Iterable[str] | None = None,
    decode_times: bool | None = None,
) -> "xr.DataTree":
    """
    Create a "virtual" [xarray.DataTree][] containing the contents of a zarr group.
    Default is the root group and all sub-groups.

    Will ignore the contents of any other groups in the store.

    Requires xarray.

    Parameters
    ----------
    group
        Path of the group to convert to a virtual DataTree. Defaults to the root group.
    drop_variables
        Variables in the data source to drop before returning.
    loadable_variables
        Variables in the data source to load as Dask/NumPy arrays instead of as virtual arrays.
    decode_times
        Bool that is passed into [xarray.open_dataset][]. Allows time to be decoded into a datetime object.

    Returns
    -------
    vdt : xarray.DataTree
    """
    # Imported lazily so the manifests module does not require xarray unless
    # this conversion is actually requested.
    from virtualizarr.xarray import construct_virtual_datatree

    return construct_virtual_datatree(
        manifest_store=self,
        group=group,
        loadable_variables=loadable_variables,
        decode_times=decode_times,
        drop_variables=drop_variables,
    )


def _transform_byte_range(
byte_range: ByteRequest | None, *, chunk_start: int, chunk_end_exclusive: int
Expand Down
24 changes: 21 additions & 3 deletions virtualizarr/tests/test_parsers/test_tiff.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import pytest
from obstore.store import S3Store
from xarray import Dataset
from xarray import Dataset, DataTree

from virtualizarr import open_virtual_dataset
from virtualizarr import open_virtual_dataset, open_virtual_datatree
from virtualizarr.registry import ObjectStoreRegistry
from virtualizarr.tests import requires_network, requires_tiff

Expand All @@ -11,7 +11,25 @@

@requires_tiff
@requires_network
def test_virtual_tiff() -> None:
def test_virtual_tiff_datatree() -> None:
    # Virtualize a public Sentinel-2 COG with nested IFD layout: each TIFF
    # overview level should become its own DataTree node.
    bucket = S3Store("sentinel-cogs", region="us-west-2", skip_signature=True)
    registry = ObjectStoreRegistry({"s3://sentinel-cogs/": bucket})
    url = "s3://sentinel-cogs/sentinel-s2-l2a-cogs/12/S/UF/2022/6/S2B_12SUF_20220609_0_L2A/B04.tif"
    parser = virtual_tiff.VirtualTIFF(ifd_layout="nested")
    with open_virtual_datatree(url=url, parser=parser, registry=registry) as tree:
        assert isinstance(tree, DataTree)
        assert list(tree["0"].ds.variables) == ["0"]
        full_res = tree["0"].ds["0"].variable
        assert full_res.sizes == {"y": 10980, "x": 10980}
        assert full_res.dtype == "<u2"
        overview = tree["1"].ds["1"].variable
        assert overview.sizes == {"y": 5490, "x": 5490}
        assert overview.dtype == "<u2"


@requires_tiff
@requires_network
def test_virtual_tiff_dataset() -> None:
store = S3Store("sentinel-cogs", region="us-west-2", skip_signature=True)
registry = ObjectStoreRegistry({"s3://sentinel-cogs/": store})
url = "s3://sentinel-cogs/sentinel-s2-l2a-cogs/12/S/UF/2022/6/S2B_12SUF_20220609_0_L2A/B04.tif"
Expand Down
102 changes: 101 additions & 1 deletion virtualizarr/xarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from xarray.core.types import NestedSequence
from xarray.structure.combine import _infer_concat_order_from_positions, _nested_combine

from virtualizarr.manifests import ManifestStore
from virtualizarr.manifests import ManifestArray, ManifestGroup, ManifestStore
from virtualizarr.manifests.manifest import validate_and_normalize_path_to_uri
from virtualizarr.parallel import get_executor
from virtualizarr.parsers.typing import Parser
Expand All @@ -35,6 +35,68 @@
)


def open_virtual_datatree(
    url: str,
    registry: ObjectStoreRegistry,
    parser: Parser,
    *,
    drop_variables: Iterable[str] | None = None,
    loadable_variables: Iterable[str] | None = None,
    decode_times: bool | None = None,
) -> xr.DataTree:
    """
    Open an archival data source as an [xarray.DataTree][] wrapping virtualized zarr arrays.

    No data variables will be loaded unless specified in the ``loadable_variables`` kwarg (in which case they will open as lazily indexed arrays using xarray's standard lazy indexing classes).

    Parameters
    ----------
    url
        The url of the data source to virtualize. The URL should include a scheme. For example:

        - `url="file:///Users/my-name/Documents/my-project/my-data.nc"` for a local data source.
        - `url="s3://my-bucket/my-project/my-data.nc"` for a remote data source on an S3 compatible cloud.

    registry
        An [ObjectStoreRegistry][virtualizarr.registry.ObjectStoreRegistry] for resolving urls and reading data.
    parser
        A parser to use for the given data source. For example:

        - [virtualizarr.parsers.HDFParser][] for virtualizing NetCDF4 or HDF5 files.
        - [virtualizarr.parsers.FITSParser][] for virtualizing FITS files.
        - [virtualizarr.parsers.NetCDF3Parser][] for virtualizing NetCDF3 files.
        - [virtualizarr.parsers.KerchunkJSONParser][] for re-opening Kerchunk JSONs.
        - [virtualizarr.parsers.KerchunkParquetParser][] for re-opening Kerchunk Parquets.
        - [virtualizarr.parsers.ZarrParser][] for virtualizing Zarr stores.
    drop_variables
        Variables in the data source to drop before returning.
    loadable_variables
        Variables in the data source to load as Dask/NumPy arrays instead of as virtual arrays.
    decode_times
        Bool that is passed into [xarray.open_dataset][]. Allows time to be decoded into a datetime object.

    Returns
    -------
    vdt
        An [xarray.DataTree][] containing virtual chunk references for all variables.
    """
    # Normalize e.g. bare local paths into proper file:// URIs before handing
    # them to the parser.
    filepath = validate_and_normalize_path_to_uri(url, fs_root=Path.cwd().as_uri())

    # The parser produces a ManifestStore describing where every chunk lives.
    manifest_store = parser(
        url=filepath,
        registry=registry,
    )

    return manifest_store.to_virtual_datatree(
        loadable_variables=loadable_variables,
        decode_times=decode_times,
        drop_variables=drop_variables,
    )


def open_virtual_dataset(
url: str,
registry: ObjectStoreRegistry,
Expand Down Expand Up @@ -354,6 +416,44 @@ def construct_virtual_dataset(
)


def construct_virtual_datatree(
    manifest_store: ManifestStore,
    group: str = "",
    *,
    drop_variables: Iterable[str] | None = None,
    loadable_variables: Iterable[str] | None = None,
    decode_times: bool | None = None,
) -> xr.DataTree:
    """
    Construct a fully or partly virtual datatree from a ManifestStore.
    """
    # Open the whole (sub)tree through xarray's zarr engine so that any
    # ``loadable_variables`` come back as lazily indexed arrays.
    loadable_tree = xr.open_datatree(
        manifest_store,  # type: ignore[arg-type]
        group=group,
        engine="zarr",
        consolidated=False,
        zarr_format=3,
        decode_times=decode_times,
    )

    # Locate the corresponding manifest node; "" addresses the root group.
    root = manifest_store._group[group] if group else manifest_store._group
    if isinstance(root, ManifestArray):
        # ``group`` addressed a single array — wrap it in a synthetic group so
        # it can be turned into a tree. NOTE(review): the array is keyed by the
        # full ``group`` path here; confirm callers never pass a nested
        # ("a/b") path, which would not be a valid array name.
        root = ManifestGroup(arrays={group: root}, attributes={})
    virtual_tree = root.to_virtual_datatree()

    # For every node, splice the loadable variables into the otherwise fully
    # virtual dataset, then drop whatever the caller asked to exclude.
    merged_datasets: dict[str, xr.Dataset] = {}
    for path, virtual_node in virtual_tree.subtree_with_keys:
        node_ds = replace_virtual_with_loadable_vars(
            virtual_node.to_dataset(),
            loadable_tree[path].to_dataset(),
            loadable_variables,
        )
        merged_datasets[path] = node_ds.drop_vars(list(drop_variables or ()))
    return xr.DataTree.from_dict(merged_datasets)


def replace_virtual_with_loadable_vars(
fully_virtual_ds: xr.Dataset,
loadable_ds: xr.Dataset,
Expand Down
Loading