-
Notifications
You must be signed in to change notification settings - Fork 54
Feat: add top-level open_virtual_datatree #838
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 15 commits
439accd
24e29f3
da2f601
1e1db55
437a5d8
5c35e1b
e563aa0
0f45437
0da38c8
e1b5009
a03f0fc
62b0e99
7fc4669
3132005
59e61e7
0e10381
9b91ad3
9e20aab
9a203fc
556f5f5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,7 +21,7 @@ | |
| from xarray.core.types import NestedSequence | ||
| from xarray.structure.combine import _infer_concat_order_from_positions, _nested_combine | ||
|
|
||
| from virtualizarr.manifests import ManifestStore | ||
| from virtualizarr.manifests import ManifestArray, ManifestGroup, ManifestStore | ||
| from virtualizarr.manifests.manifest import validate_and_normalize_path_to_uri | ||
| from virtualizarr.parallel import get_executor | ||
| from virtualizarr.parsers.typing import Parser | ||
|
|
@@ -35,6 +35,68 @@ | |
| ) | ||
|
|
||
|
|
||
def open_virtual_datatree(
    url: str,
    registry: ObjectStoreRegistry,
    parser: Parser,
    *,
    drop_variables: Iterable[str] | None = None,
    loadable_variables: Iterable[str] | None = None,
    decode_times: bool | None = None,
) -> xr.DataTree:
    """
    Open an archival data source as an [xarray.DataTree][] wrapping virtualized zarr arrays.

    Data variables stay virtual unless they are named in ``loadable_variables``, in which
    case they open as lazily indexed arrays using xarray's standard lazy indexing classes.

    Parameters
    ----------
    url
        The url of the data source to virtualize. The URL should include a scheme. For example:

        - `url="file:///Users/my-name/Documents/my-project/my-data.nc"` for a local data source.
        - `url="s3://my-bucket/my-project/my-data.nc"` for a remote data source on an S3 compatible cloud.

    registry
        An [ObjectStoreRegistry][virtualizarr.registry.ObjectStoreRegistry] for resolving urls and reading data.
    parser
        A parser to use for the given data source. It is called as
        ``parser(url=..., registry=...)`` and its result must provide
        ``to_virtual_datatree``. For example:

        - [virtualizarr.parsers.HDFParser][] for virtualizing NetCDF4 or HDF5 files.
        - [virtualizarr.parsers.FITSParser][] for virtualizing FITS files.
        - [virtualizarr.parsers.NetCDF3Parser][] for virtualizing NetCDF3 files.
        - [virtualizarr.parsers.KerchunkJSONParser][] for re-opening Kerchunk JSONs.
        - [virtualizarr.parsers.KerchunkParquetParser][] for re-opening Kerchunk Parquets.
        - [virtualizarr.parsers.ZarrParser][] for virtualizing Zarr stores.
        - [virtual_tiff.VirtualTIFF][] for virtualizing TIFFs.

    drop_variables
        Variables in the data source to drop before returning.
    loadable_variables
        Variables in the data source to load as Dask/NumPy arrays instead of as virtual arrays.
    decode_times
        Bool forwarded to xarray when opening the data. Allows time to be decoded into a datetime object.

    Returns
    -------
    vdt
        An [xarray.DataTree][] containing virtual chunk references for all variables.
    """
    # The current working directory is handed over as ``fs_root``
    # (presumably so scheme-less/relative paths can be resolved -- confirm in
    # validate_and_normalize_path_to_uri).
    normalized_url = validate_and_normalize_path_to_uri(
        url,
        fs_root=Path.cwd().as_uri(),
    )

    # The parser turns the data source into a manifest store of chunk references.
    store = parser(url=normalized_url, registry=registry)

    tree_options = {
        "loadable_variables": loadable_variables,
        "decode_times": decode_times,
        "drop_variables": drop_variables,
    }
    return store.to_virtual_datatree(**tree_options)
|
|
||
|
|
||
| def open_virtual_dataset( | ||
| url: str, | ||
| registry: ObjectStoreRegistry, | ||
|
|
@@ -48,8 +110,6 @@ def open_virtual_dataset( | |
|
|
||
| No data variables will be loaded unless specified in the ``loadable_variables`` kwarg (in which case they will open as lazily indexed arrays using xarray's standard lazy indexing classes). | ||
|
|
||
| Xarray indexes can optionally be created (the default behaviour is to create indexes for any 1D coordinate variables). To avoid creating any xarray indexes pass ``indexes={}``. | ||
|
|
||
| Parameters | ||
| ---------- | ||
| url | ||
|
|
@@ -69,6 +129,7 @@ def open_virtual_dataset( | |
| - [virtualizarr.parsers.KerchunkJSONParser][] for re-opening Kerchunk JSONs. | ||
| - [virtualizarr.parsers.KerchunkParquetParser][] for re-opening Kerchunk Parquets. | ||
| - [virtualizarr.parsers.ZarrParser][] for virtualizing Zarr stores. | ||
| - [virtual_tiff.VirtualTIFF][] for virtualizing TIFFs. | ||
|
|
||
| drop_variables | ||
| Variables in the data source to drop before returning. | ||
|
|
@@ -354,6 +415,43 @@ def construct_virtual_dataset( | |
| ) | ||
|
|
||
|
|
||
def construct_virtual_datatree(
    manifest_store: ManifestStore,
    group: str = "",
    *,
    drop_variables: Iterable[str] | None = None,
    loadable_variables: Iterable[str] | None = None,
    decode_times: bool | None = None,
) -> xr.DataTree:
    """
    Construct a fully or partly virtual datatree from a ManifestStore.

    Every node of the virtual tree is paired with the node of the same key in a
    tree opened through xarray's zarr engine, and the two are combined with
    ``replace_virtual_with_loadable_vars``.

    Parameters
    ----------
    manifest_store
        The store holding the manifest group(s) to convert.
    group
        Path of the group inside the store to use as the tree root. The empty
        string (default) selects the store's root group.
    drop_variables
        Names of variables to drop from every node of the tree. A name that is
        absent from a given node is skipped for that node rather than raising,
        since a variable usually lives in only some groups of a tree.
    loadable_variables
        Forwarded unchanged to ``replace_virtual_with_loadable_vars`` for each node.
    decode_times
        Forwarded to ``xr.open_datatree``.

    Returns
    -------
    xr.DataTree
        A tree built from the per-node combined datasets.
    """
    node = manifest_store._group[group] if group else manifest_store._group

    if isinstance(node, ManifestArray):
        # A lone array is wrapped in a group so it can be converted like any other node.
        node = ManifestGroup(arrays={group: node}, attributes={})

    fully_loadable_datatree = xr.open_datatree(
        manifest_store,  # type: ignore[arg-type]
        group=group,
        engine="zarr",
        consolidated=False,
        zarr_format=3,
        decode_times=decode_times,
    )

    # Materialize once, outside the comprehension: a one-shot iterable (e.g. a
    # generator) would otherwise be exhausted after the first node and silently
    # leave the variables in place on every later node.
    names_to_drop = list(drop_variables or ())

    # NOTE(review): `loadable_variables` is passed as-is for every node; if the
    # helper iterates it, a one-shot iterator would have the same exhaustion
    # problem -- confirm against replace_virtual_with_loadable_vars.
    partially_loaded_datasets = {
        name: replace_virtual_with_loadable_vars(
            virtual_node.to_dataset(),
            fully_loadable_datatree[name].to_dataset(),
            loadable_variables,
            # errors="ignore": a dropped name need not exist in every node.
        ).drop_vars(names_to_drop, errors="ignore")
        for name, virtual_node in node.to_virtual_datatree().subtree_with_keys
    }

    return xr.DataTree.from_dict(partially_loaded_datasets)
|
|
||
|
|
||
| def replace_virtual_with_loadable_vars( | ||
| fully_virtual_ds: xr.Dataset, | ||
| loadable_ds: xr.Dataset, | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.