Merged
63 commits
99a99aa
draft refactor
TomNicholas Mar 6, 2025
feadc32
sketch of simplified handling of loadable_variables
TomNicholas Mar 6, 2025
b6e0242
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 6, 2025
f284c5a
get at least some tests working
TomNicholas Mar 7, 2025
7d50f8e
separate VirtualBackend api definition from common utilities
TomNicholas Mar 7, 2025
618da43
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 7, 2025
7f0ee4d
remove indexes={} everywhere in tests
TomNicholas Mar 7, 2025
6e020d3
Merge branch 'refactor_loadable_variables' of https://github.com/TomN…
TomNicholas Mar 7, 2025
f85c9d9
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 7, 2025
03e50fc
stop passing through loadable_variables to where it isn't used
TomNicholas Mar 10, 2025
cc34f77
implement logic to load 1D dimension coords by default
TomNicholas Mar 10, 2025
eefbcc8
remove more instances of indexes={}
TomNicholas Mar 10, 2025
e05a640
remove more indexes={}
TomNicholas Mar 12, 2025
1106a40
refactor logic for choosing loadable_variables
TomNicholas Mar 20, 2025
dd0c947
fix more tets
TomNicholas Mar 20, 2025
b3f445a
xfail Aimee's test that I don't understand
TomNicholas Mar 20, 2025
be3429f
xfail test that explicitly specifies no indexes
TomNicholas Mar 20, 2025
ce4ea4e
made a bunch more stuff pass
TomNicholas Mar 20, 2025
bb23c7b
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 20, 2025
1ea236f
fix netcdf3 reader
TomNicholas Mar 20, 2025
91b4dd2
Merge branch 'refactor_loadable_variables' of https://github.com/TomN…
TomNicholas Mar 20, 2025
30d020e
fix bad import in FITS reader
TomNicholas Mar 20, 2025
8a76cfe
fix import in tiff reader
TomNicholas Mar 20, 2025
8b0e7ae
fix import in icechunk test
TomNicholas Mar 20, 2025
ae5f480
Merge branch 'main' into refactor_loadable_variables
TomNicholas Mar 20, 2025
9b01010
release note
TomNicholas Mar 20, 2025
ef0ab78
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 20, 2025
80faea6
update docstring
TomNicholas Mar 20, 2025
c85538a
Merge branch 'refactor_loadable_variables' of https://github.com/TomN…
TomNicholas Mar 20, 2025
8a5fc65
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 20, 2025
0162bac
fix fits reader
TomNicholas Mar 20, 2025
2cea749
xfail on empty dict for indexes
TomNicholas Mar 20, 2025
216edbd
linting
TomNicholas Mar 20, 2025
fbcd127
Merge branch 'refactor_loadable_variables' of https://github.com/TomN…
TomNicholas Mar 20, 2025
e14f173
actually test new expected behaviour
TomNicholas Mar 20, 2025
97e1f74
fix logic for setting loadable_variables
TomNicholas Mar 21, 2025
77a7227
update docs page to reflect new behaviour
TomNicholas Mar 21, 2025
c2ecd88
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 21, 2025
d31a9fd
fix expected behaviour in another tests
TomNicholas Mar 21, 2025
6fa03f8
additional assert
TomNicholas Mar 21, 2025
eb773b7
Merge branch 'develop' into refactor_loadable_variables
maxrjones Mar 21, 2025
e2c3a79
Merge branch 'develop' into refactor_loadable_variables
maxrjones Mar 21, 2025
2859052
Merge branch 'refactor_loadable_variables' of https://github.com/TomN…
TomNicholas Mar 21, 2025
6bfcf1c
Merge branch 'develop' into refactor_loadable_variables
TomNicholas Mar 21, 2025
5820895
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 21, 2025
6c049f0
use encode_dataset_coordinates in kerchunk writer
TomNicholas Mar 21, 2025
ef4fa81
Merge branch 'develop' into refactor_loadable_variables
TomNicholas Mar 21, 2025
7729a33
Encode zarr vars
maxrjones Mar 21, 2025
51fa125
Merge pull request #1 from maxrjones/fixup
TomNicholas Mar 21, 2025
a00c097
fix some mypy errors
TomNicholas Mar 21, 2025
0d6fb40
move drop_variables implmentation to the end of every reader
TomNicholas Mar 21, 2025
d15ed90
override loadable_variables and raise warning
TomNicholas Mar 21, 2025
4ccbb5b
fix failing test by not creating loadable variables that would get in…
TomNicholas Mar 21, 2025
7992c08
improve error message
TomNicholas Mar 21, 2025
f20af13
remove some more occurrences of indexes={}
TomNicholas Mar 21, 2025
33d45c2
skip slow test
TomNicholas Mar 21, 2025
917a973
slay mypy errors
TomNicholas Mar 21, 2025
94f3d4f
docs typos
TomNicholas Mar 21, 2025
72beb94
should fix dmrpp test
TomNicholas Mar 21, 2025
83cb2a5
Merge branch 'refactor_loadable_variables' of https://github.com/TomN…
TomNicholas Mar 21, 2025
8a436f2
Delete commented-out code
TomNicholas Mar 24, 2025
9470b97
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 24, 2025
6bf2615
remove unecessary test skip
TomNicholas Mar 24, 2025
2 changes: 1 addition & 1 deletion conftest.py
@@ -195,7 +195,7 @@ def netcdf4_virtual_dataset(netcdf4_file):
"""Create a virtual dataset from a NetCDF4 file."""
from virtualizarr import open_virtual_dataset

with open_virtual_dataset(netcdf4_file, indexes={}) as ds:
with open_virtual_dataset(netcdf4_file, loadable_variables=[]) as ds:
TomNicholas (Member, Author) commented:
Required otherwise we get inlined variables in the kerchunk file which we don't know how to read (#489)

yield ds


10 changes: 10 additions & 0 deletions docs/releases.rst
@@ -12,6 +12,16 @@ New Features
Breaking changes
~~~~~~~~~~~~~~~~

- The set of variables which are loadable by default has changed. By default, the loadable variables are now the
same variables for which `xarray.open_dataset` would create indexes: i.e. one-dimensional coordinate variables whose
name matches the name of their only dimension (also known as "dimension coordinates").
Pandas indexes will also now be created by default for these loadable variables.
This is intended to provide a more friendly default, as often you will want these small variables to be loaded
(or "inlined", for efficiency of storage in icechunk/kerchunk), and you will also want to have in-memory indexes for these variables
(to allow `xarray.combine_by_coords` to sort using them).
The old behaviour is equivalent to passing ``loadable_variables=[]`` and ``indexes={}``.
(:issue:`335`, :pull:`477`) by `Tom Nicholas <https://github.com/TomNicholas>`_.
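
As a sketch of the behavioural change described above (using the ``air.nc`` example file from the docs):

```python
from virtualizarr import open_virtual_dataset

# New default: 1D dimension coordinates are loaded into memory and given
# pandas indexes, matching what xarray.open_dataset would do.
vds = open_virtual_dataset("air.nc")

# Recovering the old behaviour: load no variables and create no indexes.
vds_old = open_virtual_dataset("air.nc", loadable_variables=[], indexes={})
```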

Deprecations
~~~~~~~~~~~~

79 changes: 41 additions & 38 deletions docs/usage.md
@@ -28,27 +28,27 @@ vds = open_virtual_dataset('air.nc')

(Notice we did not have to explicitly indicate the file format, as {py:func}`open_virtual_dataset <virtualizarr.open_virtual_dataset>` will attempt to automatically infer it.)

Printing this "virtual dataset" shows that although it is an instance of `xarray.Dataset`, unlike a typical xarray dataset, it does not contain numpy or dask arrays, but instead it wraps {py:class}`ManifestArray <virtualizarr.manifests.ManifestArray>` objects.
Printing this "virtual dataset" shows that although it is an instance of `xarray.Dataset`, unlike a typical xarray dataset it wraps {py:class}`ManifestArray <virtualizarr.manifests.ManifestArray>` objects in addition to a few in-memory numpy arrays.

```python
vds
```

```
<xarray.Dataset> Size: 8MB
Dimensions: (time: 2920, lat: 25, lon: 53)
<xarray.Dataset> Size: 31MB
Dimensions: (lat: 25, lon: 53, time: 2920)
Coordinates:
lat (lat) float32 100B ManifestArray<shape=(25,), dtype=float32, chu...
lon (lon) float32 212B ManifestArray<shape=(53,), dtype=float32, chu...
time (time) float32 12kB ManifestArray<shape=(2920,), dtype=float32, ...
* lat (lat) float32 100B 75.0 72.5 70.0 67.5 65.0 ... 22.5 20.0 17.5 15.0
* lon (lon) float32 212B 200.0 202.5 205.0 207.5 ... 325.0 327.5 330.0
* time (time) datetime64[ns] 23kB 2013-01-01 ... 2014-12-31T18:00:00
Data variables:
air (time, lat, lon) int16 8MB ManifestArray<shape=(2920, 25, 53), d...
air (time, lat, lon) float64 31MB ManifestArray<shape=(2920, 25, 53)...
Attributes:
Conventions: COARDS
title: 4x daily NMC reanalysis (1948)
description: Data is from NMC initialized reanalysis\n(4x/day). These a...
platform: Model
references: http://www.esrl.noaa.gov/psd/data/gridded/data.ncep.reanaly...
title: 4x daily NMC reanalysis (1948)
```

Generally a "virtual dataset" is any `xarray.Dataset` which wraps one or more {py:class}`ManifestArray <virtualizarr.manifests.ManifestArray>` objects.
@@ -70,7 +70,7 @@ vds.virtualize.nbytes
```

```
128
23704
```

```{important} Virtual datasets are not normal xarray datasets!
@@ -230,7 +230,9 @@ But before we combine our data, we might want to consider loading some variables

## Loading variables

Whilst the values of virtual variables (i.e. those backed by `ManifestArray` objects) cannot be loaded into memory, you do have the option of opening specific variables from the file as loadable lazy numpy/dask arrays, just like `xr.open_dataset` normally returns. These variables are specified using the `loadable_variables` argument:
Whilst the values of virtual variables (i.e. those backed by `ManifestArray` objects) cannot be loaded into memory, you do have the option of opening specific variables from the file as loadable lazy numpy arrays, just like `xr.open_dataset` normally returns.

Which variables to open this way can be specified using the `loadable_variables` argument:

```python
vds = open_virtual_dataset('air.nc', loadable_variables=['air', 'time'])
@@ -240,17 +242,17 @@
<xarray.Dataset> Size: 31MB
Dimensions: (time: 2920, lat: 25, lon: 53)
Coordinates:
* time (time) datetime64[ns] 23kB 2013-01-01 ... 2014-12-31T18:00:00
lat (lat) float32 100B ManifestArray<shape=(25,), dtype=float32, chu...
lon (lon) float32 212B ManifestArray<shape=(53,), dtype=float32, chu...
* time (time) datetime64[ns] 23kB 2013-01-01 ... 2014-12-31T18:00:00
Data variables:
air (time, lat, lon) float64 31MB ...
Attributes:
Conventions: COARDS
title: 4x daily NMC reanalysis (1948)
description: Data is from NMC initialized reanalysis\n(4x/day). These a...
platform: Model
references: http://www.esrl.noaa.gov/psd/data/gridded/data.ncep.reanaly...
title: 4x daily NMC reanalysis (1948)
```

You can see that the dataset contains a mixture of virtual variables backed by `ManifestArray` objects (`lat` and `lon`), and loadable variables backed by (lazy) numpy arrays (`air` and `time`).
@@ -261,18 +263,21 @@ Loading variables can be useful in a few scenarios:
2. You want in-memory indexes to use with `xr.combine_by_coords`,
3. Storing a variable on-disk as a set of references would be inefficient, e.g. because it's a very small array (saving the values like this is similar to kerchunk's concept of "inlining" data),
4. The variable has encoding, and the simplest way to decode it correctly is to let xarray's standard decoding machinery load it into memory and apply the decoding,
5. Some of your variables have inconsistent-length chunks, and you want to be able to concatenate them together. For example you might have multiple virtual datasets with coordinates of inconsistent length (e.g., leap years within multi-year daily data).
5. Some of your variables have inconsistent-length chunks, and you want to be able to concatenate them together. For example you might have multiple virtual datasets with coordinates of inconsistent length (e.g., leap years within multi-year daily data). Loading them allows you to rechunk them however you like.

The default value of `loadable_variables` is `None`, which effectively specifies all the "dimension coordinates" in the file, i.e. all one-dimensional coordinate variables whose name is the same as the name of their only dimension. Xarray indexes will also be automatically created for these variables. Together these defaults mean that your virtual dataset will be opened with the same indexes as it would have been if it had been opened with just `xarray.open_dataset()`.
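
As a minimal sketch (coordinate names taken from the `air.nc` example above), the default is therefore roughly equivalent to listing the dimension coordinates explicitly:

```python
# air.nc has the dimension coordinates 'time', 'lat' and 'lon', so these
# two calls should produce equivalent virtual datasets.
vds_default = open_virtual_dataset('air.nc')
vds_explicit = open_virtual_dataset('air.nc', loadable_variables=['time', 'lat', 'lon'])
```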

### Loading low-dimensional coordinates
```{note}
In general, it is recommended to load all of your low-dimensional variables.

In general, it is recommended to load all of your low-dimensional coordinates.
This will slow down your initial opening of the individual virtual datasets, but by loading your coordinates into memory, they can be inlined in the reference file for fast reads of the virtualized store.
However, doing this for coordinates that are N-dimensional might use a lot of storage duplicating them.
Also, anything duplicated could become out of sync with the referenced original files, especially if not using a transactional storage engine like `Icechunk`.
Whilst this does mean the original data will be duplicated in your new virtual zarr store, by loading your coordinates into memory they can be inlined in the reference file for fast reads from the virtual store.

However, you should not do this for higher-dimensional variables, as then you might use a lot of storage duplicating them, defeating the point of the virtual zarr approach. Also, anything duplicated could become out of sync with the referenced original files, especially if not using a transactional storage engine like `Icechunk`.
```

### CF-encoded time variables

To correctly decode time variables according to the CF conventions, you need to pass `time` to `loadable_variables` and ensure the `decode_times` argument of `open_virtual_dataset` is set to True (`decode_times` defaults to None).
To decode time variables according to the CF conventions, you must ensure `time` is one of the `loadable_variables` and the `decode_times` argument of `open_virtual_dataset` is set to `True` (`decode_times` defaults to None).

```python
vds = open_virtual_dataset(
@@ -286,17 +291,17 @@ vds = open_virtual_dataset(
<xarray.Dataset> Size: 31MB
Dimensions: (time: 2920, lat: 25, lon: 53)
Coordinates:
* time (time) datetime64[ns] 23kB 2013-01-01 ... 2014-12-31T18:00:00
lat (lat) float32 100B ManifestArray<shape=(25,), dtype=float32, chu...
lon (lon) float32 212B ManifestArray<shape=(53,), dtype=float32, chu...
time (time) datetime64[ns] 23kB 2013-01-01T00:02:06.757437440 ... 201...
Data variables:
air (time, lat, lon) float64 31MB ...
Attributes:
Conventions: COARDS
title: 4x daily NMC reanalysis (1948)
description: Data is from NMC initialized reanalysis\n(4x/day). These a...
platform: Model
references: http://www.esrl.noaa.gov/psd/data/gridded/data.ncep.reanaly...
title: 4x daily NMC reanalysis (1948)
```

## Combining virtual datasets
@@ -328,26 +333,26 @@ vds2 = open_virtual_dataset('air2.nc')

As we know the correct order a priori, we can just combine along one dimension using `xarray.concat`.

```
combined_vds = xr.concat([vds1, vds2], dim='time', coords='minimal', compat='override')
```python
combined_vds = xr.concat([vds1, vds2], dim='time')
combined_vds
```

```
<xarray.Dataset> Size: 8MB
<xarray.Dataset> Size: 31MB
Dimensions: (time: 2920, lat: 25, lon: 53)
Coordinates:
lat (lat) float32 100B ManifestArray<shape=(25,), dtype=float32, chu...
lon (lon) float32 212B ManifestArray<shape=(53,), dtype=float32, chu...
time (time) float32 12kB ManifestArray<shape=(2920,), dtype=float32, ...
* lat (lat) float32 100B 75.0 72.5 70.0 67.5 65.0 ... 22.5 20.0 17.5 15.0
* lon (lon) float32 212B 200.0 202.5 205.0 207.5 ... 325.0 327.5 330.0
* time (time) datetime64[ns] 23kB 2013-01-01 ... 2014-12-31T18:00:00
Data variables:
air (time, lat, lon) int16 8MB ManifestArray<shape=(2920, 25, 53), d...
air (time, lat, lon) float64 31MB ManifestArray<shape=(2920, 25, 53)...
Attributes:
Conventions: COARDS
title: 4x daily NMC reanalysis (1948)
description: Data is from NMC initialized reanalysis\n(4x/day). These a...
platform: Model
references: http://www.esrl.noaa.gov/psd/data/gridded/data.ncep.reanaly...
title: 4x daily NMC reanalysis (1948)
```

We can see that the resulting combined manifest has two chunks, as expected.
@@ -362,32 +367,30 @@ combined_vds['air'].data.manifest.dict()
```

```{note}
The keyword arguments `coords='minimal', compat='override'` are currently necessary because the default behaviour of xarray will attempt to load coordinates in order to check their compatibility with one another. In future this [default will be changed](https://github.com/pydata/xarray/issues/8778), such that passing these two arguments explicitly will become unnecessary.
If you have any virtual coordinate variables, you will likely need to specify the keyword arguments `coords='minimal'` and `compat='override'` to `xarray.concat()`, because the default behaviour of xarray will attempt to load coordinates in order to check their compatibility with one another. In future this [default will be changed](https://github.com/pydata/xarray/issues/8778), such that passing these two arguments explicitly will become unnecessary.
```

The general multi-dimensional version of this concatenation-by-order-supplied can be achieved using `xarray.combine_nested`.
The general multi-dimensional version of this concatenation-by-order-supplied can be achieved using `xarray.combine_nested()`.

```python
combined_vds = xr.combine_nested([vds1, vds2], concat_dim=['time'], coords='minimal', compat='override')
combined_vds = xr.combine_nested([vds1, vds2], concat_dim=['time'])
```

In N-dimensions the datasets would need to be passed as an N-deep nested list-of-lists, see the [xarray docs](https://docs.xarray.dev/en/stable/user-guide/combining.html#combining-along-multiple-dimensions).
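
For instance, a hypothetical 2-D tiling (the four dataset names here are assumed for illustration) would be passed as a 2-deep nested list:

```python
# Four assumed virtual datasets tiling a (time, space) grid, concatenated
# along both dimensions at once.
combined_vds = xr.combine_nested(
    [[vds_t0_s0, vds_t0_s1],
     [vds_t1_s0, vds_t1_s1]],
    concat_dim=['time', 'space'],
)
```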

```{note}
For manual concatenation we can actually avoid creating any xarray indexes, as we won't need them. Without indexes we can avoid loading any data whatsoever from the files. However, you should first be confident that the archival files actually do have compatible data, as the coordinate values then cannot be efficiently compared for consistency (i.e. aligned).

By default indexes are created for 1-dimensional ``loadable_variables`` whose name matches their only dimension (i.e. "dimension coordinates"), but if you wish you can load variables without creating any indexes by passing ``indexes={}`` to ``open_virtual_dataset``.
```
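
A minimal sketch of that index-free path, reusing `air1.nc` and `air2.nc` from above and assuming we already trust the files to be compatible:

```python
# Dimension coordinates are still loaded, but no pandas indexes are built.
vds1 = open_virtual_dataset('air1.nc', indexes={})
vds2 = open_virtual_dataset('air2.nc', indexes={})

# Without indexes the coordinates cannot be automatically aligned, so skip
# the compatibility checks when concatenating manually.
combined_vds = xr.concat([vds1, vds2], dim='time', coords='minimal', compat='override')
```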

### Ordering by coordinate values

If you're happy to load 1D dimension coordinates into memory, you can use their values to do the ordering for you!

```python
vds1 = open_virtual_dataset('air1.nc', loadable_variables=['time', 'lat', 'lon'])
vds2 = open_virtual_dataset('air2.nc', loadable_variables=['time', 'lat', 'lon'])
vds1 = open_virtual_dataset('air1.nc')
vds2 = open_virtual_dataset('air2.nc')

combined_vds = xr.combine_by_coords([vds2, vds1], coords='minimal', compat='override')
combined_vds = xr.combine_by_coords([vds2, vds1])
```

Notice we don't have to specify the concatenation dimension explicitly - xarray works out the correct ordering for us. Even though we actually passed in the virtual datasets in the wrong order just now, the manifest still has the chunks listed in the correct order such that the 1-dimensional `time` coordinate has ascending values:
Expand Down
19 changes: 8 additions & 11 deletions virtualizarr/backend.py
@@ -17,8 +17,8 @@
NetCDF3VirtualBackend,
TIFFVirtualBackend,
)
from virtualizarr.readers.common import VirtualBackend
from virtualizarr.utils import _FsspecFSFromFilepath, check_for_collisions
from virtualizarr.readers.api import VirtualBackend
from virtualizarr.utils import _FsspecFSFromFilepath

# TODO add entrypoint to allow external libraries to add to this mapping
VIRTUAL_BACKENDS = {
@@ -112,11 +112,13 @@ def open_virtual_dataset(
backend: type[VirtualBackend] | None = None,
) -> Dataset:
"""
Open a file or store as an xarray Dataset wrapping virtualized zarr arrays.
Open a file or store as an xarray.Dataset wrapping virtualized zarr arrays.

No data variables will be loaded unless specified in the ``loadable_variables`` kwarg (in which case they will be xarray lazily indexed arrays).

Xarray indexes can optionally be created (the default behaviour). To avoid creating any xarray indexes pass ``indexes={}``.
Some variables can be opened as loadable lazy numpy arrays. This can be controlled explicitly using the ``loadable_variables`` keyword argument.
By default this will be the same variables which `xarray.open_dataset` would create indexes for: i.e. one-dimensional coordinate variables whose
name matches the name of their only dimension (also known as "dimension coordinates").
Pandas indexes will also be created by default for these loadable variables, but this can be controlled by passing a value for the ``indexes`` keyword argument.
To avoid creating any xarray indexes pass ``indexes={}``.

Parameters
----------
@@ -159,11 +161,6 @@
stacklevel=2,
)

drop_variables, loadable_variables = check_for_collisions(
drop_variables,
loadable_variables,
)

if reader_options is None:
reader_options = {}

33 changes: 33 additions & 0 deletions virtualizarr/readers/api.py
@@ -0,0 +1,33 @@
from abc import ABC
from collections.abc import Iterable, Mapping
from typing import Optional

import xarray as xr


class VirtualBackend(ABC):
@staticmethod
def open_virtual_dataset(
filepath: str,
group: str | None = None,
drop_variables: Iterable[str] | None = None,
loadable_variables: Iterable[str] | None = None,
decode_times: bool | None = None,
indexes: Mapping[str, xr.Index] | None = None,
virtual_backend_kwargs: Optional[dict] = None,
reader_options: Optional[dict] = None,
) -> xr.Dataset:
raise NotImplementedError()

@staticmethod
def open_virtual_datatree(
path: str,
group: str | None = None,
drop_variables: Iterable[str] | None = None,
loadable_variables: Iterable[str] | None = None,
decode_times: bool | None = None,
indexes: Mapping[str, xr.Index] | None = None,
virtual_backend_kwargs: Optional[dict] = None,
reader_options: Optional[dict] = None,
) -> xr.DataTree:
raise NotImplementedError()
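
A hypothetical third-party reader might then implement this interface as follows (the format name and parsing step are assumptions for illustration, not part of this PR):

```python
import xarray as xr

from virtualizarr.readers.api import VirtualBackend


class MyFormatVirtualBackend(VirtualBackend):
    """Sketch of a reader for a hypothetical 'myformat' file type."""

    @staticmethod
    def open_virtual_dataset(
        filepath,
        group=None,
        drop_variables=None,
        loadable_variables=None,
        decode_times=None,
        indexes=None,
        virtual_backend_kwargs=None,
        reader_options=None,
    ) -> xr.Dataset:
        # A real implementation would parse the file's metadata, construct
        # ManifestArray-backed variables pointing at each chunk's byte range,
        # and return them wrapped in an xarray.Dataset.
        raise NotImplementedError("illustrative sketch only")
```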