diff --git a/conftest.py b/conftest.py index 0781c37e2..875f8db2e 100644 --- a/conftest.py +++ b/conftest.py @@ -61,7 +61,7 @@ def netcdf4_file_with_data_in_multiple_groups(tmp_path: Path) -> str: @pytest.fixture -def netcdf4_files_factory(tmp_path: Path) -> Callable: +def netcdf4_files_factory(tmp_path: Path) -> Callable[[], tuple[str, str]]: def create_netcdf4_files( encoding: Optional[Mapping[str, Mapping[str, Any]]] = None, ) -> tuple[str, str]: @@ -96,7 +96,8 @@ def netcdf4_file_with_2d_coords(tmp_path: Path) -> str: def netcdf4_virtual_dataset(netcdf4_file): from virtualizarr import open_virtual_dataset - return open_virtual_dataset(netcdf4_file, indexes={}) + with open_virtual_dataset(netcdf4_file, indexes={}) as ds: + yield ds @pytest.fixture diff --git a/virtualizarr/backend.py b/virtualizarr/backend.py index 5fbd4fcd4..d17f336e5 100644 --- a/virtualizarr/backend.py +++ b/virtualizarr/backend.py @@ -109,9 +109,9 @@ def open_virtual_dataset( cftime_variables: Iterable[str] | None = None, indexes: Mapping[str, Index] | None = None, virtual_array_class=ManifestArray, - virtual_backend_kwargs: Optional[dict] = None, - reader_options: Optional[dict] = None, - backend: Optional[VirtualBackend] = None, + virtual_backend_kwargs: dict | None = None, + reader_options: dict | None = None, + backend: type[VirtualBackend] | None = None, ) -> Dataset: """ Open a file or store as an xarray Dataset wrapping virtualized zarr arrays. diff --git a/virtualizarr/tests/test_backend.py b/virtualizarr/tests/test_backend.py index 1b27b0f27..4a845d9f1 100644 --- a/virtualizarr/tests/test_backend.py +++ b/virtualizarr/tests/test_backend.py @@ -1,4 +1,5 @@ from collections.abc import Mapping +from pathlib import Path from unittest.mock import patch import numpy as np @@ -9,7 +10,11 @@ from xarray.core.indexes import Index from virtualizarr import open_virtual_dataset -from virtualizarr.backend import FileType, automatically_determine_filetype +from virtualizarr.backend import ( + FileType, + VirtualBackend, + automatically_determine_filetype, +) from virtualizarr.manifests import ManifestArray from virtualizarr.readers import HDF5VirtualBackend from virtualizarr.readers.hdf import HDFVirtualBackend @@ -96,16 +101,17 @@ def test_create_default_indexes_for_loadable_variables( ): loadable_variables = ["time", "lat"] - vds = open_virtual_dataset( - netcdf4_file, - indexes=None, - backend=hdf_backend, - loadable_variables=loadable_variables, - ) - ds = open_dataset(netcdf4_file, decode_times=True) - - # TODO use xr.testing.assert_identical(vds.indexes, ds.indexes) instead once class supported by assertion comparison, see https://github.com/pydata/xarray/issues/5812 - assert index_mappings_equal(vds.xindexes, ds[loadable_variables].xindexes) + with ( + open_virtual_dataset( + netcdf4_file, + indexes=None, + backend=hdf_backend, + loadable_variables=loadable_variables, + ) as vds, + open_dataset(netcdf4_file, decode_times=True) as ds, + ): + # TODO use xr.testing.assert_identical(vds.indexes, ds.indexes) instead once class supported by assertion comparison, see https://github.com/pydata/xarray/issues/5812 + assert index_mappings_equal(vds.xindexes, ds[loadable_variables].xindexes) def index_mappings_equal(indexes1: Mapping[str, Index], indexes2: Mapping[str, Index]): @@ -127,7 +133,7 @@ def index_mappings_equal(indexes1: Mapping[str, Index], indexes2: Mapping[str, I @requires_hdf5plugin @requires_imagecodecs @parametrize_over_hdf_backends -def test_cftime_index(tmpdir, hdf_backend): +def test_cftime_index(tmp_path: Path, hdf_backend: type[VirtualBackend]): """Ensure a virtual dataset contains the same indexes as an Xarray dataset""" # Note: Test was created to debug: https://github.com/zarr-developers/VirtualiZarr/issues/168 ds = xr.Dataset( @@ -141,18 +147,19 @@ def test_cftime_index(tmpdir, hdf_backend): }, attrs={"attr1_key": "attr1_val"}, ) - ds.to_netcdf(f"{tmpdir}/tmp.nc") - vds = open_virtual_dataset( - f"{tmpdir}/tmp.nc", + ds.to_netcdf(str(tmp_path / "tmp.nc")) + + with open_virtual_dataset( + str(tmp_path / "tmp.nc"), loadable_variables=["time", "lat", "lon"], indexes={}, backend=hdf_backend, - ) - # TODO use xr.testing.assert_identical(vds.indexes, ds.indexes) instead once class supported by assertion comparison, see https://github.com/pydata/xarray/issues/5812 - assert index_mappings_equal(vds.xindexes, ds.xindexes) - assert list(ds.coords) == list(vds.coords) - assert vds.dims == ds.dims - assert vds.attrs == ds.attrs + ) as vds: + # TODO use xr.testing.assert_identical(vds.indexes, ds.indexes) instead once class supported by assertion comparison, see https://github.com/pydata/xarray/issues/5812 + assert index_mappings_equal(vds.xindexes, ds.xindexes) + assert list(ds.coords) == list(vds.coords) + assert vds.dims == ds.dims + assert vds.attrs == ds.attrs @parametrize_over_hdf_backends @@ -176,25 +183,24 @@ def test_coordinate_variable_attrs_preserved(self, netcdf4_file, hdf_backend): @parametrize_over_hdf_backends class TestDetermineCoords: def test_infer_one_dimensional_coords(self, netcdf4_file, hdf_backend): - vds = open_virtual_dataset(netcdf4_file, indexes={}, backend=hdf_backend) - assert set(vds.coords) == {"time", "lat", "lon"} + with open_virtual_dataset(netcdf4_file, indexes={}, backend=hdf_backend) as vds: + assert set(vds.coords) == {"time", "lat", "lon"} def test_var_attr_coords(self, netcdf4_file_with_2d_coords, hdf_backend): - vds = open_virtual_dataset( + with open_virtual_dataset( netcdf4_file_with_2d_coords, indexes={}, backend=hdf_backend - ) - - expected_dimension_coords = ["ocean_time", "s_rho"] - expected_2d_coords = ["lon_rho", "lat_rho", "h"] - expected_1d_non_dimension_coords = ["Cs_r"] - expected_scalar_coords = ["hc", "Vtransform"] - expected_coords = ( - expected_dimension_coords - + expected_2d_coords - + expected_1d_non_dimension_coords - + expected_scalar_coords - ) - assert set(vds.coords) == set(expected_coords) + ) as vds: + expected_dimension_coords = ["ocean_time", "s_rho"] + expected_2d_coords = ["lon_rho", "lat_rho", "h"] + expected_1d_non_dimension_coords = ["Cs_r"] + expected_scalar_coords = ["hc", "Vtransform"] + expected_coords = ( + expected_dimension_coords + + expected_2d_coords + + expected_1d_non_dimension_coords + + expected_scalar_coords + ) + assert set(vds.coords) == set(expected_coords) @requires_network @@ -208,16 +214,15 @@ def test_anon_read_s3(self, indexes, hdf_backend): """Parameterized tests for empty vs supplied indexes and filetypes.""" # TODO: Switch away from this s3 url after minIO is implemented. fpath = "s3://carbonplan-share/virtualizarr/local.nc" - vds = open_virtual_dataset( + with open_virtual_dataset( fpath, indexes=indexes, reader_options={"storage_options": {"anon": True}}, backend=hdf_backend, - ) - - assert vds.dims == {"time": 2920, "lat": 25, "lon": 53} - for var in vds.variables: - assert isinstance(vds[var].data, ManifestArray), var + ) as vds: + assert vds.dims == {"time": 2920, "lat": 25, "lon": 53} + for var in vds.variables: + assert isinstance(vds[var].data, ManifestArray), var @requires_network @@ -271,24 +276,20 @@ class TestReadFromURL: def test_read_from_url(self, hdf_backend, filetype, url): if filetype in ["grib", "jpg", "hdf4"]: with pytest.raises(NotImplementedError): - vds = open_virtual_dataset( - url, - reader_options={}, - indexes={}, - ) + open_virtual_dataset(url, reader_options={}, indexes={}) elif filetype == "hdf5": - vds = open_virtual_dataset( + with open_virtual_dataset( url, group="science/LSAR/GCOV/grids/frequencyA", drop_variables=["listOfCovarianceTerms", "listOfPolarizations"], indexes={}, reader_options={}, backend=hdf_backend, - ) - assert isinstance(vds, xr.Dataset) + ) as vds: + assert isinstance(vds, xr.Dataset) else: - vds = open_virtual_dataset(url, indexes={}) - assert isinstance(vds, xr.Dataset) + with open_virtual_dataset(url, indexes={}) as vds: + assert isinstance(vds, xr.Dataset) @pytest.mark.skip(reason="often times out, as nisar file is 200MB") def test_virtualizarr_vs_local_nisar(self, hdf_backend): @@ -299,51 +300,56 @@ def test_virtualizarr_vs_local_nisar(self, hdf_backend): tmpfile = fsspec.open_local( f"filecache::{url}", filecache=dict(cache_storage="/tmp", same_names=True) ) + assert isinstance(tmpfile, str) # make type-checkers happy hdf_group = "science/LSAR/GCOV/grids/frequencyA" - dsXR = xr.open_dataset( - tmpfile, - engine="h5netcdf", - group=hdf_group, - drop_variables=["listOfCovarianceTerms", "listOfPolarizations"], - phony_dims="access", - ) - # save group reference file via virtualizarr, then open with engine="kerchunk" - vds = open_virtual_dataset( - tmpfile, - group=hdf_group, - indexes={}, - drop_variables=["listOfCovarianceTerms", "listOfPolarizations"], - backend=hdf_backend, - ) - tmpref = "/tmp/cmip6.json" - vds.virtualize.to_kerchunk(tmpref, format="json") - dsV = xr.open_dataset(tmpref, engine="kerchunk") + with ( + xr.open_dataset( + tmpfile, + engine="h5netcdf", + group=hdf_group, + drop_variables=["listOfCovarianceTerms", "listOfPolarizations"], + phony_dims="access", + ) as dsXR, + # save group reference file via virtualizarr, then open with engine="kerchunk" + open_virtual_dataset( + tmpfile, + group=hdf_group, + indexes={}, + drop_variables=["listOfCovarianceTerms", "listOfPolarizations"], + backend=hdf_backend, + ) as vds, + ): + tmpref = "/tmp/cmip6.json" + vds.virtualize.to_kerchunk(tmpref, format="json") - # xrt.assert_identical(dsXR, dsV) #Attribute order changes - xrt.assert_equal(dsXR, dsV) + with xr.open_dataset(tmpref, engine="kerchunk") as dsV: + # xrt.assert_identical(dsXR, dsV) #Attribute order changes + xrt.assert_equal(dsXR, dsV) @parametrize_over_hdf_backends class TestOpenVirtualDatasetHDFGroup: def test_open_empty_group(self, empty_netcdf4_file, hdf_backend): - vds = open_virtual_dataset(empty_netcdf4_file, indexes={}, backend=hdf_backend) - assert isinstance(vds, xr.Dataset) - expected = Dataset() - xrt.assert_identical(vds, expected) + with open_virtual_dataset( + empty_netcdf4_file, indexes={}, backend=hdf_backend + ) as vds: + assert isinstance(vds, xr.Dataset) + expected = Dataset() + xrt.assert_identical(vds, expected) def test_open_subgroup( self, netcdf4_file_with_data_in_multiple_groups, hdf_backend ): - vds = open_virtual_dataset( + with open_virtual_dataset( netcdf4_file_with_data_in_multiple_groups, group="subgroup", indexes={}, backend=hdf_backend, - ) - assert list(vds.variables) == ["bar"] - assert isinstance(vds["bar"].data, ManifestArray) - assert vds["bar"].shape == (2,) + ) as vds: + assert list(vds.variables) == ["bar"] + assert isinstance(vds["bar"].data, ManifestArray) + assert vds["bar"].shape == (2,) @pytest.mark.parametrize("group", ["", None]) def test_open_root_group( @@ -352,15 +358,15 @@ def test_open_root_group( hdf_backend, group, ): - vds = open_virtual_dataset( + with open_virtual_dataset( netcdf4_file_with_data_in_multiple_groups, group=group, indexes={}, backend=hdf_backend, - ) - assert list(vds.variables) == ["foo"] - assert isinstance(vds["foo"].data, ManifestArray) - assert vds["foo"].shape == (3,) + ) as vds: + assert list(vds.variables) == ["foo"] + assert isinstance(vds["foo"].data, ManifestArray) + assert vds["foo"].shape == (3,) @requires_hdf5plugin @@ -369,36 +375,37 @@ class TestLoadVirtualDataset: @parametrize_over_hdf_backends def test_loadable_variables(self, netcdf4_file, hdf_backend): vars_to_load = ["air", "time"] - vds = open_virtual_dataset( - netcdf4_file, - loadable_variables=vars_to_load, - indexes={}, - backend=hdf_backend, - ) - - for name in vds.variables: - if name in vars_to_load: - assert isinstance(vds[name].data, np.ndarray), name - else: - assert isinstance(vds[name].data, ManifestArray), name - - full_ds = xr.open_dataset(netcdf4_file, decode_times=True) - - for name in full_ds.variables: - if name in vars_to_load: - xrt.assert_identical(vds.variables[name], full_ds.variables[name]) + with ( + open_virtual_dataset( + netcdf4_file, + loadable_variables=vars_to_load, + indexes={}, + backend=hdf_backend, + ) as vds, + xr.open_dataset(netcdf4_file, decode_times=True) as full_ds, + ): + for name in vds.variables: + if name in vars_to_load: + assert isinstance(vds[name].data, np.ndarray), name + else: + assert isinstance(vds[name].data, ManifestArray), name + + for name in full_ds.variables: + if name in vars_to_load: + xrt.assert_identical(vds.variables[name], full_ds.variables[name]) def test_explicit_filetype(self, netcdf4_file): with pytest.raises(ValueError): open_virtual_dataset(netcdf4_file, filetype="unknown") with pytest.raises(ValueError): - open_virtual_dataset(netcdf4_file, filetype=ManifestArray) + open_virtual_dataset(netcdf4_file, filetype=ManifestArray) # type: ignore with pytest.raises(NotImplementedError): open_virtual_dataset(netcdf4_file, filetype="grib") - open_virtual_dataset(netcdf4_file, filetype="netCDF4") + with open_virtual_dataset(netcdf4_file, filetype="netCDF4"): + pass def test_explicit_filetype_and_backend(self, netcdf4_file): with pytest.raises(ValueError): @@ -410,30 +417,31 @@ def test_explicit_filetype_and_backend(self, netcdf4_file): def test_group_kwarg(self, hdf5_groups_file, hdf_backend): if hdf_backend == HDFVirtualBackend: with pytest.raises(KeyError, match="doesn't exist"): - open_virtual_dataset( + with open_virtual_dataset( hdf5_groups_file, group="doesnt_exist", backend=hdf_backend - ) + ): + pass if hdf_backend == HDF5VirtualBackend: with pytest.raises(ValueError, match="not found in"): - open_virtual_dataset( + with open_virtual_dataset( hdf5_groups_file, group="doesnt_exist", backend=hdf_backend - ) + ): + pass vars_to_load = ["air", "time"] - vds = open_virtual_dataset( - hdf5_groups_file, - group="test/group", - loadable_variables=vars_to_load, - indexes={}, - backend=hdf_backend, - ) - full_ds = xr.open_dataset( - hdf5_groups_file, - group="test/group", - ) - for name in full_ds.variables: - if name in vars_to_load: - xrt.assert_identical(vds.variables[name], full_ds.variables[name]) + with ( + open_virtual_dataset( + hdf5_groups_file, + group="test/group", + loadable_variables=vars_to_load, + indexes={}, + backend=hdf_backend, + ) as vds, + xr.open_dataset(hdf5_groups_file, group="test/group") as full_ds, + ): + for name in full_ds.variables: + if name in vars_to_load: + xrt.assert_identical(vds.variables[name], full_ds.variables[name]) @pytest.mark.xfail(reason="patches a function which no longer exists") @patch("virtualizarr.translators.kerchunk.read_kerchunk_references_from_file") @@ -441,7 +449,10 @@ def test_open_virtual_dataset_passes_expected_args( self, mock_read_kerchunk, netcdf4_file ): reader_options = {"option1": "value1", "option2": "value2"} - open_virtual_dataset(netcdf4_file, indexes={}, reader_options=reader_options) + with open_virtual_dataset( + netcdf4_file, indexes={}, reader_options=reader_options + ): + pass args = { "filepath": netcdf4_file, "filetype": None, @@ -452,12 +463,12 @@ def test_open_virtual_dataset_passes_expected_args( @parametrize_over_hdf_backends def test_open_dataset_with_empty(self, hdf5_empty, hdf_backend): - vds = open_virtual_dataset(hdf5_empty, backend=hdf_backend) - assert vds.empty.dims == () - assert vds.empty.attrs == {"empty": "true"} + with open_virtual_dataset(hdf5_empty, backend=hdf_backend) as vds: + assert vds.empty.dims == () + assert vds.empty.attrs == {"empty": "true"} @parametrize_over_hdf_backends def test_open_dataset_with_scalar(self, hdf5_scalar, hdf_backend): - vds = open_virtual_dataset(hdf5_scalar, backend=hdf_backend) - assert vds.scalar.dims == () - assert vds.scalar.attrs == {"scalar": "true"} + with open_virtual_dataset(hdf5_scalar, backend=hdf_backend) as vds: + assert vds.scalar.dims == () + assert vds.scalar.attrs == {"scalar": "true"} diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py index 57a7cb7b6..e84152095 100644 --- a/virtualizarr/tests/test_integration.py +++ b/virtualizarr/tests/test_integration.py @@ -1,5 +1,6 @@ from os.path import relpath from pathlib import Path +from typing import Callable, Concatenate, TypeAlias import numpy as np import pytest @@ -7,6 +8,7 @@ import xarray.testing as xrt from virtualizarr import open_virtual_dataset +from virtualizarr.backend import VirtualBackend from virtualizarr.manifests import ChunkManifest, ManifestArray from virtualizarr.tests import ( has_fastparquet, @@ -21,6 +23,8 @@ ) from virtualizarr.zarr import ZArray +RoundtripFunction: TypeAlias = Callable[Concatenate[xr.Dataset, Path, ...], xr.Dataset] + def test_kerchunk_roundtrip_in_memory_no_concat(): # Set up example xarray dataset @@ -78,17 +82,17 @@ def test_numpy_arrays_to_inlined_kerchunk_refs( ).translate() # loading the variables should produce same result as inlining them using kerchunk - vds = open_virtual_dataset( + with open_virtual_dataset( netcdf4_file, loadable_variables=vars_to_inline, indexes={}, backend=hdf_backend - ) - refs = vds.virtualize.to_kerchunk(format="dict") + ) as vds: + refs = vds.virtualize.to_kerchunk(format="dict") - # TODO I would just compare the entire dicts but kerchunk returns inconsistent results - see https://github.com/TomNicholas/VirtualiZarr/pull/73#issuecomment-2040931202 - # assert refs == expected - assert refs["refs"]["air/0.0.0"] == expected["refs"]["air/0.0.0"] - assert refs["refs"]["lon/0"] == expected["refs"]["lon/0"] - assert refs["refs"]["lat/0"] == expected["refs"]["lat/0"] - assert refs["refs"]["time/0"] == expected["refs"]["time/0"] + # TODO I would just compare the entire dicts but kerchunk returns inconsistent results - see https://github.com/TomNicholas/VirtualiZarr/pull/73#issuecomment-2040931202 + # assert refs == expected + assert refs["refs"]["air/0.0.0"] == expected["refs"]["air/0.0.0"] + assert refs["refs"]["lon/0"] == expected["refs"]["lon/0"] + assert refs["refs"]["lat/0"] == expected["refs"]["lat/0"] + assert refs["refs"]["time/0"] == expected["refs"]["time/0"] def roundtrip_as_kerchunk_dict(vds: xr.Dataset, tmpdir, **kwargs): @@ -145,82 +149,109 @@ def roundtrip_as_in_memory_icechunk(vds: xr.Dataset, tmpdir, **kwargs): ) class TestRoundtrip: @parametrize_over_hdf_backends - def test_roundtrip_no_concat(self, tmpdir, roundtrip_func, hdf_backend): - # set up example xarray dataset - ds = xr.tutorial.open_dataset("air_temperature", decode_times=False) - - # save it to disk as netCDF (in temporary directory) - ds.to_netcdf(f"{tmpdir}/air.nc") + def test_roundtrip_no_concat( + self, + tmp_path, + roundtrip_func: RoundtripFunction, + hdf_backend: type[VirtualBackend], + ): + air_nc_path = tmp_path / "air.nc" - # use open_dataset_via_kerchunk to read it as references - vds = open_virtual_dataset(f"{tmpdir}/air.nc", indexes={}, backend=hdf_backend) + # set up example xarray dataset + with xr.tutorial.open_dataset("air_temperature", decode_times=False) as ds: + # save it to disk as netCDF (in temporary directory) + ds.to_netcdf(air_nc_path) - roundtrip = roundtrip_func(vds, tmpdir, decode_times=False) + # use open_dataset_via_kerchunk to read it as references + with open_virtual_dataset( + str(air_nc_path), indexes={}, backend=hdf_backend + ) as vds: + roundtrip = roundtrip_func(vds, tmp_path, decode_times=False) - # assert all_close to original dataset - xrt.assert_allclose(roundtrip, ds) + # assert all_close to original dataset + xrt.assert_allclose(roundtrip, ds) - # assert coordinate attributes are maintained - for coord in ds.coords: - assert ds.coords[coord].attrs == roundtrip.coords[coord].attrs + # assert coordinate attributes are maintained + for coord in ds.coords: + assert ds.coords[coord].attrs == roundtrip.coords[coord].attrs @parametrize_over_hdf_backends @pytest.mark.parametrize("decode_times,time_vars", [(False, []), (True, ["time"])]) def test_kerchunk_roundtrip_concat( - self, tmpdir, roundtrip_func, hdf_backend, decode_times, time_vars + self, + tmp_path: Path, + roundtrip_func: RoundtripFunction, + hdf_backend: type[VirtualBackend], + decode_times: bool, + time_vars: list[str], ): # set up example xarray dataset - ds = xr.tutorial.open_dataset("air_temperature", decode_times=decode_times) - - # split into two datasets - ds1, ds2 = ds.isel(time=slice(None, 1460)), ds.isel(time=slice(1460, None)) - - # save it to disk as netCDF (in temporary directory) - ds1.to_netcdf(f"{tmpdir}/air1.nc") - ds2.to_netcdf(f"{tmpdir}/air2.nc") - - # use open_dataset_via_kerchunk to read it as references - vds1 = open_virtual_dataset( - f"{tmpdir}/air1.nc", - indexes={}, - loadable_variables=time_vars, - backend=hdf_backend, - ) - vds2 = open_virtual_dataset( - f"{tmpdir}/air2.nc", - indexes={}, - loadable_variables=time_vars, - backend=hdf_backend, - ) - - if decode_times is False: - assert vds1.time.dtype == np.dtype("float32") - else: - assert vds1.time.dtype == np.dtype(" str: + filepath = str(tmp_path / "nested_group.nc") + + with h5py.File(filepath, "w") as f: + g = f.create_group("group") + data = np.random.random((10, 10)) + g.create_dataset("data", data=data) + g.create_group("nested_group") + return filepath @pytest.fixture -def multiple_datasets_hdf5_file(tmpdir): - filepath = f"{tmpdir}/multiple_datasets.nc" - f = h5py.File(filepath, "w") - data = np.random.random((10, 10)) - f.create_dataset(name="data", data=data, chunks=None) - f.create_dataset(name="data2", data=data, chunks=None) +def multiple_datasets_hdf5_file(tmp_path: Path) -> str: + filepath = str(tmp_path / "multiple_datasets.nc") + + with h5py.File(filepath, "w") as f: + data = np.random.random((10, 10)) + f.create_dataset(name="data", data=data, chunks=None) + f.create_dataset(name="data2", data=data, chunks=None) + return filepath @@ -159,44 +163,56 @@ def np_uncompressed(): @pytest.fixture(params=["gzip", "blosc_lz4", "lz4", "bzip2", "zstd", "shuffle"]) -def filter_encoded_hdf5_file(tmpdir, np_uncompressed, request): - filepath = f"{tmpdir}/{request.param}.nc" - f = h5py.File(filepath, "w") - if request.param == "gzip": - f.create_dataset( - name="data", data=np_uncompressed, compression="gzip", compression_opts=1 - ) - if request.param == "blosc_lz4": - f.create_dataset( - name="data", - data=np_uncompressed, - **hdf5plugin.Blosc(cname="lz4", clevel=9, shuffle=hdf5plugin.Blosc.SHUFFLE), - ) - if request.param == "lz4": - f.create_dataset(name="data", data=np_uncompressed, **hdf5plugin.LZ4(nbytes=0)) - if request.param == "bzip2": - f.create_dataset(name="data", data=np_uncompressed, **hdf5plugin.BZip2()) - if request.param == "zstd": - f.create_dataset(name="data", data=np_uncompressed, **hdf5plugin.Zstd(clevel=2)) - if request.param == "shuffle": - f.create_dataset(name="data", data=np_uncompressed, shuffle=True) +def filter_encoded_hdf5_file(tmp_path: Path, np_uncompressed, request) -> str: + assert hdf5plugin is not None # make type-checkers happy + filepath = str(tmp_path / f"{request.param}.nc") + + with h5py.File(filepath, "w") as f: + if request.param == "gzip": + f.create_dataset( + name="data", + data=np_uncompressed, + compression="gzip", + compression_opts=1, + ) + if request.param == "blosc_lz4": + f.create_dataset( + name="data", + data=np_uncompressed, + **hdf5plugin.Blosc( + cname="lz4", clevel=9, shuffle=hdf5plugin.Blosc.SHUFFLE + ), + ) + if request.param == "lz4": + f.create_dataset( + name="data", data=np_uncompressed, **hdf5plugin.LZ4(nbytes=0) + ) + if request.param == "bzip2": + f.create_dataset(name="data", data=np_uncompressed, **hdf5plugin.BZip2()) + if request.param == "zstd": + f.create_dataset( + name="data", data=np_uncompressed, **hdf5plugin.Zstd(clevel=2) + ) + if request.param == "shuffle": + f.create_dataset(name="data", data=np_uncompressed, shuffle=True) return filepath @pytest.fixture(params=["gzip"]) -def filter_encoded_roundtrip_hdf5_file(tmpdir, request): - ds = xr.tutorial.open_dataset("air_temperature") - encoding = {} - if request.param == "gzip": - encoding_config = {"zlib": True, "complevel": 1} +def filter_encoded_roundtrip_hdf5_file(tmp_path: Path, request) -> str: + with xr.tutorial.open_dataset("air_temperature") as ds: + encoding = {} + if request.param == "gzip": + encoding_config = {"zlib": True, "complevel": 1} - for var_name in ds.variables: - encoding[var_name] = encoding_config + for var_name in ds.variables: + encoding[var_name] = encoding_config - filepath = f"{tmpdir}/{request.param}_xarray.nc" - ds.to_netcdf(filepath, engine="h5netcdf", encoding=encoding) - return filepath + filepath = tmp_path / f"{request.param}_xarray.nc" + ds.to_netcdf(filepath, engine="h5netcdf", encoding=encoding) + + return str(filepath) @pytest.fixture() @@ -240,12 +256,14 @@ def offset(): @pytest.fixture -def add_offset_hdf5_file(tmpdir, np_uncompressed_int16, offset): - filepath = f"{tmpdir}/offset.nc" - f = h5py.File(filepath, "w") - data = np_uncompressed_int16 - offset - f.create_dataset(name="data", data=data, chunks=True) - f["data"].attrs.create(name="add_offset", data=offset) +def add_offset_hdf5_file(tmp_path: Path, np_uncompressed_int16, offset) -> str: + filepath = str(tmp_path / "offset.nc") + + with h5py.File(filepath, "w") as f: + data = np_uncompressed_int16 - offset + f.create_dataset(name="data", data=data, chunks=True) + f["data"].attrs.create(name="add_offset", data=offset) + return filepath @@ -255,13 +273,17 @@ def scale_factor(): @pytest.fixture -def scale_add_offset_hdf5_file(tmpdir, np_uncompressed_int16, offset, scale_factor): - filepath = f"{tmpdir}/scale_offset.nc" - f = h5py.File(filepath, "w") - data = (np_uncompressed_int16 - offset) / scale_factor - f.create_dataset(name="data", data=data, chunks=True) - f["data"].attrs.create(name="add_offset", data=offset) - f["data"].attrs.create(name="scale_factor", data=np.array([scale_factor])) +def scale_add_offset_hdf5_file( + tmp_path: Path, np_uncompressed_int16, offset, scale_factor +) -> str: + filepath = str(tmp_path / "scale_offset.nc") + + with h5py.File(filepath, "w") as f: + data = (np_uncompressed_int16 - offset) / scale_factor + f.create_dataset(name="data", data=data, chunks=True) + f["data"].attrs.create(name="add_offset", data=offset) + f["data"].attrs.create(name="scale_factor", data=np.array([scale_factor])) + return filepath @@ -314,19 +336,21 @@ def filter_and_cf_roundtrip_hdf5_file(tmpdir, request): @pytest.fixture -def root_coordinates_hdf5_file(tmpdir, np_uncompressed_int16): - filepath = f"{tmpdir}/coordinates.nc" - f = h5py.File(filepath, "w") - data = np.random.random((100, 100)) - f.create_dataset(name="data", data=data, chunks=True) - f.create_dataset(name="lat", data=data) - f.create_dataset(name="lon", data=data) - f.attrs.create(name="coordinates", data="lat lon") +def root_coordinates_hdf5_file(tmp_path: Path, np_uncompressed_int16) -> str: + filepath = str(tmp_path / "coordinates.nc") + + with h5py.File(filepath, "w") as f: + data = np.random.random((100, 100)) + f.create_dataset(name="data", data=data, chunks=True) + f.create_dataset(name="lat", data=data) + f.create_dataset(name="lon", data=data) + f.attrs.create(name="coordinates", data="lat lon") + return filepath @pytest.fixture -def netcdf3_file(tmp_path: pathlib.Path) -> pathlib.Path: +def netcdf3_file(tmp_path: Path) -> Path: ds = xr.Dataset({"foo": ("x", np.array([1, 2, 3]))}) filepath = tmp_path / "file.nc" @@ -345,12 +369,14 @@ def non_coord_dim(tmpdir): @pytest.fixture -def scalar_fill_value_hdf5_file(tmpdir): - filepath = f"{tmpdir}/scalar_fill_value.nc" - f = h5py.File(filepath, "w") - data = np.random.randint(0, 10, size=(5)) - fill_value = 42 - f.create_dataset(name="data", data=data, chunks=True, fillvalue=fill_value) +def scalar_fill_value_hdf5_file(tmp_path: Path) -> str: + filepath = str(tmp_path / "scalar_fill_value.nc") + + with h5py.File(filepath, "w") as f: + data = np.random.randint(0, 10, size=(5)) + fill_value = 42 + f.create_dataset(name="data", data=data, chunks=True, fillvalue=fill_value) + return filepath @@ -382,24 +408,28 @@ def scalar_fill_value_hdf5_file(tmpdir): @pytest.fixture(params=fill_values) -def cf_fill_value_hdf5_file(tmpdir, request): - filepath = f"{tmpdir}/cf_fill_value.nc" - f = h5py.File(filepath, "w") - dset = f.create_dataset(name="data", data=request.param["data"], chunks=True) - dim_scale = f.create_dataset( - name="dim_scale", data=request.param["data"], chunks=True - ) - dim_scale.make_scale() - dset.dims[0].attach_scale(dim_scale) - dset.attrs["_FillValue"] = request.param["fill_value"] +def cf_fill_value_hdf5_file(tmp_path: Path, request) -> str: + filepath = str(tmp_path / "cf_fill_value.nc") + + with h5py.File(filepath, "w") as f: + dset = f.create_dataset(name="data", data=request.param["data"], chunks=True) + dim_scale = f.create_dataset( + name="dim_scale", data=request.param["data"], chunks=True + ) + dim_scale.make_scale() + dset.dims[0].attach_scale(dim_scale) + dset.attrs["_FillValue"] = request.param["fill_value"] + return filepath @pytest.fixture -def cf_array_fill_value_hdf5_file(tmpdir): - filepath = f"{tmpdir}/cf_array_fill_value.nc" - f = h5py.File(filepath, "w") - data = np.random.random(5) - dset = f.create_dataset(name="data", data=data, chunks=True) - dset.attrs["_FillValue"] = np.array([np.nan]) +def cf_array_fill_value_hdf5_file(tmp_path: Path) -> str: + filepath = str(tmp_path / "cf_array_fill_value.nc") + + with h5py.File(filepath, "w") as f: + data = np.random.random(5) + dset = f.create_dataset(name="data", data=data, chunks=True) + dset.attrs["_FillValue"] = np.array([np.nan]) + return filepath diff --git a/virtualizarr/tests/test_readers/test_hdf/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf/test_hdf_integration.py index e1407611a..23fa3949b 100644 --- a/virtualizarr/tests/test_readers/test_hdf/test_hdf_integration.py +++ b/virtualizarr/tests/test_readers/test_hdf/test_hdf_integration.py @@ -36,10 +36,10 @@ def test_filters_h5netcdf_roundtrip( ): kerchunk_file = str(tmp_path / "kerchunk.json") vds.virtualize.to_kerchunk(kerchunk_file, format="json") - roundtrip = xr.open_dataset( + with xr.open_dataset( kerchunk_file, engine="kerchunk", decode_times=True - ) - xrt.assert_allclose(ds, roundtrip) + ) as roundtrip: + xrt.assert_allclose(ds, roundtrip) def test_filters_netcdf4_roundtrip( self, tmp_path, filter_encoded_roundtrip_netcdf4_file @@ -53,8 +53,8 @@ def test_filters_netcdf4_roundtrip( ): kerchunk_file = str(tmp_path / "kerchunk.json") vds.virtualize.to_kerchunk(kerchunk_file, format="json") - roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk") - xrt.assert_equal(ds, roundtrip) + with xr.open_dataset(kerchunk_file, engine="kerchunk") as roundtrip: + xrt.assert_equal(ds, roundtrip) def test_filter_and_cf_roundtrip(self, tmp_path, filter_and_cf_roundtrip_hdf5_file): with ( @@ -65,12 +65,12 @@ def test_filter_and_cf_roundtrip(self, tmp_path, filter_and_cf_roundtrip_hdf5_fi ): kerchunk_file = str(tmp_path / "filter_cf_kerchunk.json") vds.virtualize.to_kerchunk(kerchunk_file, format="json") - roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk") - xrt.assert_allclose(ds, roundtrip) - assert ( - ds["temperature"].encoding["_FillValue"] - == roundtrip["temperature"].encoding["_FillValue"] - ) + with xr.open_dataset(kerchunk_file, engine="kerchunk") as roundtrip: + xrt.assert_allclose(ds, roundtrip) + assert ( + ds["temperature"].encoding["_FillValue"] + == roundtrip["temperature"].encoding["_FillValue"] + ) def test_non_coord_dim_roundtrip(self, tmp_path, non_coord_dim): with ( @@ -81,8 +81,8 @@ def test_non_coord_dim_roundtrip(self, tmp_path, non_coord_dim): ): kerchunk_file = str(tmp_path / "kerchunk.json") vds.virtualize.to_kerchunk(kerchunk_file, format="json") - roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk") - xrt.assert_equal(ds, roundtrip) + with xr.open_dataset(kerchunk_file, engine="kerchunk") as roundtrip: + xrt.assert_equal(ds, roundtrip) @requires_icechunk def test_cf_fill_value_roundtrip(self, tmp_path, cf_fill_value_hdf5_file): diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py index fd856c060..7884a5eb5 100644 --- a/virtualizarr/tests/test_xarray.py +++ b/virtualizarr/tests/test_xarray.py @@ -6,6 +6,7 @@ from xarray import open_dataset from virtualizarr import open_virtual_dataset +from virtualizarr.backend import VirtualBackend from virtualizarr.manifests import ChunkManifest, ManifestArray from virtualizarr.tests import ( parametrize_over_hdf_backends, @@ -234,99 +235,110 @@ def test_concat_dim_coords_along_existing_dim(self): @requires_imagecodecs @parametrize_over_hdf_backends class TestCombineUsingIndexes: - def test_combine_by_coords(self, netcdf4_files_factory: Callable, hdf_backend): + def test_combine_by_coords( + self, + netcdf4_files_factory: Callable[[], tuple[str, str]], + hdf_backend: type[VirtualBackend], + ): filepath1, filepath2 = netcdf4_files_factory() - vds1 = open_virtual_dataset( - filepath1, backend=hdf_backend, loadable_variables=["time", "lat", "lon"] - ) - vds2 = open_virtual_dataset( - filepath2, backend=hdf_backend, loadable_variables=["time", "lat", "lon"] - ) - - combined_vds = xr.combine_by_coords( - [vds2, vds1], - ) - - assert combined_vds.xindexes["time"].to_pandas_index().is_monotonic_increasing + with ( + open_virtual_dataset( + filepath1, + backend=hdf_backend, + loadable_variables=["time", "lat", "lon"], + ) as vds1, + open_virtual_dataset( + filepath2, + backend=hdf_backend, + loadable_variables=["time", "lat", "lon"], + ) as vds2, + ): + combined_vds = xr.combine_by_coords( + [vds2, vds1], + ) + + assert ( + combined_vds.xindexes["time"].to_pandas_index().is_monotonic_increasing + ) @pytest.mark.xfail(reason="Not yet implemented, see issue #18") - def test_combine_by_coords_keeping_manifestarrays(self, netcdf4_files, hdf_backend): - filepath1, filepath2 = netcdf4_files + def test_combine_by_coords_keeping_manifestarrays( + self, + netcdf4_files_factory: Callable[[], tuple[str, str]], + hdf_backend: type[VirtualBackend], + ): + filepath1, filepath2 = netcdf4_files_factory() - vds1 = open_virtual_dataset(filepath1, backend=hdf_backend) - vds2 = open_virtual_dataset(filepath2, backend=hdf_backend) + with ( + open_virtual_dataset(filepath1, backend=hdf_backend) as vds1, + open_virtual_dataset(filepath2, backend=hdf_backend) as vds2, + ): + combined_vds = xr.combine_by_coords([vds2, vds1]) - combined_vds = xr.combine_by_coords( - [vds2, vds1], - ) - - assert isinstance(combined_vds["time"].data, ManifestArray) - assert isinstance(combined_vds["lat"].data, ManifestArray) - assert isinstance(combined_vds["lon"].data, ManifestArray) + assert isinstance(combined_vds["time"].data, ManifestArray) + assert isinstance(combined_vds["lat"].data, ManifestArray) + assert isinstance(combined_vds["lon"].data, ManifestArray) @parametrize_over_hdf_backends class TestRenamePaths: def test_rename_to_str(self, netcdf4_file, hdf_backend): - vds = open_virtual_dataset(netcdf4_file, indexes={}, backend=hdf_backend) - renamed_vds = vds.virtualize.rename_paths("s3://bucket/air.nc") - assert ( - renamed_vds["air"].data.manifest.dict()["0.0.0"]["path"] - == "s3://bucket/air.nc" - ) + with open_virtual_dataset(netcdf4_file, indexes={}, backend=hdf_backend) as vds: + renamed_vds = vds.virtualize.rename_paths("s3://bucket/air.nc") + assert ( + renamed_vds["air"].data.manifest.dict()["0.0.0"]["path"] + == "s3://bucket/air.nc" + ) def test_rename_using_function(self, netcdf4_file, hdf_backend): - vds = open_virtual_dataset(netcdf4_file, indexes={}, backend=hdf_backend) - def local_to_s3_url(old_local_path: str) -> str: from pathlib import Path new_s3_bucket_url = "s3://bucket/" - filename = Path(old_local_path).name return str(new_s3_bucket_url + filename) - renamed_vds = vds.virtualize.rename_paths(local_to_s3_url) - assert ( - renamed_vds["air"].data.manifest.dict()["0.0.0"]["path"] - == "s3://bucket/air.nc" - ) + with open_virtual_dataset(netcdf4_file, indexes={}, backend=hdf_backend) as vds: + renamed_vds = vds.virtualize.rename_paths(local_to_s3_url) + assert ( + renamed_vds["air"].data.manifest.dict()["0.0.0"]["path"] + == "s3://bucket/air.nc" + ) def test_invalid_type(self, netcdf4_file, hdf_backend): - vds = open_virtual_dataset(netcdf4_file, indexes={}, backend=hdf_backend) - - with pytest.raises(TypeError): - vds.virtualize.rename_paths(["file1.nc", "file2.nc"]) + with open_virtual_dataset(netcdf4_file, indexes={}, backend=hdf_backend) as vds: + with pytest.raises(TypeError): + vds.virtualize.rename_paths(["file1.nc", "file2.nc"]) @requires_hdf5plugin @requires_imagecodecs def test_mixture_of_manifestarrays_and_numpy_arrays( self, netcdf4_file, hdf_backend ): - vds = open_virtual_dataset( + with open_virtual_dataset( netcdf4_file, indexes={}, loadable_variables=["lat", "lon"], backend=hdf_backend, - ) - renamed_vds = vds.virtualize.rename_paths("s3://bucket/air.nc") - assert ( - renamed_vds["air"].data.manifest.dict()["0.0.0"]["path"] - == "s3://bucket/air.nc" - ) - assert isinstance(renamed_vds["lat"].data, np.ndarray) + ) as vds: + renamed_vds = vds.virtualize.rename_paths("s3://bucket/air.nc") + assert ( + renamed_vds["air"].data.manifest.dict()["0.0.0"]["path"] + == "s3://bucket/air.nc" + ) + assert isinstance(renamed_vds["lat"].data, np.ndarray) @requires_hdf5plugin @requires_imagecodecs def test_nbytes(simple_netcdf4): - vds = open_virtual_dataset(simple_netcdf4) - assert vds.virtualize.nbytes == 32 - assert vds.nbytes == 48 + with open_virtual_dataset(simple_netcdf4) as vds: + assert vds.virtualize.nbytes == 32 + assert vds.nbytes == 48 - vds = open_virtual_dataset(simple_netcdf4, loadable_variables=["foo"]) - assert vds.virtualize.nbytes == 48 + with open_virtual_dataset(simple_netcdf4, loadable_variables=["foo"]) as vds: + assert vds.virtualize.nbytes == 48 - ds = open_dataset(simple_netcdf4) - assert ds.virtualize.nbytes == ds.nbytes + with open_dataset(simple_netcdf4) as ds: + assert ds.virtualize.nbytes == ds.nbytes