diff --git a/conftest.py b/conftest.py
index e4a0c88d1..870f339ee 100644
--- a/conftest.py
+++ b/conftest.py
@@ -29,6 +29,11 @@ def pytest_addoption(parser):
         action="store_true",
         help="runs tests requiring a network connection",
     )
+    parser.addoption(
+        "--run-slow-tests",
+        action="store_true",
+        help="runs slow tests",
+    )
     parser.addoption(
         "--run-minio-tests",
         action="store_true",
@@ -44,6 +49,8 @@ def pytest_runtest_setup(item):
         )
     if "minio" in item.keywords and not item.config.getoption("--run-minio-tests"):
         pytest.skip("set --run-minio-tests to run tests requiring docker and minio")
+    if "slow" in item.keywords and not item.config.getoption("--run-slow-tests"):
+        pytest.skip("set --run-slow-tests to run slow tests")
 
 
 def _xarray_subset():
diff --git a/docs/index.md b/docs/index.md
index 2ae7e979f..24259c459 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -103,7 +103,7 @@ vds = open_virtual_mfdataset(urls, parser = parser, registry = registry)
 print(vds)
 ```
 
-The magic of VirtualiZarr is that you can persist the virtual dataset to disk in a chunk references format such as [Icechunk][https://icechunk.io/],
+The magic of VirtualiZarr is that you can persist the virtual dataset to disk in a chunk references format such as [Icechunk](https://icechunk.io/),
 meaning that the work of constructing the single coherent dataset only needs to happen once.
 For subsequent data access, you can use [xarray.open_zarr][] to open that Icechunk store, which on object storage is far faster than using
 [xarray.open_mfdataset][] to open the the original non-cloud-optimized files.
diff --git a/pyproject.toml b/pyproject.toml
index d4e488a35..f4bc86a31 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -178,6 +178,7 @@ run-mypy = { cmd = "mypy virtualizarr" }
 # Using '--dist loadscope' (rather than default of '--dist load' when '-n auto'
 # is used), reduces test hangs that appear to be macOS-related.
 run-tests = { cmd = "pytest -n auto --dist loadscope --run-network-tests --verbose --durations=10" }
+run-tests-including-slow = { cmd = "pytest -n auto --dist loadscope --run-network-tests --run-slow-tests --verbose --durations=10" }
 run-tests-no-network = { cmd = "pytest -n auto --verbose" }
 run-tests-cov = { cmd = "pytest -n auto --run-network-tests --verbose --cov=virtualizarr --cov=term-missing" }
 run-tests-xml-cov = { cmd = "pytest -n auto --run-network-tests --verbose --cov=virtualizarr --cov-report=xml" }
@@ -224,6 +225,7 @@ show_error_codes = true
 module = [
     "docker",
     "fsspec.*",
+    "s3fs.*",
     "h5py",
     "kerchunk.*",
     "minio",
@@ -305,6 +307,7 @@ markers = [
     # this warning: "PytestUnknownMarkWarning: Unknown pytest.mark.flaky"
     "flaky: flaky tests",
     "network: marks test requiring internet (select with '--run-network-tests')",
+    "slow: marks test as slow (select with '--run-slow-tests')",
     "minio: marks test requiring docker and minio (select with '--run-minio-tests')",
 ]
 filterwarnings = [
diff --git a/virtualizarr/codecs.py b/virtualizarr/codecs.py
index 343ea65fe..accd4dc94 100644
--- a/virtualizarr/codecs.py
+++ b/virtualizarr/codecs.py
@@ -20,10 +20,14 @@
 ]
 
 
-def numcodec_config_to_configurable(num_codec: dict) -> dict:
+def zarr_codec_config_to_v3(num_codec: dict) -> dict:
     """
     Convert a numcodecs codec into a zarr v3 configurable.
     """
+    # TODO: Special case Blosc codec
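+    # For example (illustrative): {"id": "zlib", "level": 1} is wrapped into
+    # the v3 {"name": ..., "configuration": ...} form, while configs whose id
+    # is already prefixed with "numcodecs." are returned unchanged.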
""" + # TODO: Special case Blosc codec if num_codec["id"].startswith("numcodecs."): return num_codec @@ -32,6 +33,19 @@ def numcodec_config_to_configurable(num_codec: dict) -> dict: return {"name": name, "configuration": num_codec_copy} +def zarr_codec_config_to_v2(num_codec: dict) -> dict: + """ + Convert a numcodecs codec into a zarr v2 configurable. + """ + # TODO: Special case Blosc codec + if name := num_codec.get("name", None): + return {"id": name, **num_codec["configuration"]} + elif num_codec.get("id", None): + return num_codec + else: + raise ValueError(f"Expected a valid Zarr V2 or V3 codec dict, got {num_codec}") + + def extract_codecs( codecs: CodecPipeline, ) -> DeconstructedCodecPipeline: diff --git a/virtualizarr/parsers/hdf/hdf.py b/virtualizarr/parsers/hdf/hdf.py index 2d94df862..c3968de64 100644 --- a/virtualizarr/parsers/hdf/hdf.py +++ b/virtualizarr/parsers/hdf/hdf.py @@ -8,7 +8,7 @@ import numpy as np -from virtualizarr.codecs import numcodec_config_to_configurable +from virtualizarr.codecs import zarr_codec_config_to_v3 from virtualizarr.manifests import ( ChunkEntry, ChunkManifest, @@ -71,9 +71,7 @@ def _construct_manifest_array( encoded_cf_fill_value = encode_cf_fill_value(attrs["_FillValue"], dtype) attrs["_FillValue"] = encoded_cf_fill_value - codec_configs = [ - numcodec_config_to_configurable(codec.get_config()) for codec in codecs - ] + codec_configs = [zarr_codec_config_to_v3(codec.get_config()) for codec in codecs] fill_value = dataset.fillvalue.item() dims = tuple(_dataset_dims(dataset, group=group)) diff --git a/virtualizarr/parsers/kerchunk/translator.py b/virtualizarr/parsers/kerchunk/translator.py index 98017ec00..7565858c9 100644 --- a/virtualizarr/parsers/kerchunk/translator.py +++ b/virtualizarr/parsers/kerchunk/translator.py @@ -9,7 +9,7 @@ from zarr.core.metadata import ArrayV3Metadata from virtualizarr.codecs import ( - numcodec_config_to_configurable, + zarr_codec_config_to_v3, ) from virtualizarr.manifests import ( ChunkManifest, @@ -65,9 +65,7 @@ def from_kerchunk_refs(decoded_arr_refs_zarray, zattrs) -> "ArrayV3Metadata": # Ensure compressor is a list before unpacking codec_configs = [*filters, *(compressor if compressor is not None else [])] - numcodec_configs = [ - numcodec_config_to_configurable(config) for config in codec_configs - ] + numcodec_configs = [zarr_codec_config_to_v3(config) for config in codec_configs] dimension_names = decoded_arr_refs_zarray["dimension_names"] return create_v3_array_metadata( chunk_shape=tuple(decoded_arr_refs_zarray["chunks"]), diff --git a/virtualizarr/tests/__init__.py b/virtualizarr/tests/__init__.py index f809214f1..9089136f8 100644 --- a/virtualizarr/tests/__init__.py +++ b/virtualizarr/tests/__init__.py @@ -5,6 +5,7 @@ requires_network = pytest.mark.network requires_minio = pytest.mark.minio +slow_test = pytest.mark.slow def _importorskip( diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py index e06350389..05cd62e11 100644 --- a/virtualizarr/tests/test_integration.py +++ b/virtualizarr/tests/test_integration.py @@ -7,7 +7,7 @@ import pytest import xarray as xr import xarray.testing as xrt -from obstore.store import LocalStore +from obstore.store import LocalStore, from_url from conftest import ARRAYBYTES_CODEC, ZLIB_CODEC from virtualizarr import open_virtual_dataset @@ -25,7 +25,9 @@ has_icechunk, has_kerchunk, requires_kerchunk, + requires_network, requires_zarr_python, + slow_test, ) icechunk = pytest.importorskip("icechunk") @@ -519,3 +521,35 @@ def 
+    if name := num_codec.get("name", None):
+        return {"id": name, **num_codec["configuration"]}
+    elif num_codec.get("id", None):
+        return num_codec
+    else:
+        raise ValueError(f"Expected a valid Zarr V2 or V3 codec dict, got {num_codec}")
+
+
 def extract_codecs(
     codecs: CodecPipeline,
 ) -> DeconstructedCodecPipeline:
diff --git a/virtualizarr/parsers/hdf/hdf.py b/virtualizarr/parsers/hdf/hdf.py
index 2d94df862..c3968de64 100644
--- a/virtualizarr/parsers/hdf/hdf.py
+++ b/virtualizarr/parsers/hdf/hdf.py
@@ -8,7 +8,7 @@
 
 import numpy as np
 
-from virtualizarr.codecs import numcodec_config_to_configurable
+from virtualizarr.codecs import zarr_codec_config_to_v3
 from virtualizarr.manifests import (
     ChunkEntry,
     ChunkManifest,
@@ -71,9 +71,7 @@ def _construct_manifest_array(
         encoded_cf_fill_value = encode_cf_fill_value(attrs["_FillValue"], dtype)
         attrs["_FillValue"] = encoded_cf_fill_value
 
-    codec_configs = [
-        numcodec_config_to_configurable(codec.get_config()) for codec in codecs
-    ]
+    codec_configs = [zarr_codec_config_to_v3(codec.get_config()) for codec in codecs]
 
     fill_value = dataset.fillvalue.item()
     dims = tuple(_dataset_dims(dataset, group=group))
diff --git a/virtualizarr/parsers/kerchunk/translator.py b/virtualizarr/parsers/kerchunk/translator.py
index 98017ec00..7565858c9 100644
--- a/virtualizarr/parsers/kerchunk/translator.py
+++ b/virtualizarr/parsers/kerchunk/translator.py
@@ -9,7 +9,7 @@
 from zarr.core.metadata import ArrayV3Metadata
 
 from virtualizarr.codecs import (
-    numcodec_config_to_configurable,
+    zarr_codec_config_to_v3,
 )
 from virtualizarr.manifests import (
     ChunkManifest,
@@ -65,9 +65,7 @@ def from_kerchunk_refs(decoded_arr_refs_zarray, zattrs) -> "ArrayV3Metadata":
 
     # Ensure compressor is a list before unpacking
     codec_configs = [*filters, *(compressor if compressor is not None else [])]
-    numcodec_configs = [
-        numcodec_config_to_configurable(config) for config in codec_configs
-    ]
+    numcodec_configs = [zarr_codec_config_to_v3(config) for config in codec_configs]
     dimension_names = decoded_arr_refs_zarray["dimension_names"]
     return create_v3_array_metadata(
         chunk_shape=tuple(decoded_arr_refs_zarray["chunks"]),
diff --git a/virtualizarr/tests/__init__.py b/virtualizarr/tests/__init__.py
index f809214f1..9089136f8 100644
--- a/virtualizarr/tests/__init__.py
+++ b/virtualizarr/tests/__init__.py
@@ -5,6 +5,7 @@
 
 requires_network = pytest.mark.network
 requires_minio = pytest.mark.minio
+slow_test = pytest.mark.slow
 
 
 def _importorskip(
diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py
index e06350389..05cd62e11 100644
--- a/virtualizarr/tests/test_integration.py
+++ b/virtualizarr/tests/test_integration.py
@@ -7,7 +7,7 @@
 import pytest
 import xarray as xr
 import xarray.testing as xrt
-from obstore.store import LocalStore
+from obstore.store import LocalStore, from_url
 
 from conftest import ARRAYBYTES_CODEC, ZLIB_CODEC
 from virtualizarr import open_virtual_dataset
@@ -25,7 +25,9 @@
     has_icechunk,
     has_kerchunk,
     requires_kerchunk,
+    requires_network,
     requires_zarr_python,
+    slow_test,
 )
 
 icechunk = pytest.importorskip("icechunk")
@@ -519,3 +521,35 @@ def test_convert_relative_paths_to_urls(self, netcdf4_file, local_registry):
         path = manifest["0.0.0"]["path"]
 
         assert path == expected_path
+
+
+@requires_kerchunk
+@requires_network
+@slow_test
+def test_roundtrip_dataset_with_multiple_compressors():
+    # Regression test to make sure we can load data with both a compression codec and a shuffle codec
+    # TODO: Simplify this test to not require network access
+    import s3fs
+
+    bucket = "s3://nex-gddp-cmip6"
+    path = "NEX-GDDP-CMIP6/ACCESS-CM2/ssp126/r1i1p1f1/tasmax/tasmax_day_ACCESS-CM2_ssp126_r1i1p1f1_gn_2015_v2.0.nc"
+    url = f"{bucket}/{path}"
+    store = from_url(bucket, region="us-west-2", skip_signature=True)
+    registry = ObjectStoreRegistry({bucket: store})
+    parser = HDFParser()
+    vds = open_virtual_dataset(
+        url=url, parser=parser, registry=registry, loadable_variables=[]
+    )
+
+    ds_refs = vds.vz.to_kerchunk(format="dict")
+    fs = s3fs.S3FileSystem(anon=True)
+    with (
+        xr.open_dataset(fs.open(url), engine="h5netcdf", decode_times=True) as expected,
+        xr.open_dataset(
+            ds_refs,
+            decode_times=True,
+            engine="kerchunk",
+            storage_options={"remote_options": {"anon": True}},
+        ) as observed,
+    ):
+        xr.testing.assert_allclose(expected, observed)
diff --git a/virtualizarr/tests/test_parsers/test_dmrpp.py b/virtualizarr/tests/test_parsers/test_dmrpp.py
index 69d834eee..857b3413d 100644
--- a/virtualizarr/tests/test_parsers/test_dmrpp.py
+++ b/virtualizarr/tests/test_parsers/test_dmrpp.py
@@ -10,7 +10,7 @@
 from virtualizarr.parsers import DMRPPParser, HDFParser
 from virtualizarr.parsers.dmrpp import DMRParser
 from virtualizarr.registry import ObjectStoreRegistry
-from virtualizarr.tests import requires_network
+from virtualizarr.tests import requires_network, slow_test
 from virtualizarr.tests.utils import obstore_local, obstore_s3
 from virtualizarr.xarray import open_virtual_dataset
 
@@ -346,6 +346,7 @@ def dmrparser(dmrpp_xml_str: str, filepath: str) -> DMRParser:
     return DMRParser(root=ET.fromstring(dmrpp_xml_str), data_filepath=filepath)
 
 
+@slow_test
 @requires_network
 @pytest.mark.parametrize("data_url, dmrpp_url", urls)
 def test_NASA_dmrpp(data_url, dmrpp_url):
@@ -373,6 +374,7 @@ def test_NASA_dmrpp(data_url, dmrpp_url):
 
 
 @requires_network
+@slow_test
 @pytest.mark.parametrize("data_url, dmrpp_url", urls)
 def test_NASA_dmrpp_load(data_url, dmrpp_url):
     store = obstore_s3(
diff --git a/virtualizarr/tests/test_writers/test_kerchunk.py b/virtualizarr/tests/test_writers/test_kerchunk.py
index 24985930e..4faf99273 100644
--- a/virtualizarr/tests/test_writers/test_kerchunk.py
+++ b/virtualizarr/tests/test_writers/test_kerchunk.py
@@ -214,7 +214,7 @@ def testconvert_v3_to_v2_metadata(array_v3_metadata):
     assert v2_metadata.dtype.to_native_dtype() == np.dtype("int32")
     assert v2_metadata.chunks == chunks
     assert v2_metadata.fill_value == 0
-    compressor_config = v2_metadata.compressor.get_config()
+    compressor_config = v2_metadata.filters[1].get_config()
     assert compressor_config["id"] == "blosc"
     assert compressor_config["cname"] == "zstd"
     assert compressor_config["clevel"] == 5
diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py
index 957271acc..00e964ae4 100644
--- a/virtualizarr/tests/test_xarray.py
+++ b/virtualizarr/tests/test_xarray.py
@@ -21,6 +21,7 @@
     requires_imagecodecs,
     requires_lithops,
     requires_network,
+    slow_test,
 )
 
 from virtualizarr.tests.utils import obstore_http, obstore_s3
@@ -576,6 +577,7 @@ def test_var_attr_coords(self, netcdf4_file_with_2d_coords, local_registry):
 
 @requires_network
 class TestReadRemote:
+    @slow_test
     @pytest.mark.parametrize(
         "indexes",
         [
@@ -604,14 +606,14 @@ def test_anon_read_s3(self, indexes):
         for name in ["time", "lat", "lon"]:
             assert isinstance(vds[name].data, np.ndarray)
 
-    @pytest.mark.skip(reason="often times out, as nisar file is 200MB")
+    @slow_test
    def test_virtualizarr_vs_local_nisar(self):
         # Open group directly from locally cached file with xarray
         url = "https://nisar.asf.earthdatacloud.nasa.gov/NISAR-SAMPLE-DATA/GCOV/ALOS1_Rosamond_20081012/NISAR_L2_PR_GCOV_001_005_A_219_4020_SHNA_A_20081012T060910_20081012T060926_P01101_F_N_J_001.h5"
         hdf_group = "science/LSAR/GCOV/grids/frequencyA"
         store = obstore_http(url=url)
         registry = ObjectStoreRegistry()
-        registry.register(url, registry)
+        registry.register(url, store)
         drop_variables = ["listOfCovarianceTerms", "listOfPolarizations"]
         parser = HDFParser(group=hdf_group, drop_variables=drop_variables)
         with (
@@ -625,7 +627,7 @@ def test_virtualizarr_vs_local_nisar(self):
             # save group reference file via virtualizarr, then open with engine="kerchunk"
             open_virtual_dataset(
                 url=url,
-                object_store=store,
+                registry=registry,
                 parser=parser,
             ) as vds,
         ):
diff --git a/virtualizarr/utils.py b/virtualizarr/utils.py
index d99aa3609..27d06ca7a 100644
--- a/virtualizarr/utils.py
+++ b/virtualizarr/utils.py
@@ -7,10 +7,10 @@
 from typing import TYPE_CHECKING, Any, Iterable, Mapping, Optional, Sequence, Union
 
 import obstore as obs
-from zarr.abc.codec import ArrayArrayCodec, BytesBytesCodec
+from zarr.abc.codec import ArrayBytesCodec
 from zarr.core.metadata import ArrayV2Metadata, ArrayV3Metadata
 
-from virtualizarr.codecs import extract_codecs, get_codec_config
+from virtualizarr.codecs import get_codec_config, zarr_codec_config_to_v2
 from virtualizarr.types.kerchunk import KerchunkStoreRefs
 
 # taken from zarr.core.common
@@ -132,32 +132,24 @@ def convert_v3_to_v2_metadata(
     ArrayV2Metadata
         The metadata object in v2 format.
     """
-    import warnings
-
-    array_filters: tuple[ArrayArrayCodec, ...]
-    bytes_compressors: tuple[BytesBytesCodec, ...]
-    array_filters, _, bytes_compressors = extract_codecs(v3_metadata.codecs)
-    # Handle compressor configuration
-    compressor_config: dict[str, Any] | None = None
-    if bytes_compressors:
-        if len(bytes_compressors) > 1:
-            warnings.warn(
-                "Multiple compressors found in v3 metadata. Using the first compressor, "
-                "others will be ignored. This may affect data compatibility.",
-                UserWarning,
-            )
-        compressor_config = get_codec_config(bytes_compressors[0])
-
-    # Handle filter configurations
-    filter_configs = [get_codec_config(filter_) for filter_ in array_filters]
+    # TODO: Find a more robust way to exclude any bytes codecs
+    # TODO: Test round-tripping big endian since that is stored in the bytes codec in V3; it should be included in the data type instead for V2
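+    # All codecs other than the array->bytes codec are emitted as v2 `filters`
+    # below, and `compressor` is intentionally set to None.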
+    v2_codecs = [
+        zarr_codec_config_to_v2(get_codec_config(codec))
+        for codec in v3_metadata.codecs
+        if not isinstance(codec, ArrayBytesCodec)
+    ]
     v2_metadata = ArrayV2Metadata(
         shape=v3_metadata.shape,
         dtype=v3_metadata.data_type,
         chunks=v3_metadata.chunks,
         fill_value=fill_value or v3_metadata.fill_value,
-        compressor=compressor_config,
-        filters=filter_configs,
+        filters=v2_codecs,
+        compressor=None,
         order="C",
         attributes=v3_metadata.attributes,
         dimension_separator=".",  # Assuming '.' as default dimension separator
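+        # NOTE: "." is also the separator kerchunk-style chunk keys use (e.g. "0.0.0")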