Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,11 @@ def pytest_addoption(parser):
action="store_true",
help="runs tests requiring a network connection",
)
parser.addoption(
"--run-slow-tests",
action="store_true",
help="runs slow tests",
)
parser.addoption(
"--run-minio-tests",
action="store_true",
Expand All @@ -44,6 +49,8 @@ def pytest_runtest_setup(item):
)
if "minio" in item.keywords and not item.config.getoption("--run-minio-tests"):
pytest.skip("set --run-minio-tests to run tests requiring docker and minio")
if "slow" in item.keywords and not item.config.getoption("--run-slow-tests"):
pytest.skip("set --run-slow-tests to run slow tests")


def _xarray_subset():
Expand Down
2 changes: 1 addition & 1 deletion docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ vds = open_virtual_mfdataset(urls, parser = parser, registry = registry)
print(vds)
```

The magic of VirtualiZarr is that you can persist the virtual dataset to disk in a chunk references format such as [Icechunk][https://icechunk.io/],
The magic of VirtualiZarr is that you can persist the virtual dataset to disk in a chunk references format such as [Icechunk](https://icechunk.io/),
meaning that the work of constructing the single coherent dataset only needs to happen once.
For subsequent data access, you can use [xarray.open_zarr][] to open that Icechunk store, which on object storage is
far faster than using [xarray.open_mfdataset][] to open the original non-cloud-optimized files.
Expand Down
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,7 @@ run-mypy = { cmd = "mypy virtualizarr" }
# Using '--dist loadscope' (rather than default of '--dist load' when '-n auto'
# is used), reduces test hangs that appear to be macOS-related.
run-tests = { cmd = "pytest -n auto --dist loadscope --run-network-tests --verbose --durations=10" }
run-tests-including-slow = { cmd = "pytest -n auto --dist loadscope --run-network-tests --run-slow-tests --verbose --durations=10" }
run-tests-no-network = { cmd = "pytest -n auto --verbose" }
run-tests-cov = { cmd = "pytest -n auto --run-network-tests --verbose --cov=virtualizarr --cov=term-missing" }
run-tests-xml-cov = { cmd = "pytest -n auto --run-network-tests --verbose --cov=virtualizarr --cov-report=xml" }
Expand Down Expand Up @@ -224,6 +225,7 @@ show_error_codes = true
module = [
"docker",
"fsspec.*",
"s3fs.*",
"h5py",
"kerchunk.*",
"minio",
Expand Down Expand Up @@ -305,6 +307,7 @@ markers = [
# this warning: "PytestUnknownMarkWarning: Unknown pytest.mark.flaky"
"flaky: flaky tests",
"network: marks test requiring internet (select with '--run-network-tests')",
"slow: marks test as slow (select with '--run-slow-tests')",
"minio: marks test requiring docker and minio (select with '--run-minio-tests')",
]
filterwarnings = [
Expand Down
16 changes: 15 additions & 1 deletion virtualizarr/codecs.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,11 @@
]


def numcodec_config_to_configurable(num_codec: dict) -> dict:
def zarr_codec_config_to_v3(num_codec: dict) -> dict:
"""
Convert a numcodecs codec into a zarr v3 configurable.
"""
# TODO: Special case Blosc codec
if num_codec["id"].startswith("numcodecs."):
return num_codec

Expand All @@ -32,6 +33,19 @@ def numcodec_config_to_configurable(num_codec: dict) -> dict:
return {"name": name, "configuration": num_codec_copy}


def zarr_codec_config_to_v2(num_codec: dict) -> dict:
    """
    Convert a Zarr V3 "configurable" codec dict into a Zarr V2 / numcodecs-style dict.

    A V3 dict ``{"name": ..., "configuration": {...}}`` becomes
    ``{"id": name, **configuration}``.  A dict that already carries an ``"id"``
    key is assumed to be a valid V2 config and is returned unchanged.

    Parameters
    ----------
    num_codec
        Codec configuration in either Zarr V2 or Zarr V3 form.

    Returns
    -------
    dict
        Codec configuration in Zarr V2 (numcodecs) form.

    Raises
    ------
    ValueError
        If the dict has neither a ``"name"`` (V3) nor an ``"id"`` (V2) key.
    """
    # TODO: Special case Blosc codec
    if name := num_codec.get("name", None):
        # V3 codecs may omit "configuration" (or set it to None) when the codec
        # takes no parameters; treat that as an empty config rather than KeyError.
        return {"id": name, **(num_codec.get("configuration") or {})}
    elif num_codec.get("id", None):
        return num_codec
    else:
        raise ValueError(f"Expected a valid Zarr V2 or V3 codec dict, got {num_codec}")


def extract_codecs(
codecs: CodecPipeline,
) -> DeconstructedCodecPipeline:
Expand Down
6 changes: 2 additions & 4 deletions virtualizarr/parsers/hdf/hdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

import numpy as np

from virtualizarr.codecs import numcodec_config_to_configurable
from virtualizarr.codecs import zarr_codec_config_to_v3
from virtualizarr.manifests import (
ChunkEntry,
ChunkManifest,
Expand Down Expand Up @@ -71,9 +71,7 @@ def _construct_manifest_array(
encoded_cf_fill_value = encode_cf_fill_value(attrs["_FillValue"], dtype)
attrs["_FillValue"] = encoded_cf_fill_value

codec_configs = [
numcodec_config_to_configurable(codec.get_config()) for codec in codecs
]
codec_configs = [zarr_codec_config_to_v3(codec.get_config()) for codec in codecs]

fill_value = dataset.fillvalue.item()
dims = tuple(_dataset_dims(dataset, group=group))
Expand Down
6 changes: 2 additions & 4 deletions virtualizarr/parsers/kerchunk/translator.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from zarr.core.metadata import ArrayV3Metadata

from virtualizarr.codecs import (
numcodec_config_to_configurable,
zarr_codec_config_to_v3,
)
from virtualizarr.manifests import (
ChunkManifest,
Expand Down Expand Up @@ -65,9 +65,7 @@ def from_kerchunk_refs(decoded_arr_refs_zarray, zattrs) -> "ArrayV3Metadata":

# Ensure compressor is a list before unpacking
codec_configs = [*filters, *(compressor if compressor is not None else [])]
numcodec_configs = [
numcodec_config_to_configurable(config) for config in codec_configs
]
numcodec_configs = [zarr_codec_config_to_v3(config) for config in codec_configs]
dimension_names = decoded_arr_refs_zarray["dimension_names"]
return create_v3_array_metadata(
chunk_shape=tuple(decoded_arr_refs_zarray["chunks"]),
Expand Down
1 change: 1 addition & 0 deletions virtualizarr/tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

requires_network = pytest.mark.network
requires_minio = pytest.mark.minio
slow_test = pytest.mark.slow


def _importorskip(
Expand Down
36 changes: 35 additions & 1 deletion virtualizarr/tests/test_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import pytest
import xarray as xr
import xarray.testing as xrt
from obstore.store import LocalStore
from obstore.store import LocalStore, from_url

from conftest import ARRAYBYTES_CODEC, ZLIB_CODEC
from virtualizarr import open_virtual_dataset
Expand All @@ -25,7 +25,9 @@
has_icechunk,
has_kerchunk,
requires_kerchunk,
requires_network,
requires_zarr_python,
slow_test,
)

icechunk = pytest.importorskip("icechunk")
Expand Down Expand Up @@ -519,3 +521,35 @@ def test_convert_relative_paths_to_urls(self, netcdf4_file, local_registry):
path = manifest["0.0.0"]["path"]

assert path == expected_path


@requires_kerchunk
@requires_network
@slow_test
def test_roundtrip_dataset_with_multiple_compressors():
    # Regression test: a dataset whose chunks carry both a compression codec and
    # a shuffle codec must survive the virtualize -> kerchunk -> xarray round trip.
    # TODO: Simplify this test to not require network access
    import s3fs

    s3_bucket = "s3://nex-gddp-cmip6"
    object_key = "NEX-GDDP-CMIP6/ACCESS-CM2/ssp126/r1i1p1f1/tasmax/tasmax_day_ACCESS-CM2_ssp126_r1i1p1f1_gn_2015_v2.0.nc"
    file_url = f"{s3_bucket}/{object_key}"

    # Build a virtual dataset over the remote file using anonymous S3 access.
    obj_store = from_url(s3_bucket, region="us-west-2", skip_signature=True)
    store_registry = ObjectStoreRegistry({s3_bucket: obj_store})
    virtual = open_virtual_dataset(
        url=file_url,
        parser=HDFParser(),
        registry=store_registry,
        loadable_variables=[],
    )
    kerchunk_refs = virtual.vz.to_kerchunk(format="dict")

    # Compare the kerchunk-reference view against reading the file directly.
    filesystem = s3fs.S3FileSystem(anon=True)
    direct = xr.open_dataset(
        filesystem.open(file_url), engine="h5netcdf", decode_times=True
    )
    via_refs = xr.open_dataset(
        kerchunk_refs,
        decode_times=True,
        engine="kerchunk",
        storage_options={"remote_options": {"anon": True}},
    )
    with direct as expected, via_refs as observed:
        xr.testing.assert_allclose(expected, observed)
4 changes: 3 additions & 1 deletion virtualizarr/tests/test_parsers/test_dmrpp.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from virtualizarr.parsers import DMRPPParser, HDFParser
from virtualizarr.parsers.dmrpp import DMRParser
from virtualizarr.registry import ObjectStoreRegistry
from virtualizarr.tests import requires_network
from virtualizarr.tests import requires_network, slow_test
from virtualizarr.tests.utils import obstore_local, obstore_s3
from virtualizarr.xarray import open_virtual_dataset

Expand Down Expand Up @@ -346,6 +346,7 @@ def dmrparser(dmrpp_xml_str: str, filepath: str) -> DMRParser:
return DMRParser(root=ET.fromstring(dmrpp_xml_str), data_filepath=filepath)


@slow_test
@requires_network
@pytest.mark.parametrize("data_url, dmrpp_url", urls)
def test_NASA_dmrpp(data_url, dmrpp_url):
Expand Down Expand Up @@ -373,6 +374,7 @@ def test_NASA_dmrpp(data_url, dmrpp_url):


@requires_network
@slow_test
@pytest.mark.parametrize("data_url, dmrpp_url", urls)
def test_NASA_dmrpp_load(data_url, dmrpp_url):
store = obstore_s3(
Expand Down
2 changes: 1 addition & 1 deletion virtualizarr/tests/test_writers/test_kerchunk.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ def testconvert_v3_to_v2_metadata(array_v3_metadata):
assert v2_metadata.dtype.to_native_dtype() == np.dtype("int32")
assert v2_metadata.chunks == chunks
assert v2_metadata.fill_value == 0
compressor_config = v2_metadata.compressor.get_config()
compressor_config = v2_metadata.filters[1].get_config()
assert compressor_config["id"] == "blosc"
assert compressor_config["cname"] == "zstd"
assert compressor_config["clevel"] == 5
Expand Down
8 changes: 5 additions & 3 deletions virtualizarr/tests/test_xarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
requires_imagecodecs,
requires_lithops,
requires_network,
slow_test,
)
from virtualizarr.tests.utils import obstore_http, obstore_s3

Expand Down Expand Up @@ -576,6 +577,7 @@ def test_var_attr_coords(self, netcdf4_file_with_2d_coords, local_registry):

@requires_network
class TestReadRemote:
@slow_test
@pytest.mark.parametrize(
"indexes",
[
Expand Down Expand Up @@ -604,14 +606,14 @@ def test_anon_read_s3(self, indexes):
for name in ["time", "lat", "lon"]:
assert isinstance(vds[name].data, np.ndarray)

@pytest.mark.skip(reason="often times out, as nisar file is 200MB")
@slow_test
def test_virtualizarr_vs_local_nisar(self):
# Open group directly from locally cached file with xarray
url = "https://nisar.asf.earthdatacloud.nasa.gov/NISAR-SAMPLE-DATA/GCOV/ALOS1_Rosamond_20081012/NISAR_L2_PR_GCOV_001_005_A_219_4020_SHNA_A_20081012T060910_20081012T060926_P01101_F_N_J_001.h5"
hdf_group = "science/LSAR/GCOV/grids/frequencyA"
store = obstore_http(url=url)
registry = ObjectStoreRegistry()
registry.register(url, registry)
registry.register(url, store)
drop_variables = ["listOfCovarianceTerms", "listOfPolarizations"]
parser = HDFParser(group=hdf_group, drop_variables=drop_variables)
with (
Expand All @@ -625,7 +627,7 @@ def test_virtualizarr_vs_local_nisar(self):
# save group reference file via virtualizarr, then open with engine="kerchunk"
open_virtual_dataset(
url=url,
object_store=store,
registry=registry,
parser=parser,
) as vds,
):
Expand Down
33 changes: 11 additions & 22 deletions virtualizarr/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,10 @@
from typing import TYPE_CHECKING, Any, Iterable, Mapping, Optional, Sequence, Union

import obstore as obs
from zarr.abc.codec import ArrayArrayCodec, BytesBytesCodec
from zarr.abc.codec import ArrayBytesCodec
from zarr.core.metadata import ArrayV2Metadata, ArrayV3Metadata

from virtualizarr.codecs import extract_codecs, get_codec_config
from virtualizarr.codecs import get_codec_config, zarr_codec_config_to_v2
from virtualizarr.types.kerchunk import KerchunkStoreRefs

# taken from zarr.core.common
Expand Down Expand Up @@ -132,32 +132,21 @@ def convert_v3_to_v2_metadata(
ArrayV2Metadata
The metadata object in v2 format.
"""
import warnings

array_filters: tuple[ArrayArrayCodec, ...]
bytes_compressors: tuple[BytesBytesCodec, ...]
array_filters, _, bytes_compressors = extract_codecs(v3_metadata.codecs)
# Handle compressor configuration
compressor_config: dict[str, Any] | None = None
if bytes_compressors:
if len(bytes_compressors) > 1:
warnings.warn(
"Multiple compressors found in v3 metadata. Using the first compressor, "
"others will be ignored. This may affect data compatibility.",
UserWarning,
)
compressor_config = get_codec_config(bytes_compressors[0])

# Handle filter configurations
filter_configs = [get_codec_config(filter_) for filter_ in array_filters]

# TODO: Find a more robust way to exclude any bytes codecs
# TODO: Test round-tripping big endian since that is stored in the bytes codec in V3; it should be included in data type instead for V2
v2_codecs = [
zarr_codec_config_to_v2(get_codec_config(codec))
for codec in v3_metadata.codecs
if not isinstance(codec, ArrayBytesCodec)
]
v2_metadata = ArrayV2Metadata(
shape=v3_metadata.shape,
dtype=v3_metadata.data_type,
chunks=v3_metadata.chunks,
fill_value=fill_value or v3_metadata.fill_value,
compressor=compressor_config,
filters=filter_configs,
filters=v2_codecs,
compressor=None,
order="C",
attributes=v3_metadata.attributes,
dimension_separator=".", # Assuming '.' as default dimension separator
Expand Down