Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 33 additions & 1 deletion virtualizarr/tests/test_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import pytest
import xarray as xr
import xarray.testing as xrt
from obstore.store import LocalStore
from obstore.store import LocalStore, from_url

from conftest import ARRAYBYTES_CODEC, ZLIB_CODEC
from virtualizarr import open_virtual_dataset
Expand All @@ -25,6 +25,7 @@
has_icechunk,
has_kerchunk,
requires_kerchunk,
requires_network,
requires_zarr_python,
)

Expand Down Expand Up @@ -519,3 +520,34 @@ def test_convert_relative_paths_to_urls(self, netcdf4_file, local_registry):
path = manifest["0.0.0"]["path"]

assert path == expected_path


@requires_kerchunk
@requires_network
def test_roundtrip_dataset_with_multiple_compressors():
    # Regression test: data stored with both a compression codec and a shuffle
    # codec must survive a virtualize -> kerchunk -> open round trip.
    # TODO: Simplify this test to not require network access
    import s3fs

    bucket = "s3://nex-gddp-cmip6"
    path = "NEX-GDDP-CMIP6/ACCESS-CM2/ssp126/r1i1p1f1/tasmax/tasmax_day_ACCESS-CM2_ssp126_r1i1p1f1_gn_2015_v2.0.nc"
    url = "/".join((bucket, path))

    # Build a virtual dataset directly against the public (unsigned) bucket.
    s3_store = from_url(bucket, region="us-west-2", skip_signature=True)
    virtual_ds = open_virtual_dataset(
        url=url,
        parser=HDFParser(),
        registry=ObjectStoreRegistry({bucket: s3_store}),
        loadable_variables=[],
    )
    kerchunk_refs = virtual_ds.vz.to_kerchunk(format="dict")

    # Compare the kerchunk-backed dataset against a direct h5netcdf read.
    filesystem = s3fs.S3FileSystem(anon=True)
    direct = xr.open_dataset(filesystem.open(url), engine="h5netcdf", decode_times=True)
    roundtripped = xr.open_dataset(
        kerchunk_refs,
        decode_times=True,
        engine="kerchunk",
        storage_options={"remote_options": {"anon": True}},
    )
    with direct as expected, roundtripped as observed:
        xr.testing.assert_allclose(expected, observed)
2 changes: 1 addition & 1 deletion virtualizarr/tests/test_writers/test_kerchunk.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ def testconvert_v3_to_v2_metadata(array_v3_metadata):
assert v2_metadata.dtype.to_native_dtype() == np.dtype("int32")
assert v2_metadata.chunks == chunks
assert v2_metadata.fill_value == 0
compressor_config = v2_metadata.compressor.get_config()
compressor_config = v2_metadata.filters[1].get_config()
assert compressor_config["id"] == "blosc"
assert compressor_config["cname"] == "zstd"
assert compressor_config["clevel"] == 5
Expand Down
40 changes: 19 additions & 21 deletions virtualizarr/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,9 @@
from typing import TYPE_CHECKING, Any, Iterable, Mapping, Optional, Sequence, Union

import obstore as obs
from zarr.abc.codec import ArrayArrayCodec, BytesBytesCodec
from zarr.core.metadata import ArrayV2Metadata, ArrayV3Metadata

from virtualizarr.codecs import extract_codecs, get_codec_config
from virtualizarr.codecs import get_codec_config
from virtualizarr.types.kerchunk import KerchunkStoreRefs

# taken from zarr.core.common
Expand Down Expand Up @@ -132,32 +131,31 @@ def convert_v3_to_v2_metadata(
ArrayV2Metadata
The metadata object in v2 format.
"""
import warnings

array_filters: tuple[ArrayArrayCodec, ...]
bytes_compressors: tuple[BytesBytesCodec, ...]
array_filters, _, bytes_compressors = extract_codecs(v3_metadata.codecs)
# Handle compressor configuration
compressor_config: dict[str, Any] | None = None
if bytes_compressors:
if len(bytes_compressors) > 1:
warnings.warn(
"Multiple compressors found in v3 metadata. Using the first compressor, "
"others will be ignored. This may affect data compatibility.",
UserWarning,
)
compressor_config = get_codec_config(bytes_compressors[0])

# Handle filter configurations
filter_configs = [get_codec_config(filter_) for filter_ in array_filters]
def _to_v2_codec(codec_config: dict) -> dict:
if name := codec_config.get("name", None):
return {"id": name, **codec_config["configuration"]}
elif codec_config.get("id", None):
return codec_config
else:
raise ValueError(
f"Expected a valid Zarr V2 or V3 codec dict, got {codec_config}"
)

# TODO: Find a more robust way to exclude any bytes codecs
# TODO: Test round-tripping big endian since that is stored in the bytes codec in V3; it should be included in data type instead for V2
v2_codecs = [
_to_v2_codec(get_codec_config(codec))
for codec in v3_metadata.codecs
if codec.__class__.__name__ != "BytesCodec"
]
Comment on lines +145 to +142
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@d-v-b are all ArrayBytesCodecs irrelevant for Zarr V2 where we could exclude based on whether they are a ArrayBytesCodec instance?

v2_metadata = ArrayV2Metadata(
shape=v3_metadata.shape,
dtype=v3_metadata.data_type,
chunks=v3_metadata.chunks,
fill_value=fill_value or v3_metadata.fill_value,
compressor=compressor_config,
filters=filter_configs,
filters=v2_codecs,
compressor=None,
order="C",
attributes=v3_metadata.attributes,
dimension_separator=".", # Assuming '.' as default dimension separator
Expand Down
Loading