diff --git a/kerchunk/codecs.py b/kerchunk/codecs.py
index 28f0fc15..3755b3f5 100644
--- a/kerchunk/codecs.py
+++ b/kerchunk/codecs.py
@@ -66,6 +66,19 @@ def decode(self, buf, out=None):
 numcodecs.register_codec(FillStringsCodec, "fill_hdf_strings")
 
 
+class FletcherDummyFilter(numcodecs.abc.Codec):
+    codec_id = "fletcher_null"
+
+    def decode(self, buff, out=None):
+        return buff[:-4]
+
+    def encode(self, buf):
+        pass
+
+
+numcodecs.register_codec(FletcherDummyFilter, "fletcher_null")
+
+
 class GRIBCodec(numcodecs.abc.Codec):
     """
     Read GRIB stream of bytes as a message using eccodes
diff --git a/kerchunk/hdf.py b/kerchunk/hdf.py
index b54bd525..ebe461f1 100644
--- a/kerchunk/hdf.py
+++ b/kerchunk/hdf.py
@@ -7,7 +7,7 @@
 import zarr
 from zarr.meta import encode_fill_value
 import numcodecs
-from .codecs import FillStringsCodec
+from .codecs import FillStringsCodec, FletcherDummyFilter
 from .utils import _encode_for_JSON
 
 try:
@@ -376,6 +376,8 @@ def _translator(self, name: str, h5obj: Union[h5py.Dataset, h5py.Group]):
             )
 
         # Create a Zarr array equivalent to this HDF5 dataset...
+        if h5obj.fletcher32:
+            filters.append(FletcherDummyFilter())
         za = self._zroot.create_dataset(
             h5obj.name,
             shape=h5obj.shape,
@@ -399,9 +401,6 @@ def _translator(self, name: str, h5obj: Union[h5py.Dataset, h5py.Group]):
         # Store chunk location metadata...
         if cinfo:
             for k, v in cinfo.items():
-                if h5obj.fletcher32:
-                    logging.info("Discarding fletcher32 checksum")
-                    v["size"] -= 4
                 self.store[za._chunk_key(k)] = [
                     self._uri,
                     v["offset"],
diff --git a/kerchunk/tests/test_hdf.py b/kerchunk/tests/test_hdf.py
index 50f1dba7..1ed0d20c 100644
--- a/kerchunk/tests/test_hdf.py
+++ b/kerchunk/tests/test_hdf.py
@@ -286,3 +286,39 @@ def test_compact():
     m = fsspec.get_mapper("reference://", fo=out)
     g = zarr.open(m)
     assert np.allclose(g.ancillary_data.atlas_sdp_gps_epoch[:], 1.19880002e09)
+
+
+@pytest.mark.parametrize("zlib", [True, False], ids=["zlib", "no_zlib"])
+@pytest.mark.parametrize("shuffle", [True, False], ids=["shuffle", "no_shuffle"])
+@pytest.mark.parametrize(
+    "fletcher32", [True, False], ids=["fletcher32", "no_fletcher32"]
+)
+def test_encoding_options(zlib, shuffle, fletcher32, tmp_path):
+    fname = tmp_path / "test.nc"
+
+    shape = (2, 10)
+    chunksizes = (1, 10)
+
+    encoding = {
+        "zlib": zlib,
+        "shuffle": shuffle,
+        "complevel": 2,
+        "fletcher32": fletcher32,
+        "contiguous": False,
+        "chunksizes": chunksizes,
+    }
+
+    da = xr.DataArray(
+        data=np.random.rand(*shape), dims=["y", "x"], name="foo", attrs={"bar": "baz"}
+    )
+    da.encoding = encoding
+    ds = da.to_dataset()
+    ds.to_netcdf(fname, engine="netcdf4", mode="w")
+
+    with fsspec.open(fname) as fp:
+        h5chunks = kerchunk.hdf.SingleHdf5ToZarr(fp, fname, inline_threshold=0, spec=0)
+        refs = h5chunks.translate()
+
+    store = fsspec.get_mapper("reference://", fo=refs)
+    ds2 = xr.open_dataset(store, engine="zarr", chunks={})
+    xr.testing.assert_identical(ds, ds2)