diff --git a/pyproject.toml b/pyproject.toml
index 1f270b435f..59a065601a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -262,6 +262,13 @@ run-coverage = "pytest --cov-config=pyproject.toml --cov=pkg --cov-report xml --
 run-coverage-gpu = "pip install cupy-cuda12x && pytest -m gpu --cov-config=pyproject.toml --cov=pkg --cov-report xml --cov=src --junitxml=junit.xml -o junit_family=legacy"
 run-coverage-html = "pytest --cov-config=pyproject.toml --cov=pkg --cov-report html --cov=src"
 
+[tool.hatch.envs.regression]
+description = "Test environment for tests against older zarr-python versions"
+dependencies = [
+    "uv ==0.7.8"
+    ]
+features=["test"]
+
 [tool.ruff]
 line-length = 100
 force-exclude = true
diff --git a/tests/test_regression/__init__.py b/tests/test_regression/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tests/test_regression/test_regression.py b/tests/test_regression/test_regression.py
new file mode 100644
index 0000000000..130a6b7472
--- /dev/null
+++ b/tests/test_regression/test_regression.py
@@ -0,0 +1,142 @@
+import subprocess
+from dataclasses import asdict, dataclass
+from itertools import product
+from pathlib import Path
+
+import numcodecs
+import numpy as np
+import pytest
+from numcodecs import LZ4, LZMA, Blosc, GZip, VLenBytes, VLenUTF8, Zstd
+
+import zarr
+from zarr.core.array import Array
+from zarr.core.metadata.v2 import ArrayV2Metadata
+from zarr.storage import LocalStore
+
+
+def runner_installed() -> bool:
+    """Return True if ``uv --version`` can be executed successfully."""
+    try:
+        subprocess.check_output(["uv", "--version"])
+    except (OSError, subprocess.CalledProcessError):
+        # A missing or broken ``uv`` should skip the tests, not fail collection.
+        return False
+    return True
+
+
+def array_metadata_equals(a: ArrayV2Metadata, b: ArrayV2Metadata) -> bool:
+    """Compare two v2 array metadata objects, treating two NaN fill values as equal."""
+    dict_a, dict_b = asdict(a), asdict(b)
+    fill_value_a, fill_value_b = dict_a.pop("fill_value"), dict_b.pop("fill_value")
+    both_nan = (
+        isinstance(fill_value_a, float)
+        and isinstance(fill_value_b, float)
+        and np.isnan(fill_value_a)
+        and np.isnan(fill_value_b)
+    )
+    if both_nan:
+        # NaN != NaN, so the fill values are left out of the comparison here.
+        return dict_a == dict_b
+    return fill_value_a == fill_value_b and dict_a == dict_b
+
+
+@dataclass(kw_only=True)
+class ArrayParams:
+    """Parameters describing one zarr v2 array used as a round-trip test case."""
+
+    values: np.ndarray[tuple[int], np.dtype[np.generic]]
+    fill_value: np.generic | str | int | bytes
+    compressor: numcodecs.abc.Codec | None
+    filters: tuple[numcodecs.abc.Codec, ...] | None = None
+
+
+basic_codecs = GZip(), Blosc(), LZ4(), LZMA(), Zstd()
+basic_dtypes = "|b", ">i2", ">i4", ">f4", ">f8", "c8", "c16"
+# Kept apart from basic_dtypes: np.arange cannot build datetime64 values from a
+# bare stop, so these dtypes get their own cases filled with np.ones.
+datetime_dtypes = "M8[10us]", "m8[4ps]"
+string_dtypes = ">S1", "U4"
+
+basic_array_cases = [
+    ArrayParams(values=np.arange(4, dtype=dtype), fill_value=1, compressor=codec)
+    for codec, dtype in product(basic_codecs, basic_dtypes)
+]
+datetime_array_cases = [
+    ArrayParams(values=np.ones((4,), dtype=dtype), fill_value=1, compressor=codec)
+    for codec, dtype in product(basic_codecs, datetime_dtypes)
+]
+string_array_cases = [
+    ArrayParams(
+        values=np.array(["aaaa", "bbbb", "ccccc", "dddd"], dtype=dtype),
+        fill_value="foo",
+        compressor=codec,
+    )
+    for codec, dtype in product(basic_codecs, string_dtypes)
+]
+vlen_string_cases = [
+    ArrayParams(
+        values=np.array(["a", "bb", "ccc", "dddd"], dtype="O"),
+        fill_value="1",
+        compressor=None,
+        filters=(VLenUTF8(),),
+    )
+]
+
+vlen_bytes_cases = [
+    ArrayParams(
+        values=np.array([b"a", b"bb", b"ccc", b"dddd"], dtype="O"),
+        fill_value=b"1",
+        compressor=None,
+        filters=(VLenBytes(),),
+    )
+]
+array_cases = (
+    basic_array_cases
+    + datetime_array_cases
+    + string_array_cases
+    + vlen_string_cases
+    + vlen_bytes_cases
+)
+
+
+@pytest.fixture
+def source_array(tmp_path: Path, request: pytest.FixtureRequest) -> Array:
+    """Write the parametrized ``ArrayParams`` as a zarr v2 array under ``tmp_path / "in"``."""
+    dest = tmp_path / "in"
+    store = LocalStore(dest)
+    array_params: ArrayParams = request.param
+    return zarr.from_array(
+        store,
+        data=array_params.values,
+        chunks=array_params.values.shape,
+        compressors=array_params.compressor,
+        fill_value=array_params.fill_value,
+        order="C",
+        filters=array_params.filters,
+        chunk_key_encoding={"name": "v2", "configuration": {"separator": "/"}},
+        write_data=True,
+        zarr_format=2,
+    )
+
+
+@pytest.mark.skipif(not runner_installed(), reason="no python script runner installed")
+@pytest.mark.parametrize(
+    "source_array", array_cases, indirect=True, ids=tuple(map(str, array_cases))
+)
+def test_roundtrip(source_array: Array, tmp_path: Path) -> None:
+    """Copy ``source_array`` with the zarr 2.18 script and compare the copy to the source."""
+    out_path = tmp_path / "out"
+    script_path = Path(__file__).resolve().parent / "v2.18.py"
+    # Strip the file:// scheme from the store's string form to get a plain path.
+    source_path = str(source_array.store).removeprefix("file://")
+    copy_op = subprocess.run(
+        ["uv", "run", str(script_path), source_path, str(out_path)],
+        capture_output=True,
+        text=True,
+        check=False,
+    )
+    # Attach the script's stderr so a failed copy is diagnosable from the test report.
+    assert copy_op.returncode == 0, copy_op.stderr
+    out_array = zarr.open_array(store=out_path, mode="r", zarr_format=2)
+    assert array_metadata_equals(source_array.metadata, out_array.metadata)
+    assert np.array_equal(source_array[:], out_array[:])
diff --git a/tests/test_regression/v2.18.py b/tests/test_regression/v2.18.py
new file mode 100644
index 0000000000..39e1c5210c
--- /dev/null
+++ b/tests/test_regression/v2.18.py
@@ -0,0 +1,86 @@
+# /// script
+# requires-python = ">=3.11"
+# dependencies = [
+#     "zarr==2.18",
+#     "numcodecs==0.15"
+# ]
+# ///
+
+import argparse
+
+import zarr
+from zarr._storage.store import BaseStore
+
+
+def copy_group(
+    *, node: zarr.hierarchy.Group, store: BaseStore, path: str, overwrite: bool
+) -> zarr.hierarchy.Group:
+    """Recursively copy ``node``, its attributes and its children into ``store`` at ``path``."""
+    result = zarr.group(store=store, path=path, overwrite=overwrite)
+    result.attrs.put(node.attrs.asdict())
+    for key, child in node.items():
+        child_path = f"{path}/{key}"
+        if isinstance(child, zarr.hierarchy.Group):
+            copy_group(node=child, store=store, path=child_path, overwrite=overwrite)
+        elif isinstance(child, zarr.core.Array):
+            copy_array(node=child, store=store, overwrite=overwrite, path=child_path)
+    return result
+
+
+def copy_array(
+    *, node: zarr.core.Array, store: BaseStore, path: str, overwrite: bool
+) -> zarr.core.Array:
+    """Create an array in ``store`` at ``path`` that mirrors ``node``, then copy its data."""
+    result = zarr.create(
+        shape=node.shape,
+        dtype=node.dtype,
+        fill_value=node.fill_value,
+        chunks=node.chunks,
+        compressor=node.compressor,
+        filters=node.filters,
+        order=node.order,
+        dimension_separator=node._dimension_separator,
+        store=store,
+        path=path,
+        overwrite=overwrite,
+    )
+    result.attrs.put(node.attrs.asdict())
+    result[:] = node[:]
+    return result
+
+
+def copy_node(
+    node: zarr.hierarchy.Group | zarr.core.Array, store: BaseStore, path: str, overwrite: bool
+) -> zarr.hierarchy.Group | zarr.core.Array:
+    """Copy a group or an array into ``store`` at ``path``, dispatching on the node type."""
+    if isinstance(node, zarr.hierarchy.Group):
+        return copy_group(node=node, store=store, path=path, overwrite=overwrite)
+    elif isinstance(node, zarr.core.Array):
+        return copy_array(node=node, store=store, path=path, overwrite=overwrite)
+    else:
+        raise TypeError(f"Unexpected node type: {type(node)}")  # pragma: no cover
+
+
+def cli() -> None:
+    """Parse the source and destination paths and copy the hierarchy between them."""
+    parser = argparse.ArgumentParser(
+        description="Copy a zarr hierarchy from one location to another"
+    )
+    parser.add_argument("source", type=str, help="Path to the source zarr hierarchy")
+    parser.add_argument("destination", type=str, help="Path to the destination zarr hierarchy")
+    args = parser.parse_args()
+
+    src, dst = args.source, args.destination
+    root_src = zarr.open(src, mode="r")
+    result = copy_node(node=root_src, store=zarr.NestedDirectoryStore(dst), path="", overwrite=True)
+
+    print(f"successfully created {result} at {dst}")
+
+
+def main() -> None:
+    """Script entry point: run the command-line interface."""
+    cli()
+
+
+if __name__ == "__main__":
+    main()