From 45bb4e56d24ab5d979eabb92b1db11a5897937be Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Tue, 1 Jul 2025 11:08:07 +0100 Subject: [PATCH 01/57] add rough cli converter structure --- src/zarr/core/metadata/converter/__init__.py | 0 src/zarr/core/metadata/converter/cli.py | 0 .../metadata/converter/v2_v3_converter.py | 67 +++++++++++++++++++ src/zarr/core/metadata/v2.py | 2 +- 4 files changed, 68 insertions(+), 1 deletion(-) create mode 100644 src/zarr/core/metadata/converter/__init__.py create mode 100644 src/zarr/core/metadata/converter/cli.py create mode 100644 src/zarr/core/metadata/converter/v2_v3_converter.py diff --git a/src/zarr/core/metadata/converter/__init__.py b/src/zarr/core/metadata/converter/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/zarr/core/metadata/converter/cli.py b/src/zarr/core/metadata/converter/cli.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/zarr/core/metadata/converter/v2_v3_converter.py b/src/zarr/core/metadata/converter/v2_v3_converter.py new file mode 100644 index 0000000000..7d79b5e2d0 --- /dev/null +++ b/src/zarr/core/metadata/converter/v2_v3_converter.py @@ -0,0 +1,67 @@ +from zarr.abc.codec import Codec +from zarr.codecs.blosc import BloscCodec, BloscShuffle +from zarr.core.array import Array +from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding +from zarr.core.metadata.v2 import ArrayV2Metadata +from zarr.core.metadata.v3 import ArrayV3Metadata + + +async def convert_v2_to_v3(zarr_v2: Array) -> None: + if not isinstance(zarr_v2.metadata, ArrayV2Metadata): + raise TypeError("Only arrays / groups with zarr v2 metadata can be converted") + + # zarr_format = zarr_v2.metadata.zarr_format + # if zarr_format != 2: + # raise ValueError( + # f"Input zarr array / group is zarr_format {zarr_format} - only 2 is accepted." + # ) + + # accept array or group - if group, iterate into it to do the whole hierarchy + + # how are the metadata files currently written? Which function? + # could add a to_v3() function on to the ArrayV2Metadata / GroupMetadata classes?? + convert_v2_metadata(zarr_v2.metadata) + + # Check for existing zarr json? + # await zarr_v2._async_array._save_metadata(metadata_v3) + + +def convert_v2_metadata(metadata_v2: ArrayV2Metadata) -> ArrayV3Metadata: + chunk_key_encoding = DefaultChunkKeyEncoding(separator=metadata_v2.dimension_separator) + + # Handle C vs F? 
(see gist) + codecs = convert_compressor(metadata_v2) + + return ArrayV3Metadata( + shape=metadata_v2.shape, + data_type=metadata_v2.dtype, + chunk_grid=metadata_v2.chunk_grid, + chunk_key_encoding=chunk_key_encoding, + fill_value=metadata_v2.fill_value, + codecs=codecs, + attributes=metadata_v2.attributes, + dimension_names=None, + storage_transformers=None, + ) + + +def convert_compressor(metadata_v2: ArrayV2Metadata) -> list[Codec]: + compressor_codecs: list[Codec] = [] + + if metadata_v2.compressor is None: + return compressor_codecs + + compressor_name = metadata_v2.compressor.codec_id + + if compressor_name == "blosc": + compressor_codecs.append( + BloscCodec( + typesize=metadata_v2.dtype.to_native_dtype().itemsize, + cname=metadata_v2.compressor.cname, + clevel=metadata_v2.compressor.clevel, + shuffle=BloscShuffle.from_int(metadata_v2.compressor.shuffle), + blocksize=metadata_v2.compressor.blocksize, + ) + ) + + return compressor_codecs diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 3ac75e0418..7bdad204b8 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -68,7 +68,7 @@ class ArrayV2Metadata(Metadata): order: MemoryOrder = "C" filters: tuple[numcodecs.abc.Codec, ...] | None = None dimension_separator: Literal[".", "/"] = "." - compressor: CompressorLikev2 + compressor: numcodecs.abc.Codec | None attributes: dict[str, JSON] = field(default_factory=dict) zarr_format: Literal[2] = field(init=False, default=2) From 456c9e774061ccb265ca5752854ae8414077b9f9 Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Tue, 1 Jul 2025 12:15:09 +0100 Subject: [PATCH 02/57] allow zstd, gzip and numcodecs zarr 3 compression --- .../metadata/converter/v2_v3_converter.py | 50 +++++++++++++++---- 1 file changed, 41 insertions(+), 9 deletions(-) diff --git a/src/zarr/core/metadata/converter/v2_v3_converter.py b/src/zarr/core/metadata/converter/v2_v3_converter.py index 7d79b5e2d0..0208ce9c26 100644 --- a/src/zarr/core/metadata/converter/v2_v3_converter.py +++ b/src/zarr/core/metadata/converter/v2_v3_converter.py @@ -1,9 +1,12 @@ from zarr.abc.codec import Codec from zarr.codecs.blosc import BloscCodec, BloscShuffle +from zarr.codecs.gzip import GzipCodec +from zarr.codecs.zstd import ZstdCodec from zarr.core.array import Array from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding from zarr.core.metadata.v2 import ArrayV2Metadata from zarr.core.metadata.v3 import ArrayV3Metadata +from zarr.registry import get_codec_class async def convert_v2_to_v3(zarr_v2: Array) -> None: @@ -53,15 +56,44 @@ def convert_compressor(metadata_v2: ArrayV2Metadata) -> list[Codec]: compressor_name = metadata_v2.compressor.codec_id - if compressor_name == "blosc": - compressor_codecs.append( - BloscCodec( - typesize=metadata_v2.dtype.to_native_dtype().itemsize, - cname=metadata_v2.compressor.cname, - clevel=metadata_v2.compressor.clevel, - shuffle=BloscShuffle.from_int(metadata_v2.compressor.shuffle), - blocksize=metadata_v2.compressor.blocksize, + match compressor_name: + case "blosc": + compressor_codecs.append( + BloscCodec( + typesize=metadata_v2.dtype.to_native_dtype().itemsize, + cname=metadata_v2.compressor.cname, + clevel=metadata_v2.compressor.clevel, + shuffle=BloscShuffle.from_int(metadata_v2.compressor.shuffle), + blocksize=metadata_v2.compressor.blocksize, + ) ) - ) + + case "zstd": + compressor_codecs.append( + ZstdCodec( + level=metadata_v2.compressor.level, + 
checksum=metadata_v2.compressor.checksum, + ) + ) + + case "gzip": + compressor_codecs.append(GzipCodec(level=metadata_v2.compressor.level)) + + case _: + # If possible, find matching numcodecs.zarr3 codec + numcodec_name = f"numcodecs.{compressor_name}" + numcodec_dict = { + "name": numcodec_name, + "configuration": metadata_v2.compressor.get_config(), + } + + try: + compressor_codec = get_codec_class(numcodec_name) + except KeyError as exc: + raise ValueError( + f"Couldn't find corresponding numcodecs.zarr3 codec for {compressor_name}" + ) from exc + + compressor_codecs.append(compressor_codec.from_dict(numcodec_dict)) return compressor_codecs From 242a338353d50ec7ca8c6008fa22d9443c3737d0 Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Tue, 1 Jul 2025 14:22:14 +0100 Subject: [PATCH 03/57] convert filters to v3 --- .../metadata/converter/v2_v3_converter.py | 62 ++++++++++++++----- 1 file changed, 46 insertions(+), 16 deletions(-) diff --git a/src/zarr/core/metadata/converter/v2_v3_converter.py b/src/zarr/core/metadata/converter/v2_v3_converter.py index 0208ce9c26..23085addc4 100644 --- a/src/zarr/core/metadata/converter/v2_v3_converter.py +++ b/src/zarr/core/metadata/converter/v2_v3_converter.py @@ -1,4 +1,8 @@ -from zarr.abc.codec import Codec +from typing import cast + +import numcodecs.abc + +from zarr.abc.codec import ArrayArrayCodec, BytesBytesCodec, Codec from zarr.codecs.blosc import BloscCodec, BloscShuffle from zarr.codecs.gzip import GzipCodec from zarr.codecs.zstd import ZstdCodec @@ -33,7 +37,10 @@ def convert_v2_metadata(metadata_v2: ArrayV2Metadata) -> ArrayV3Metadata: chunk_key_encoding = DefaultChunkKeyEncoding(separator=metadata_v2.dimension_separator) # Handle C vs F? (see gist) - codecs = convert_compressor(metadata_v2) + convert_filters(metadata_v2) + convert_compressor(metadata_v2) + + codecs: list[Codec] = [] return ArrayV3Metadata( shape=metadata_v2.shape, @@ -48,8 +55,20 @@ def convert_v2_metadata(metadata_v2: ArrayV2Metadata) -> ArrayV3Metadata: ) -def convert_compressor(metadata_v2: ArrayV2Metadata) -> list[Codec]: - compressor_codecs: list[Codec] = [] +def convert_filters(metadata_v2: ArrayV2Metadata) -> list[ArrayArrayCodec]: + if metadata_v2.filters is None: + return [] + + filters_codecs = [find_numcodecs_zarr3(filter) for filter in metadata_v2.filters] + for codec in filters_codecs: + if not isinstance(codec, ArrayArrayCodec): + raise TypeError(f"Filter {type(codec)} is not an ArrayArrayCodec") + + return cast(list[ArrayArrayCodec], filters_codecs) + + +def convert_compressor(metadata_v2: ArrayV2Metadata) -> list[BytesBytesCodec]: + compressor_codecs: list[BytesBytesCodec] = [] if metadata_v2.compressor is None: return compressor_codecs @@ -81,19 +100,30 @@ def convert_compressor(metadata_v2: ArrayV2Metadata) -> list[Codec]: case _: # If possible, find matching numcodecs.zarr3 codec - numcodec_name = f"numcodecs.{compressor_name}" - numcodec_dict = { - "name": numcodec_name, - "configuration": metadata_v2.compressor.get_config(), - } + compressor_codec = find_numcodecs_zarr3(metadata_v2.compressor) - try: - compressor_codec = get_codec_class(numcodec_name) - except KeyError as exc: - raise ValueError( - f"Couldn't find corresponding numcodecs.zarr3 codec for {compressor_name}" - ) from exc + if not isinstance(compressor_codec, BytesBytesCodec): + raise TypeError(f"Compressor {type(compressor_codec)} is not a BytesBytesCodec") - compressor_codecs.append(compressor_codec.from_dict(numcodec_dict)) + 
compressor_codecs.append(compressor_codec) return compressor_codecs + + +def find_numcodecs_zarr3(numcodecs_codec: numcodecs.abc.Codec) -> Codec: + """Find matching numcodecs.zarr3 codec (if it exists)""" + + numcodec_name = f"numcodecs.{numcodecs_codec.codec_id}" + numcodec_dict = { + "name": numcodec_name, + "configuration": numcodecs_codec.get_config(), + } + + try: + codec_v3 = get_codec_class(numcodec_name) + except KeyError as exc: + raise ValueError( + f"Couldn't find corresponding numcodecs.zarr3 codec for {numcodecs_codec.codec_id}" + ) from exc + + return codec_v3.from_dict(numcodec_dict) From 1045c331c59e50e22959c054505d15869f94d33c Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Tue, 1 Jul 2025 15:05:33 +0100 Subject: [PATCH 04/57] create BytesCodec with correct endian --- .../metadata/converter/v2_v3_converter.py | 53 ++++++++++--------- 1 file changed, 29 insertions(+), 24 deletions(-) diff --git a/src/zarr/core/metadata/converter/v2_v3_converter.py b/src/zarr/core/metadata/converter/v2_v3_converter.py index 23085addc4..cbf5603abc 100644 --- a/src/zarr/core/metadata/converter/v2_v3_converter.py +++ b/src/zarr/core/metadata/converter/v2_v3_converter.py @@ -4,10 +4,12 @@ from zarr.abc.codec import ArrayArrayCodec, BytesBytesCodec, Codec from zarr.codecs.blosc import BloscCodec, BloscShuffle +from zarr.codecs.bytes import BytesCodec from zarr.codecs.gzip import GzipCodec from zarr.codecs.zstd import ZstdCodec from zarr.core.array import Array from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding +from zarr.core.dtype.common import HasEndianness from zarr.core.metadata.v2 import ArrayV2Metadata from zarr.core.metadata.v3 import ArrayV3Metadata from zarr.registry import get_codec_class @@ -37,11 +39,22 @@ def convert_v2_metadata(metadata_v2: ArrayV2Metadata) -> ArrayV3Metadata: chunk_key_encoding = DefaultChunkKeyEncoding(separator=metadata_v2.dimension_separator) # Handle C vs F? 
(see gist) - convert_filters(metadata_v2) - convert_compressor(metadata_v2) - codecs: list[Codec] = [] + # array-array codecs + codecs.extend(convert_filters(metadata_v2)) + + # array-bytes codecs + if not isinstance(metadata_v2.dtype, HasEndianness): + codecs.append(BytesCodec(endian=None)) + else: + codecs.append(BytesCodec(endian=metadata_v2.dtype.endianness)) + + # bytes-bytes codecs + bytes_bytes_codec = convert_compressor(metadata_v2) + if bytes_bytes_codec is not None: + codecs.append(bytes_bytes_codec) + return ArrayV3Metadata( shape=metadata_v2.shape, data_type=metadata_v2.dtype, @@ -67,36 +80,30 @@ def convert_filters(metadata_v2: ArrayV2Metadata) -> list[ArrayArrayCodec]: return cast(list[ArrayArrayCodec], filters_codecs) -def convert_compressor(metadata_v2: ArrayV2Metadata) -> list[BytesBytesCodec]: - compressor_codecs: list[BytesBytesCodec] = [] - +def convert_compressor(metadata_v2: ArrayV2Metadata) -> BytesBytesCodec | None: if metadata_v2.compressor is None: - return compressor_codecs + return None compressor_name = metadata_v2.compressor.codec_id match compressor_name: case "blosc": - compressor_codecs.append( - BloscCodec( - typesize=metadata_v2.dtype.to_native_dtype().itemsize, - cname=metadata_v2.compressor.cname, - clevel=metadata_v2.compressor.clevel, - shuffle=BloscShuffle.from_int(metadata_v2.compressor.shuffle), - blocksize=metadata_v2.compressor.blocksize, - ) + return BloscCodec( + typesize=metadata_v2.dtype.to_native_dtype().itemsize, + cname=metadata_v2.compressor.cname, + clevel=metadata_v2.compressor.clevel, + shuffle=BloscShuffle.from_int(metadata_v2.compressor.shuffle), + blocksize=metadata_v2.compressor.blocksize, ) case "zstd": - compressor_codecs.append( - ZstdCodec( - level=metadata_v2.compressor.level, - checksum=metadata_v2.compressor.checksum, - ) + return ZstdCodec( + level=metadata_v2.compressor.level, + checksum=metadata_v2.compressor.checksum, ) case "gzip": - compressor_codecs.append(GzipCodec(level=metadata_v2.compressor.level)) + return GzipCodec(level=metadata_v2.compressor.level) case _: # If possible, find matching numcodecs.zarr3 codec @@ -105,9 +112,7 @@ def convert_compressor(metadata_v2: ArrayV2Metadata) -> list[BytesBytesCodec]: if not isinstance(compressor_codec, BytesBytesCodec): raise TypeError(f"Compressor {type(compressor_codec)} is not a BytesBytesCodec") - compressor_codecs.append(compressor_codec) - - return compressor_codecs + return compressor_codec def find_numcodecs_zarr3(numcodecs_codec: numcodecs.abc.Codec) -> Codec: From 4e2442fb8494f4dc264956856f7d94dbcf34133a Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Tue, 1 Jul 2025 15:59:42 +0100 Subject: [PATCH 05/57] handle C vs F order in v2 metadata --- src/zarr/core/metadata/converter/v2_v3_converter.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/zarr/core/metadata/converter/v2_v3_converter.py b/src/zarr/core/metadata/converter/v2_v3_converter.py index cbf5603abc..5571f14487 100644 --- a/src/zarr/core/metadata/converter/v2_v3_converter.py +++ b/src/zarr/core/metadata/converter/v2_v3_converter.py @@ -6,6 +6,7 @@ from zarr.codecs.blosc import BloscCodec, BloscShuffle from zarr.codecs.bytes import BytesCodec from zarr.codecs.gzip import GzipCodec +from zarr.codecs.transpose import TransposeCodec from zarr.codecs.zstd import ZstdCodec from zarr.core.array import Array from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding @@ -38,10 +39,12 @@ async def convert_v2_to_v3(zarr_v2: Array) -> 
None: def convert_v2_metadata(metadata_v2: ArrayV2Metadata) -> ArrayV3Metadata: chunk_key_encoding = DefaultChunkKeyEncoding(separator=metadata_v2.dimension_separator) - # Handle C vs F? (see gist) codecs: list[Codec] = [] # array-array codecs + if metadata_v2.order == "F": + # F is equivalent to order: n-1, ... 1, 0 + codecs.append(TransposeCodec(order=list(range(len(metadata_v2.shape) - 1, -1, -1)))) codecs.extend(convert_filters(metadata_v2)) # array-bytes codecs From c63f0b87bacfd10a7252f815140f3264a74d041e Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Wed, 2 Jul 2025 15:27:13 +0100 Subject: [PATCH 06/57] save group and array metadata to file --- .../metadata/converter/v2_v3_converter.py | 51 +++++++++++++------ 1 file changed, 36 insertions(+), 15 deletions(-) diff --git a/src/zarr/core/metadata/converter/v2_v3_converter.py b/src/zarr/core/metadata/converter/v2_v3_converter.py index 5571f14487..fce2ad4654 100644 --- a/src/zarr/core/metadata/converter/v2_v3_converter.py +++ b/src/zarr/core/metadata/converter/v2_v3_converter.py @@ -1,3 +1,4 @@ +import asyncio from typing import cast import numcodecs.abc @@ -9,34 +10,37 @@ from zarr.codecs.transpose import TransposeCodec from zarr.codecs.zstd import ZstdCodec from zarr.core.array import Array +from zarr.core.buffer.core import default_buffer_prototype from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding +from zarr.core.common import ZARR_JSON from zarr.core.dtype.common import HasEndianness +from zarr.core.group import Group, GroupMetadata from zarr.core.metadata.v2 import ArrayV2Metadata from zarr.core.metadata.v3 import ArrayV3Metadata from zarr.registry import get_codec_class +from zarr.storage._utils import _join_paths -async def convert_v2_to_v3(zarr_v2: Array) -> None: - if not isinstance(zarr_v2.metadata, ArrayV2Metadata): +async def convert_v2_to_v3(zarr_v2: Array | Group) -> None: + if not zarr_v2.metadata.zarr_format == 2: raise TypeError("Only arrays / groups with zarr v2 metadata can be converted") - # zarr_format = zarr_v2.metadata.zarr_format - # if zarr_format != 2: - # raise ValueError( - # f"Input zarr array / group is zarr_format {zarr_format} - only 2 is accepted." - # ) + if isinstance(zarr_v2.metadata, GroupMetadata): + group_metadata_v3 = GroupMetadata( + attributes=zarr_v2.metadata.attributes, zarr_format=3, consolidated_metadata=None + ) + await save_v3_metadata(zarr_v2, group_metadata_v3) - # accept array or group - if group, iterate into it to do the whole hierarchy + # process members of the group + for key in zarr_v2: + await convert_v2_to_v3(zarr_v2[key]) - # how are the metadata files currently written? Which function? - # could add a to_v3() function on to the ArrayV2Metadata / GroupMetadata classes?? - convert_v2_metadata(zarr_v2.metadata) - - # Check for existing zarr json? 
- # await zarr_v2._async_array._save_metadata(metadata_v3) + else: + array_metadata_v3 = convert_array_v2_metadata(zarr_v2.metadata) + await save_v3_metadata(zarr_v2, array_metadata_v3) -def convert_v2_metadata(metadata_v2: ArrayV2Metadata) -> ArrayV3Metadata: +def convert_array_v2_metadata(metadata_v2: ArrayV2Metadata) -> ArrayV3Metadata: chunk_key_encoding = DefaultChunkKeyEncoding(separator=metadata_v2.dimension_separator) codecs: list[Codec] = [] @@ -135,3 +139,20 @@ def find_numcodecs_zarr3(numcodecs_codec: numcodecs.abc.Codec) -> Codec: ) from exc return codec_v3.from_dict(numcodec_dict) + + +async def save_v3_metadata( + zarr_v2: Array | Group, metadata_v3: ArrayV3Metadata | GroupMetadata +) -> None: + zarr_json_path = _join_paths([zarr_v2.path, ZARR_JSON]) + + if await zarr_v2.store.exists(zarr_json_path): + raise ValueError(f"{ZARR_JSON} already exists at {zarr_v2.store_path}") + + to_save = metadata_v3.to_buffer_dict(default_buffer_prototype()) + awaitables = [ + zarr_v2.store.set_if_not_exists(_join_paths([zarr_v2.path, key]), value) + for key, value in to_save.items() + ] + + await asyncio.gather(*awaitables) From 2947ce4b2fc4953252a3715a7ec941323a79b95f Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Wed, 2 Jul 2025 15:50:50 +0100 Subject: [PATCH 07/57] create overall conversion functions for store, array or group --- .../metadata/converter/v2_v3_converter.py | 55 ++++++++++++++----- 1 file changed, 41 insertions(+), 14 deletions(-) diff --git a/src/zarr/core/metadata/converter/v2_v3_converter.py b/src/zarr/core/metadata/converter/v2_v3_converter.py index fce2ad4654..fa9b61715f 100644 --- a/src/zarr/core/metadata/converter/v2_v3_converter.py +++ b/src/zarr/core/metadata/converter/v2_v3_converter.py @@ -3,6 +3,7 @@ import numcodecs.abc +import zarr from zarr.abc.codec import ArrayArrayCodec, BytesBytesCodec, Codec from zarr.codecs.blosc import BloscCodec, BloscShuffle from zarr.codecs.bytes import BytesCodec @@ -18,10 +19,36 @@ from zarr.core.metadata.v2 import ArrayV2Metadata from zarr.core.metadata.v3 import ArrayV3Metadata from zarr.registry import get_codec_class +from zarr.storage import StoreLike from zarr.storage._utils import _join_paths -async def convert_v2_to_v3(zarr_v2: Array | Group) -> None: +async def convert_v2_to_v3(store: StoreLike, path: str | None = None) -> None: + """Convert all v2 metadata in a zarr hierarchy to v3. This will create a zarr.json file at each level + (for every group / array). V2 files (.zarray, .zattrs etc.) will be left as-is. + + Parameters + ---------- + store : StoreLike + Store or path to directory in file system or name of zip file. + path : str | None, optional + The path within the store to open, by default None + """ + + zarr_v2 = zarr.open(store=store, mode="r+", path=path) + await convert_array_or_group(zarr_v2) + + +async def convert_array_or_group(zarr_v2: Array | Group) -> None: + """Convert all v2 metadata in a zarr array/group to v3. Note - if a group is provided, then + all arrays / groups within this group will also be converted. A zarr.json file will be created + at each level, with any V2 files (.zarray, .zattrs etc.) left as-is. 
+ + Parameters + ---------- + zarr_v2 : Array | Group + An array or group with zarr_format = 2 + """ if not zarr_v2.metadata.zarr_format == 2: raise TypeError("Only arrays / groups with zarr v2 metadata can be converted") @@ -29,18 +56,18 @@ async def convert_v2_to_v3(zarr_v2: Array | Group) -> None: group_metadata_v3 = GroupMetadata( attributes=zarr_v2.metadata.attributes, zarr_format=3, consolidated_metadata=None ) - await save_v3_metadata(zarr_v2, group_metadata_v3) + await _save_v3_metadata(zarr_v2, group_metadata_v3) # process members of the group for key in zarr_v2: - await convert_v2_to_v3(zarr_v2[key]) + await convert_array_or_group(zarr_v2[key]) else: - array_metadata_v3 = convert_array_v2_metadata(zarr_v2.metadata) - await save_v3_metadata(zarr_v2, array_metadata_v3) + array_metadata_v3 = _convert_array_metadata(zarr_v2.metadata) + await _save_v3_metadata(zarr_v2, array_metadata_v3) -def convert_array_v2_metadata(metadata_v2: ArrayV2Metadata) -> ArrayV3Metadata: +def _convert_array_metadata(metadata_v2: ArrayV2Metadata) -> ArrayV3Metadata: chunk_key_encoding = DefaultChunkKeyEncoding(separator=metadata_v2.dimension_separator) codecs: list[Codec] = [] @@ -49,7 +76,7 @@ def convert_array_v2_metadata(metadata_v2: ArrayV2Metadata) -> ArrayV3Metadata: if metadata_v2.order == "F": # F is equivalent to order: n-1, ... 1, 0 codecs.append(TransposeCodec(order=list(range(len(metadata_v2.shape) - 1, -1, -1)))) - codecs.extend(convert_filters(metadata_v2)) + codecs.extend(_convert_filters(metadata_v2)) # array-bytes codecs if not isinstance(metadata_v2.dtype, HasEndianness): @@ -58,7 +85,7 @@ def convert_array_v2_metadata(metadata_v2: ArrayV2Metadata) -> ArrayV3Metadata: codecs.append(BytesCodec(endian=metadata_v2.dtype.endianness)) # bytes-bytes codecs - bytes_bytes_codec = convert_compressor(metadata_v2) + bytes_bytes_codec = _convert_compressor(metadata_v2) if bytes_bytes_codec is not None: codecs.append(bytes_bytes_codec) @@ -75,11 +102,11 @@ def convert_array_v2_metadata(metadata_v2: ArrayV2Metadata) -> ArrayV3Metadata: ) -def convert_filters(metadata_v2: ArrayV2Metadata) -> list[ArrayArrayCodec]: +def _convert_filters(metadata_v2: ArrayV2Metadata) -> list[ArrayArrayCodec]: if metadata_v2.filters is None: return [] - filters_codecs = [find_numcodecs_zarr3(filter) for filter in metadata_v2.filters] + filters_codecs = [_find_numcodecs_zarr3(filter) for filter in metadata_v2.filters] for codec in filters_codecs: if not isinstance(codec, ArrayArrayCodec): raise TypeError(f"Filter {type(codec)} is not an ArrayArrayCodec") @@ -87,7 +114,7 @@ def convert_filters(metadata_v2: ArrayV2Metadata) -> list[ArrayArrayCodec]: return cast(list[ArrayArrayCodec], filters_codecs) -def convert_compressor(metadata_v2: ArrayV2Metadata) -> BytesBytesCodec | None: +def _convert_compressor(metadata_v2: ArrayV2Metadata) -> BytesBytesCodec | None: if metadata_v2.compressor is None: return None @@ -114,7 +141,7 @@ def convert_compressor(metadata_v2: ArrayV2Metadata) -> BytesBytesCodec | None: case _: # If possible, find matching numcodecs.zarr3 codec - compressor_codec = find_numcodecs_zarr3(metadata_v2.compressor) + compressor_codec = _find_numcodecs_zarr3(metadata_v2.compressor) if not isinstance(compressor_codec, BytesBytesCodec): raise TypeError(f"Compressor {type(compressor_codec)} is not a BytesBytesCodec") @@ -122,7 +149,7 @@ def convert_compressor(metadata_v2: ArrayV2Metadata) -> BytesBytesCodec | None: return compressor_codec -def find_numcodecs_zarr3(numcodecs_codec: numcodecs.abc.Codec) -> Codec: +def 
_find_numcodecs_zarr3(numcodecs_codec: numcodecs.abc.Codec) -> Codec: """Find matching numcodecs.zarr3 codec (if it exists)""" numcodec_name = f"numcodecs.{numcodecs_codec.codec_id}" @@ -141,7 +168,7 @@ def find_numcodecs_zarr3(numcodecs_codec: numcodecs.abc.Codec) -> Codec: return codec_v3.from_dict(numcodec_dict) -async def save_v3_metadata( +async def _save_v3_metadata( zarr_v2: Array | Group, metadata_v3: ArrayV3Metadata | GroupMetadata ) -> None: zarr_json_path = _join_paths([zarr_v2.path, ZARR_JSON]) From ba8175502017d169b5d3aa300f11aea5069a48ab Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Thu, 3 Jul 2025 11:56:42 +0100 Subject: [PATCH 08/57] add minimal typer cli --- pyproject.toml | 1 + src/zarr/core/metadata/converter/cli.py | 32 +++++++++++++++++++ ...{v2_v3_converter.py => converter_v2_v3.py} | 24 +++++++++----- 3 files changed, 49 insertions(+), 8 deletions(-) rename src/zarr/core/metadata/converter/{v2_v3_converter.py => converter_v2_v3.py} (88%) diff --git a/pyproject.toml b/pyproject.toml index 6c18563a1f..eb95c5dada 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -68,6 +68,7 @@ remote = [ gpu = [ "cupy-cuda12x", ] +cli = ["typer"] # Development extras test = [ "coverage", diff --git a/src/zarr/core/metadata/converter/cli.py b/src/zarr/core/metadata/converter/cli.py index e69de29bb2..d5c693e07f 100644 --- a/src/zarr/core/metadata/converter/cli.py +++ b/src/zarr/core/metadata/converter/cli.py @@ -0,0 +1,32 @@ +from typing import Annotated + +import typer + +from zarr.core.metadata.converter.converter_v2_v3 import convert_v2_to_v3 + +app = typer.Typer() + + +@app.command() # type: ignore[misc] +def convert( + store: Annotated[ + str, + typer.Argument( + help="Store or path to directory in file system or name of zip file e.g. 'data/example-1.zarr', 's3://example-bucket/example'..." + ), + ], + path: Annotated[str | None, typer.Option(help="The path within the store to open")] = None, +) -> None: + """Convert all v2 metadata in a zarr hierarchy to v3. This will create a zarr.json file at each level + (for every group / array). V2 files (.zarray, .zattrs etc.) will be left as-is. + """ + convert_v2_to_v3(store=store, path=path) + + +@app.command() # type: ignore[misc] +def clear() -> None: + print("Clearing...") + + +if __name__ == "__main__": + app() diff --git a/src/zarr/core/metadata/converter/v2_v3_converter.py b/src/zarr/core/metadata/converter/converter_v2_v3.py similarity index 88% rename from src/zarr/core/metadata/converter/v2_v3_converter.py rename to src/zarr/core/metadata/converter/converter_v2_v3.py index fa9b61715f..33845a53a8 100644 --- a/src/zarr/core/metadata/converter/v2_v3_converter.py +++ b/src/zarr/core/metadata/converter/converter_v2_v3.py @@ -1,5 +1,5 @@ import asyncio -from typing import cast +from typing import Any, cast import numcodecs.abc @@ -18,12 +18,15 @@ from zarr.core.group import Group, GroupMetadata from zarr.core.metadata.v2 import ArrayV2Metadata from zarr.core.metadata.v3 import ArrayV3Metadata +from zarr.core.sync import sync from zarr.registry import get_codec_class from zarr.storage import StoreLike from zarr.storage._utils import _join_paths -async def convert_v2_to_v3(store: StoreLike, path: str | None = None) -> None: +def convert_v2_to_v3( + store: StoreLike, path: str | None = None, storage_options: dict[str, Any] | None = None +) -> None: """Convert all v2 metadata in a zarr hierarchy to v3. This will create a zarr.json file at each level (for every group / array). 
V2 files (.zarray, .zattrs etc.) will be left as-is. @@ -33,13 +36,18 @@ async def convert_v2_to_v3(store: StoreLike, path: str | None = None) -> None: Store or path to directory in file system or name of zip file. path : str | None, optional The path within the store to open, by default None + storage_options : dict | None, optional + If the store is backed by an fsspec-based implementation, then this dict will be passed to + the Store constructor for that implementation. Ignored otherwise. """ - zarr_v2 = zarr.open(store=store, mode="r+", path=path) - await convert_array_or_group(zarr_v2) + zarr_v2 = zarr.open( + store=store, mode="r+", zarr_format=2, path=path, storage_options=storage_options + ) + convert_array_or_group(zarr_v2) -async def convert_array_or_group(zarr_v2: Array | Group) -> None: +def convert_array_or_group(zarr_v2: Array | Group) -> None: """Convert all v2 metadata in a zarr array/group to v3. Note - if a group is provided, then all arrays / groups within this group will also be converted. A zarr.json file will be created at each level, with any V2 files (.zarray, .zattrs etc.) left as-is. @@ -56,15 +64,15 @@ async def convert_array_or_group(zarr_v2: Array | Group) -> None: group_metadata_v3 = GroupMetadata( attributes=zarr_v2.metadata.attributes, zarr_format=3, consolidated_metadata=None ) - await _save_v3_metadata(zarr_v2, group_metadata_v3) + sync(_save_v3_metadata(zarr_v2, group_metadata_v3)) # process members of the group for key in zarr_v2: - await convert_array_or_group(zarr_v2[key]) + convert_array_or_group(zarr_v2[key]) else: array_metadata_v3 = _convert_array_metadata(zarr_v2.metadata) - await _save_v3_metadata(zarr_v2, array_metadata_v3) + sync(_save_v3_metadata(zarr_v2, array_metadata_v3)) def _convert_array_metadata(metadata_v2: ArrayV2Metadata) -> ArrayV3Metadata: From 67f958030f7ab57965dc03061c291dfa95d8de60 Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Thu, 3 Jul 2025 14:23:36 +0100 Subject: [PATCH 09/57] add initial tests for converter --- .../metadata/converter/converter_v2_v3.py | 4 +- tests/test_metadata/test_converter_v2_v3.py | 72 +++++++++++++++++++ 2 files changed, 73 insertions(+), 3 deletions(-) create mode 100644 tests/test_metadata/test_converter_v2_v3.py diff --git a/src/zarr/core/metadata/converter/converter_v2_v3.py b/src/zarr/core/metadata/converter/converter_v2_v3.py index 33845a53a8..8d84fac0d6 100644 --- a/src/zarr/core/metadata/converter/converter_v2_v3.py +++ b/src/zarr/core/metadata/converter/converter_v2_v3.py @@ -41,9 +41,7 @@ def convert_v2_to_v3( the Store constructor for that implementation. Ignored otherwise. 
""" - zarr_v2 = zarr.open( - store=store, mode="r+", zarr_format=2, path=path, storage_options=storage_options - ) + zarr_v2 = zarr.open(store=store, mode="r+", path=path, storage_options=storage_options) convert_array_or_group(zarr_v2) diff --git a/tests/test_metadata/test_converter_v2_v3.py b/tests/test_metadata/test_converter_v2_v3.py new file mode 100644 index 0000000000..de6e389dad --- /dev/null +++ b/tests/test_metadata/test_converter_v2_v3.py @@ -0,0 +1,72 @@ +import numcodecs +import pytest +from typer.testing import CliRunner + +import zarr +from zarr.abc.store import Store +from zarr.codecs.blosc import BloscCodec +from zarr.codecs.bytes import BytesCodec +from zarr.core.chunk_grids import RegularChunkGrid +from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding +from zarr.core.dtype.npy.int import UInt16 +from zarr.core.metadata.converter.cli import app + +runner = CliRunner() + + +def test_convert_array(local_store: Store) -> None: + shape = (10, 10) + chunks = (10, 10) + dtype = "uint16" + compressors = numcodecs.Blosc(cname="zstd", clevel=3, shuffle=1) + fill_value = 2 + attributes = {"baz": 42, "qux": [1, 4, 7, 12]} + + zarr.create_array( + store=local_store, + shape=shape, + chunks=chunks, + dtype=dtype, + compressors=compressors, + zarr_format=2, + fill_value=fill_value, + attributes=attributes, + ) + + result = runner.invoke(app, ["convert", str(local_store.root)]) + assert result.exit_code == 0 + assert (local_store.root / "zarr.json").exists() + + zarr_array = zarr.open(local_store.root, zarr_format=3) + metadata = zarr_array.metadata + assert metadata.zarr_format == 3 + assert metadata.node_type == "array" + assert metadata.shape == shape + assert metadata.chunk_grid == RegularChunkGrid(chunk_shape=chunks) + assert metadata.chunk_key_encoding == DefaultChunkKeyEncoding(separator=".") + assert metadata.data_type == UInt16("little") + assert metadata.codecs == ( + BytesCodec(endian="little"), + BloscCodec(typesize=2, cname="zstd", clevel=3, shuffle="shuffle", blocksize=0), + ) + assert metadata.fill_value == fill_value + assert metadata.attributes == attributes + assert metadata.dimension_names is None + assert metadata.storage_transformers == () + + +@pytest.mark.parametrize("node_type", ["array", "group"]) +def test_convert_v3(local_store: Store, node_type: str) -> None: + """Attempting to convert a v3 array/group should always fail""" + + if node_type == "array": + zarr.create_array( + store=local_store, shape=(10, 10), chunks=(10, 10), zarr_format=3, dtype="uint16" + ) + else: + zarr.create_group(store=local_store, zarr_format=3) + + result = runner.invoke(app, ["convert", str(local_store.root)]) + assert result.exit_code == 1 + assert isinstance(result.exception, TypeError) + assert str(result.exception) == "Only arrays / groups with zarr v2 metadata can be converted" From 0d7c2c894901425f87163e7c54aacb174c797b2e Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Thu, 3 Jul 2025 15:10:13 +0100 Subject: [PATCH 10/57] add tests for conversion of groups and nested groups and arrays --- .../metadata/converter/converter_v2_v3.py | 10 ++-- tests/test_metadata/test_converter_v2_v3.py | 56 +++++++++++++++++++ 2 files changed, 62 insertions(+), 4 deletions(-) diff --git a/src/zarr/core/metadata/converter/converter_v2_v3.py b/src/zarr/core/metadata/converter/converter_v2_v3.py index 8d84fac0d6..2b9cce91c4 100644 --- a/src/zarr/core/metadata/converter/converter_v2_v3.py +++ 
b/src/zarr/core/metadata/converter/converter_v2_v3.py @@ -59,16 +59,18 @@ def convert_array_or_group(zarr_v2: Array | Group) -> None: raise TypeError("Only arrays / groups with zarr v2 metadata can be converted") if isinstance(zarr_v2.metadata, GroupMetadata): + # process members of the group + for key in zarr_v2: + convert_array_or_group(zarr_v2[key]) + + # write group's converted metadata group_metadata_v3 = GroupMetadata( attributes=zarr_v2.metadata.attributes, zarr_format=3, consolidated_metadata=None ) sync(_save_v3_metadata(zarr_v2, group_metadata_v3)) - # process members of the group - for key in zarr_v2: - convert_array_or_group(zarr_v2[key]) - else: + # write array's converted metadata array_metadata_v3 = _convert_array_metadata(zarr_v2.metadata) sync(_save_v3_metadata(zarr_v2, array_metadata_v3)) diff --git a/tests/test_metadata/test_converter_v2_v3.py b/tests/test_metadata/test_converter_v2_v3.py index de6e389dad..f8ec11ffcf 100644 --- a/tests/test_metadata/test_converter_v2_v3.py +++ b/tests/test_metadata/test_converter_v2_v3.py @@ -55,6 +55,62 @@ def test_convert_array(local_store: Store) -> None: assert metadata.storage_transformers == () +def test_convert_group(local_store: Store) -> None: + attributes = {"baz": 42, "qux": [1, 4, 7, 12]} + zarr.create_group(store=local_store, zarr_format=2, attributes=attributes) + + result = runner.invoke(app, ["convert", str(local_store.root)]) + assert result.exit_code == 0 + assert (local_store.root / "zarr.json").exists() + + zarr_array = zarr.open(local_store.root, zarr_format=3) + metadata = zarr_array.metadata + assert metadata.zarr_format == 3 + assert metadata.node_type == "group" + assert metadata.attributes == attributes + assert metadata.consolidated_metadata is None + + +def test_convert_nested_groups_and_arrays(local_store: Store) -> None: + attributes = {"baz": 42, "qux": [1, 4, 7, 12]} + + # 3 levels of nested groups + group_1 = zarr.create_group(store=local_store, zarr_format=2, attributes=attributes) + group_2 = group_1.create_group(name="group_2", attributes=attributes) + group_3 = group_2.create_group(name="group_3", attributes=attributes) + + # 1 array per group + array_1 = group_1.create_array( + name="array_1", shape=(10, 10), chunks=(10, 10), dtype="uint16", attributes=attributes + ) + array_2 = group_2.create_array( + name="array_2", shape=(10, 10), chunks=(10, 10), dtype="uint16", attributes=attributes + ) + array_3 = group_3.create_array( + name="array_3", shape=(10, 10), chunks=(10, 10), dtype="uint16", attributes=attributes + ) + + paths = [group_1.path, group_2.path, group_3.path, array_1.path, array_2.path, array_3.path] + + result = runner.invoke(app, ["convert", str(local_store.root)]) + assert result.exit_code == 0 + + # check zarr.json were created for every group and array + total_zarr_jsons = 0 + for _, _, filenames in local_store.root.walk(): + assert "zarr.json" in filenames + total_zarr_jsons += 1 + assert total_zarr_jsons == 6 + + # Check converted zarr can be opened + metadata accessed at all levels + zarr_array = zarr.open(local_store.root, zarr_format=3) + for path in paths: + zarr_v3 = zarr_array[path] + metadata = zarr_v3.metadata + assert metadata.zarr_format == 3 + assert metadata.attributes == attributes + + @pytest.mark.parametrize("node_type", ["array", "group"]) def test_convert_v3(local_store: Store, node_type: str) -> None: """Attempting to convert a v3 array/group should always fail""" From cf395802e3351a371411471aaf3af528775aca6c Mon Sep 17 00:00:00 2001 From: Kimberly Meechan 
<24316371+K-Meech@users.noreply.github.com> Date: Thu, 3 Jul 2025 15:47:00 +0100 Subject: [PATCH 11/57] add tests for conversion of compressors and filters --- tests/test_metadata/test_converter_v2_v3.py | 79 +++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/tests/test_metadata/test_converter_v2_v3.py b/tests/test_metadata/test_converter_v2_v3.py index f8ec11ffcf..5cb2f7cd86 100644 --- a/tests/test_metadata/test_converter_v2_v3.py +++ b/tests/test_metadata/test_converter_v2_v3.py @@ -1,11 +1,18 @@ +import lzma + import numcodecs +import numcodecs.abc import pytest +from numcodecs.zarr3 import LZMA, Delta from typer.testing import CliRunner import zarr +from zarr.abc.codec import Codec from zarr.abc.store import Store from zarr.codecs.blosc import BloscCodec from zarr.codecs.bytes import BytesCodec +from zarr.codecs.gzip import GzipCodec +from zarr.codecs.zstd import ZstdCodec from zarr.core.chunk_grids import RegularChunkGrid from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding from zarr.core.dtype.npy.int import UInt16 @@ -111,6 +118,78 @@ def test_convert_nested_groups_and_arrays(local_store: Store) -> None: assert metadata.attributes == attributes +@pytest.mark.parametrize( + ("compressor_v2", "compressor_v3"), + [ + ( + numcodecs.Blosc(cname="zstd", clevel=3, shuffle=1), + BloscCodec(typesize=2, cname="zstd", clevel=3, shuffle="shuffle", blocksize=0), + ), + (numcodecs.Zstd(level=3), ZstdCodec(level=3)), + (numcodecs.GZip(level=3), GzipCodec(level=3)), + ( + numcodecs.LZMA( + format=1, check=-1, preset=None, filters=[{"id": lzma.FILTER_DELTA, "dist": 4}] + ), + LZMA(format=1, check=-1, preset=None, filters=[{"id": lzma.FILTER_DELTA, "dist": 4}]), + ), + ], + ids=["blosc", "zstd", "gzip", "numcodecs-compressor"], +) +def test_convert_compressor( + local_store: Store, compressor_v2: numcodecs.abc.Codec, compressor_v3: Codec +) -> None: + zarr.create_array( + store=local_store, + shape=(10, 10), + chunks=(10, 10), + dtype="uint16", + compressors=compressor_v2, + zarr_format=2, + fill_value=0, + ) + + result = runner.invoke(app, ["convert", str(local_store.root)]) + assert result.exit_code == 0 + assert (local_store.root / "zarr.json").exists() + + zarr_array = zarr.open(local_store.root, zarr_format=3) + metadata = zarr_array.metadata + assert metadata.zarr_format == 3 + assert metadata.codecs == ( + BytesCodec(endian="little"), + compressor_v3, + ) + + +def test_convert_filter(local_store: Store) -> None: + filter_v2 = numcodecs.Delta(dtype=" None: """Attempting to convert a v3 array/group should always fail""" From 11499e798754b85da4ccd9fe7c6d6c831f56c528 Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Thu, 3 Jul 2025 16:07:41 +0100 Subject: [PATCH 12/57] test conversion of order and endianness --- tests/test_metadata/test_converter_v2_v3.py | 67 ++++++++++++++++++++- 1 file changed, 66 insertions(+), 1 deletion(-) diff --git a/tests/test_metadata/test_converter_v2_v3.py b/tests/test_metadata/test_converter_v2_v3.py index 5cb2f7cd86..300a0f4a67 100644 --- a/tests/test_metadata/test_converter_v2_v3.py +++ b/tests/test_metadata/test_converter_v2_v3.py @@ -12,10 +12,11 @@ from zarr.codecs.blosc import BloscCodec from zarr.codecs.bytes import BytesCodec from zarr.codecs.gzip import GzipCodec +from zarr.codecs.transpose import TransposeCodec from zarr.codecs.zstd import ZstdCodec from zarr.core.chunk_grids import RegularChunkGrid from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding -from 
zarr.core.dtype.npy.int import UInt16 +from zarr.core.dtype.npy.int import BaseInt, UInt8, UInt16 from zarr.core.metadata.converter.cli import app runner = CliRunner() @@ -190,6 +191,70 @@ def test_convert_filter(local_store: Store) -> None: ) +@pytest.mark.parametrize( + ("order", "expected_codecs"), + [ + ("C", (BytesCodec(endian="little"),)), + ("F", (TransposeCodec(order=(1, 0)), BytesCodec(endian="little"))), + ], +) +def test_convert_C_vs_F_order( + local_store: Store, order: str, expected_codecs: tuple[Codec] +) -> None: + zarr.create_array( + store=local_store, + shape=(10, 10), + chunks=(10, 10), + dtype="uint16", + compressors=None, + zarr_format=2, + fill_value=0, + order=order, + ) + + result = runner.invoke(app, ["convert", str(local_store.root)]) + assert result.exit_code == 0 + assert (local_store.root / "zarr.json").exists() + + zarr_array = zarr.open(local_store.root, zarr_format=3) + metadata = zarr_array.metadata + assert metadata.zarr_format == 3 + + assert metadata.codecs == expected_codecs + + +@pytest.mark.parametrize( + ("dtype", "expected_data_type", "expected_codecs"), + [ + ("uint8", UInt8(), (BytesCodec(endian=None),)), + ("uint16", UInt16(), (BytesCodec(endian="little"),)), + ], + ids=["single_byte", "multi_byte"], +) +def test_convert_endian( + local_store: Store, dtype: str, expected_data_type: BaseInt, expected_codecs: tuple[Codec] +) -> None: + zarr.create_array( + store=local_store, + shape=(10, 10), + chunks=(10, 10), + dtype=dtype, + compressors=None, + zarr_format=2, + fill_value=0, + ) + + result = runner.invoke(app, ["convert", str(local_store.root)]) + assert result.exit_code == 0 + assert (local_store.root / "zarr.json").exists() + + zarr_array = zarr.open(local_store.root, zarr_format=3) + metadata = zarr_array.metadata + assert metadata.zarr_format == 3 + assert metadata.data_type == expected_data_type + assert metadata.codecs == expected_codecs + + @pytest.mark.parametrize("node_type", ["array", "group"]) def test_convert_v3(local_store: Store, node_type: str) -> None: """Attempting to convert a v3 array/group should always fail""" From 90b0996bd95a1446ef762d7802f166d4b52290d8 Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Thu, 3 Jul 2025 16:26:52 +0100 Subject: [PATCH 13/57] add tests for edge cases of incorrect codecs --- tests/test_metadata/test_converter_v2_v3.py | 64 +++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/tests/test_metadata/test_converter_v2_v3.py b/tests/test_metadata/test_converter_v2_v3.py index 300a0f4a67..5300fc6656 100644 --- a/tests/test_metadata/test_converter_v2_v3.py +++ b/tests/test_metadata/test_converter_v2_v3.py @@ -270,3 +270,67 @@ def test_convert_v3(local_store: Store, node_type: str) -> None: assert result.exit_code == 1 assert isinstance(result.exception, TypeError) assert str(result.exception) == "Only arrays / groups with zarr v2 metadata can be converted" + + +def test_convert_unknown_codec(local_store: Store) -> None: + """Attempting to convert a codec without a v3 equivalent should always fail""" + + zarr.create_array( + store=local_store, + shape=(10, 10), + chunks=(10, 10), + dtype="uint16", + filters=[numcodecs.Categorize(labels=["a", "b"], dtype=object)], + zarr_format=2, + fill_value=0, + ) + + result = runner.invoke(app, ["convert", str(local_store.root)]) + assert result.exit_code == 1 + assert isinstance(result.exception, ValueError) + assert ( + str(result.exception) == "Couldn't find corresponding numcodecs.zarr3 codec for 
categorize" + ) + + +def test_convert_incorrect_filter(local_store: Store) -> None: + """Attempting to convert a filter (which is the wrong type of codec) should always fail""" + + zarr.create_array( + store=local_store, + shape=(10, 10), + chunks=(10, 10), + dtype="uint16", + filters=[numcodecs.Zstd(level=3)], + zarr_format=2, + fill_value=0, + ) + + result = runner.invoke(app, ["convert", str(local_store.root)]) + assert result.exit_code == 1 + assert isinstance(result.exception, TypeError) + assert ( + str(result.exception) == "Filter is not an ArrayArrayCodec" + ) + + +def test_convert_incorrect_compressor(local_store: Store) -> None: + """Attempting to convert a compressor (which is the wrong type of codec) should always fail""" + + zarr.create_array( + store=local_store, + shape=(10, 10), + chunks=(10, 10), + dtype="uint16", + compressors=numcodecs.Delta(dtype=" is not a BytesBytesCodec" + ) From 85159bb788fa8f2fcd01d68b286f3e666d6dbf39 Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Fri, 4 Jul 2025 10:45:07 +0100 Subject: [PATCH 14/57] add tests for / separator --- tests/test_metadata/test_converter_v2_v3.py | 50 ++++++++++++++------- 1 file changed, 34 insertions(+), 16 deletions(-) diff --git a/tests/test_metadata/test_converter_v2_v3.py b/tests/test_metadata/test_converter_v2_v3.py index 5300fc6656..b7884fe4d4 100644 --- a/tests/test_metadata/test_converter_v2_v3.py +++ b/tests/test_metadata/test_converter_v2_v3.py @@ -1,4 +1,5 @@ import lzma +from typing import Any import numcodecs import numcodecs.abc @@ -79,26 +80,38 @@ def test_convert_group(local_store: Store) -> None: assert metadata.consolidated_metadata is None -def test_convert_nested_groups_and_arrays(local_store: Store) -> None: - attributes = {"baz": 42, "qux": [1, 4, 7, 12]} +def create_nested_zarr(store: Store, attributes: dict[str, Any], separator: str) -> list[str]: + """Create a zarr with nested groups / arrays, returning the paths to all.""" # 3 levels of nested groups - group_1 = zarr.create_group(store=local_store, zarr_format=2, attributes=attributes) + group_0 = zarr.create_group(store=store, zarr_format=2, attributes=attributes) + group_1 = group_0.create_group(name="group_1", attributes=attributes) group_2 = group_1.create_group(name="group_2", attributes=attributes) - group_3 = group_2.create_group(name="group_3", attributes=attributes) + paths = [group_0.path, group_1.path, group_2.path] # 1 array per group - array_1 = group_1.create_array( - name="array_1", shape=(10, 10), chunks=(10, 10), dtype="uint16", attributes=attributes - ) - array_2 = group_2.create_array( - name="array_2", shape=(10, 10), chunks=(10, 10), dtype="uint16", attributes=attributes - ) - array_3 = group_3.create_array( - name="array_3", shape=(10, 10), chunks=(10, 10), dtype="uint16", attributes=attributes - ) + for i, group in enumerate([group_0, group_1, group_2]): + array = group.create_array( + name=f"array_{i}", + shape=(10, 10), + chunks=(5, 5), + dtype="uint16", + attributes=attributes, + chunk_key_encoding={"name": "v2", "separator": separator}, + ) + array[:] = 1 + paths.append(array.path) - paths = [group_1.path, group_2.path, group_3.path, array_1.path, array_2.path, array_3.path] + return paths + + +@pytest.mark.parametrize("separator", [".", "/"]) +def test_convert_nested_groups_and_arrays(local_store: Store, separator: str) -> None: + """Test that zarr.json are made at the correct points in a hierarchy of groups and arrays + (including when there are additional dirs due 
to using a / separator)""" + + attributes = {"baz": 42, "qux": [1, 4, 7, 12]} + paths = create_nested_zarr(local_store, attributes, separator) result = runner.invoke(app, ["convert", str(local_store.root)]) assert result.exit_code == 0 @@ -106,8 +119,13 @@ def test_convert_nested_groups_and_arrays(local_store: Store) -> None: # check zarr.json were created for every group and array total_zarr_jsons = 0 for _, _, filenames in local_store.root.walk(): - assert "zarr.json" in filenames - total_zarr_jsons += 1 + # group / array directories + if ".zattrs" in filenames: + assert "zarr.json" in filenames + total_zarr_jsons += 1 + # other directories e.g. for chunks when separator is / + else: + assert "zarr.json" not in filenames assert total_zarr_jsons == 6 # Check converted zarr can be opened + metadata accessed at all levels From 53ba1669323872d9af3711367927ecd8b282c9c9 Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Mon, 7 Jul 2025 11:36:32 +0100 Subject: [PATCH 15/57] draft of metadata remover and add test for internal paths --- .../metadata/converter/converter_v2_v3.py | 67 ++++++++++++++++--- tests/test_metadata/test_converter_v2_v3.py | 24 +++++++ 2 files changed, 80 insertions(+), 11 deletions(-) diff --git a/src/zarr/core/metadata/converter/converter_v2_v3.py b/src/zarr/core/metadata/converter/converter_v2_v3.py index 2b9cce91c4..4c7b1e6048 100644 --- a/src/zarr/core/metadata/converter/converter_v2_v3.py +++ b/src/zarr/core/metadata/converter/converter_v2_v3.py @@ -13,7 +13,14 @@ from zarr.core.array import Array from zarr.core.buffer.core import default_buffer_prototype from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding -from zarr.core.common import ZARR_JSON +from zarr.core.common import ( + ZARR_JSON, + ZARRAY_JSON, + ZATTRS_JSON, + ZGROUP_JSON, + ZMETADATA_V2_JSON, + ZarrFormat, +) from zarr.core.dtype.common import HasEndianness from zarr.core.group import Group, GroupMetadata from zarr.core.metadata.v2 import ArrayV2Metadata @@ -21,7 +28,7 @@ from zarr.core.sync import sync from zarr.registry import get_codec_class from zarr.storage import StoreLike -from zarr.storage._utils import _join_paths +from zarr.storage._common import make_store_path def convert_v2_to_v3( @@ -75,6 +82,50 @@ def convert_array_or_group(zarr_v2: Array | Group) -> None: sync(_save_v3_metadata(zarr_v2, array_metadata_v3)) +async def remove_metadata( + store: StoreLike, + zarr_format: ZarrFormat, + path: str | None = None, + storage_options: dict[str, Any] | None = None, +) -> None: + """Remove all v2 (.zarray, .zattrs, .zgroup, .zmetadata) or v3 (zarr.json) metadata files from the given Zarr. + Note - this will remove metadata files at all levels of the hierarchy (every group and array). + + Parameters + ---------- + store : StoreLike + Store or path to directory in file system or name of zip file. + zarr_format : ZarrFormat + Which format's metadata to remove - 2 or 3. + path : str | None, optional + The path within the store to open, by default None + storage_options : dict | None, optional + If the store is backed by an fsspec-based implementation, then this dict will be passed to + the Store constructor for that implementation. Ignored otherwise. 
+ """ + store_path = await make_store_path(store, mode="r+", storage_options=storage_options) + if not store_path.store.supports_deletes: + raise ValueError("Store must support deletes to remove metadata") + + if path is None: + prefix = "" + else: + prefix = path + + if zarr_format == 2: + metadata_files = [ZARRAY_JSON, ZATTRS_JSON, ZGROUP_JSON, ZMETADATA_V2_JSON] + else: + metadata_files = [ZARR_JSON] + + awaitables = [ + (store_path / file_path).delete() + async for file_path in store_path.store.list_prefix(prefix) + if file_path.split("/")[-1] in metadata_files + ] + + await asyncio.gather(*awaitables) + + def _convert_array_metadata(metadata_v2: ArrayV2Metadata) -> ArrayV3Metadata: chunk_key_encoding = DefaultChunkKeyEncoding(separator=metadata_v2.dimension_separator) @@ -179,15 +230,9 @@ def _find_numcodecs_zarr3(numcodecs_codec: numcodecs.abc.Codec) -> Codec: async def _save_v3_metadata( zarr_v2: Array | Group, metadata_v3: ArrayV3Metadata | GroupMetadata ) -> None: - zarr_json_path = _join_paths([zarr_v2.path, ZARR_JSON]) - - if await zarr_v2.store.exists(zarr_json_path): + zarr_json_path = zarr_v2.store_path / ZARR_JSON + if await zarr_json_path.exists(): raise ValueError(f"{ZARR_JSON} already exists at {zarr_v2.store_path}") to_save = metadata_v3.to_buffer_dict(default_buffer_prototype()) - awaitables = [ - zarr_v2.store.set_if_not_exists(_join_paths([zarr_v2.path, key]), value) - for key, value in to_save.items() - ] - - await asyncio.gather(*awaitables) + await zarr_json_path.set_if_not_exists(to_save[ZARR_JSON]) diff --git a/tests/test_metadata/test_converter_v2_v3.py b/tests/test_metadata/test_converter_v2_v3.py index b7884fe4d4..e0348be777 100644 --- a/tests/test_metadata/test_converter_v2_v3.py +++ b/tests/test_metadata/test_converter_v2_v3.py @@ -137,6 +137,30 @@ def test_convert_nested_groups_and_arrays(local_store: Store, separator: str) -> assert metadata.attributes == attributes +@pytest.mark.parametrize("separator", [".", "/"]) +def test_convert_nested_with_path(local_store: Store, separator: str) -> None: + """Test that only arrays/groups within group_1 are converted (+ no other files in store)""" + + create_nested_zarr(local_store, {}, separator) + + result = runner.invoke(app, ["convert", str(local_store.root), "--path", "group_1"]) + assert result.exit_code == 0 + + group_path = local_store.root / "group_1" + + total_zarr_jsons = 0 + for dirpath, _, filenames in local_store.root.walk(): + inside_group = (dirpath == group_path) or (group_path in dirpath.parents) + if (".zattrs" in filenames) and inside_group: + # group / array directories inside the group + assert "zarr.json" in filenames + total_zarr_jsons += 1 + else: + assert "zarr.json" not in filenames + + assert total_zarr_jsons == 4 + + @pytest.mark.parametrize( ("compressor_v2", "compressor_v3"), [ From d4cdc045a6634840fcf8944054b964d96c139094 Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Mon, 7 Jul 2025 15:24:29 +0100 Subject: [PATCH 16/57] add clear command to cli with tests --- src/zarr/core/metadata/converter/cli.py | 28 +++- tests/test_metadata/test_converter_v2_v3.py | 153 ++++++++++++++++---- 2 files changed, 152 insertions(+), 29 deletions(-) diff --git a/src/zarr/core/metadata/converter/cli.py b/src/zarr/core/metadata/converter/cli.py index d5c693e07f..bfa2c6cf04 100644 --- a/src/zarr/core/metadata/converter/cli.py +++ b/src/zarr/core/metadata/converter/cli.py @@ -1,8 +1,9 @@ -from typing import Annotated +from typing import Annotated, Literal, 
cast import typer -from zarr.core.metadata.converter.converter_v2_v3 import convert_v2_to_v3 +from zarr.core.metadata.converter.converter_v2_v3 import convert_v2_to_v3, remove_metadata +from zarr.core.sync import sync app = typer.Typer() @@ -24,8 +25,27 @@ def convert( @app.command() # type: ignore[misc] -def clear() -> None: - print("Clearing...") +def clear( + store: Annotated[ + str, + typer.Argument( + help="Store or path to directory in file system or name of zip file e.g. 'data/example-1.zarr', 's3://example-bucket/example'..." + ), + ], + zarr_format: Annotated[ + int, + typer.Argument( + help="Which format's metadata to remove - 2 or 3.", + min=2, + max=3, + ), + ], + path: Annotated[str | None, typer.Option(help="The path within the store to open")] = None, +) -> None: + """Remove all v2 (.zarray, .zattrs, .zgroup, .zmetadata) or v3 (zarr.json) metadata files from the given Zarr. + Note - this will remove metadata files at all levels of the hierarchy (every group and array). + """ + sync(remove_metadata(store=store, zarr_format=cast(Literal[2, 3], zarr_format), path=path)) if __name__ == "__main__": diff --git a/tests/test_metadata/test_converter_v2_v3.py b/tests/test_metadata/test_converter_v2_v3.py index e0348be777..21b3e94c17 100644 --- a/tests/test_metadata/test_converter_v2_v3.py +++ b/tests/test_metadata/test_converter_v2_v3.py @@ -1,4 +1,5 @@ import lzma +from pathlib import Path from typing import Any import numcodecs @@ -23,6 +24,93 @@ runner = CliRunner() +def create_nested_zarr(store: Store, attributes: dict[str, Any], separator: str) -> list[str]: + """Create a zarr with nested groups / arrays, returning the paths to all.""" + + # 3 levels of nested groups + group_0 = zarr.create_group(store=store, zarr_format=2, attributes=attributes) + group_1 = group_0.create_group(name="group_1", attributes=attributes) + group_2 = group_1.create_group(name="group_2", attributes=attributes) + paths = [group_0.path, group_1.path, group_2.path] + + # 1 array per group + for i, group in enumerate([group_0, group_1, group_2]): + array = group.create_array( + name=f"array_{i}", + shape=(10, 10), + chunks=(5, 5), + dtype="uint16", + attributes=attributes, + chunk_key_encoding={"name": "v2", "separator": separator}, + ) + array[:] = 1 + paths.append(array.path) + + return paths + + +@pytest.fixture +def expected_paths_no_metadata() -> list[Path]: + """Expected paths from create_nested_zarr, with no metadata files""" + return [ + Path("array_0"), + Path("array_0/0.0"), + Path("array_0/0.1"), + Path("array_0/1.0"), + Path("array_0/1.1"), + Path("group_1"), + Path("group_1/array_1"), + Path("group_1/array_1/0.0"), + Path("group_1/array_1/0.1"), + Path("group_1/array_1/1.0"), + Path("group_1/array_1/1.1"), + Path("group_1/group_2"), + Path("group_1/group_2/array_2"), + Path("group_1/group_2/array_2/0.0"), + Path("group_1/group_2/array_2/0.1"), + Path("group_1/group_2/array_2/1.0"), + Path("group_1/group_2/array_2/1.1"), + ] + + +@pytest.fixture +def expected_paths_v3_metadata(expected_paths_no_metadata: list[Path]) -> list[Path]: + """Expected paths from create_nested_zarr, with v3 metadata files""" + v3_paths = [ + Path("array_0/zarr.json"), + Path("group_1/array_1/zarr.json"), + Path("group_1/group_2/array_2/zarr.json"), + Path("zarr.json"), + Path("group_1/zarr.json"), + Path("group_1/group_2/zarr.json"), + ] + expected_paths_no_metadata.extend(v3_paths) + + return sorted(expected_paths_no_metadata) + + +@pytest.fixture +def expected_paths_v2_metadata(expected_paths_no_metadata: list[Path]) 
-> list[Path]: + """Expected paths from create_nested_zarr, with v2 metadata files""" + v2_paths = [ + Path("array_0/.zarray"), + Path("array_0/.zattrs"), + Path("group_1/array_1/.zarray"), + Path("group_1/array_1/.zattrs"), + Path("group_1/group_2/array_2/.zarray"), + Path("group_1/group_2/array_2/.zattrs"), + Path(".zgroup"), + Path(".zattrs"), + Path("group_1/.zgroup"), + Path("group_1/.zattrs"), + Path("group_1/group_2/.zgroup"), + Path("group_1/group_2/.zattrs"), + ] + expected_paths_no_metadata.extend(v2_paths) + + return sorted(expected_paths_no_metadata) + + def test_convert_array(local_store: Store) -> None: shape = (10, 10) chunks = (10, 10) @@ -80,31 +168,6 @@ def test_convert_group(local_store: Store) -> None: assert metadata.consolidated_metadata is None -def create_nested_zarr(store: Store, attributes: dict[str, Any], separator: str) -> list[str]: - """Create a zarr with nested groups / arrays, returning the paths to all.""" - - # 3 levels of nested groups - group_0 = zarr.create_group(store=store, zarr_format=2, attributes=attributes) - group_1 = group_0.create_group(name="group_1", attributes=attributes) - group_2 = group_1.create_group(name="group_2", attributes=attributes) - paths = [group_0.path, group_1.path, group_2.path] - - # 1 array per group - for i, group in enumerate([group_0, group_1, group_2]): - array = group.create_array( - name=f"array_{i}", - shape=(10, 10), - chunks=(5, 5), - dtype="uint16", - attributes=attributes, - chunk_key_encoding={"name": "v2", "separator": separator}, - ) - array[:] = 1 - paths.append(array.path) - - return paths - - @pytest.mark.parametrize("separator", [".", "/"]) def test_convert_nested_groups_and_arrays(local_store: Store, separator: str) -> None: """Test that zarr.json are made at the correct points in a hierarchy of groups and arrays @@ -376,3 +439,43 @@ def test_convert_incorrect_compressor(local_store: Store) -> None: str(result.exception) == "Compressor is not a BytesBytesCodec" ) + + +def test_remove_metadata_v2(local_store: Store, expected_paths_no_metadata: list[Path]) -> None: + """Test all v2 metadata can be removed (leaving all groups / arrays as-is)""" + + attributes = {"baz": 42, "qux": [1, 4, 7, 12]} + create_nested_zarr(local_store, attributes, ".") + + result = runner.invoke(app, ["clear", str(local_store.root), "2"]) + assert result.exit_code == 0 + + # check metadata files removed, but all groups / arrays still remain + paths = sorted(local_store.root.rglob("*")) + + expected_paths = [local_store.root / p for p in expected_paths_no_metadata] + assert paths == expected_paths + + +@pytest.mark.parametrize( + ("zarr_format", "expected_paths"), + [("2", "expected_paths_v3_metadata"), ("3", "expected_paths_v2_metadata")], +) +def test_remove_metadata_after_conversion( + local_store: Store, request: pytest.FixtureRequest, zarr_format: str, expected_paths: list[Path] +) -> None: + """Test all v2/v3 metadata can be removed after metadata conversion (all groups / arrays / + metadata of other versions should remain as-is)""" + + attributes = {"baz": 42, "qux": [1, 4, 7, 12]} + create_nested_zarr(local_store, attributes, ".") + + # convert v2 metadata to v3 (so now both v2 and v3 metadata present!), then remove either the v2 or v3 metadata + result = runner.invoke(app, ["convert", str(local_store.root)]) + assert result.exit_code == 0 + result = runner.invoke(app, ["clear", str(local_store.root), zarr_format]) + assert result.exit_code == 0 + + paths = sorted(local_store.root.rglob("*")) + expected_paths = 
[local_store.root / p for p in request.getfixturevalue(expected_paths)] + assert paths == expected_paths From dfdc729fa25bd81a762cd48ce8e5348ea020527e Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Mon, 7 Jul 2025 15:59:40 +0100 Subject: [PATCH 17/57] add test for metadata removal with path# --- tests/test_metadata/test_converter_v2_v3.py | 23 +++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/tests/test_metadata/test_converter_v2_v3.py b/tests/test_metadata/test_converter_v2_v3.py index 21b3e94c17..05592b0f55 100644 --- a/tests/test_metadata/test_converter_v2_v3.py +++ b/tests/test_metadata/test_converter_v2_v3.py @@ -457,6 +457,29 @@ def test_remove_metadata_v2(local_store: Store, expected_paths_no_metadata: list assert paths == expected_paths +def test_remove_metadata_v2_with_path( + local_store: Store, expected_paths_no_metadata: list[Path] +) -> None: + """Test only v2 metadata within the given path (group_1) is removed""" + + attributes = {"baz": 42, "qux": [1, 4, 7, 12]} + create_nested_zarr(local_store, attributes, ".") + + result = runner.invoke(app, ["clear", str(local_store.root), "2", "--path", "group_1"]) + assert result.exit_code == 0 + + # check all metadata files inside group_1 are removed (.zattrs / .zgroup / .zarray should remain only inside the top + # group) + paths = sorted(local_store.root.rglob("*")) + + expected_paths = [local_store.root / p for p in expected_paths_no_metadata] + expected_paths.append(local_store.root / ".zattrs") + expected_paths.append(local_store.root / ".zgroup") + expected_paths.append(local_store.root / "array_0" / ".zarray") + expected_paths.append(local_store.root / "array_0" / ".zattrs") + assert paths == sorted(expected_paths) + + @pytest.mark.parametrize( ("zarr_format", "expected_paths"), [("2", "expected_paths_v3_metadata"), ("3", "expected_paths_v2_metadata")], From ad6099183466fc932674acb02fd6992dedcb8839 Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Mon, 7 Jul 2025 17:37:46 +0100 Subject: [PATCH 18/57] add verbose logging option --- src/zarr/core/metadata/converter/cli.py | 20 +++++++++++++++++++ .../metadata/converter/converter_v2_v3.py | 14 ++++++++----- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/src/zarr/core/metadata/converter/cli.py b/src/zarr/core/metadata/converter/cli.py index bfa2c6cf04..8339e502c2 100644 --- a/src/zarr/core/metadata/converter/cli.py +++ b/src/zarr/core/metadata/converter/cli.py @@ -1,3 +1,4 @@ +import logging from typing import Annotated, Literal, cast import typer @@ -48,5 +49,24 @@ def clear( sync(remove_metadata(store=store, zarr_format=cast(Literal[2, 3], zarr_format), path=path)) +@app.callback() # type: ignore[misc] +def main( + verbose: Annotated[ + bool, + typer.Option( + help="enable verbose logging - will print info about metadata files being deleted / saved." + ), + ] = False, +) -> None: + """ + Convert metadata from v2 to v3. See available commands below - access help for individual commands with + cli.py COMMAND --help. 
+ """ + if verbose: + lvl = logging.INFO + fmt = "%(message)s" + logging.basicConfig(level=lvl, format=fmt) + + if __name__ == "__main__": app() diff --git a/src/zarr/core/metadata/converter/converter_v2_v3.py b/src/zarr/core/metadata/converter/converter_v2_v3.py index 4c7b1e6048..03793c7c34 100644 --- a/src/zarr/core/metadata/converter/converter_v2_v3.py +++ b/src/zarr/core/metadata/converter/converter_v2_v3.py @@ -1,4 +1,5 @@ import asyncio +import logging from typing import Any, cast import numcodecs.abc @@ -30,6 +31,8 @@ from zarr.storage import StoreLike from zarr.storage._common import make_store_path +logger = logging.getLogger(__name__) + def convert_v2_to_v3( store: StoreLike, path: str | None = None, storage_options: dict[str, Any] | None = None @@ -117,11 +120,11 @@ async def remove_metadata( else: metadata_files = [ZARR_JSON] - awaitables = [ - (store_path / file_path).delete() - async for file_path in store_path.store.list_prefix(prefix) - if file_path.split("/")[-1] in metadata_files - ] + awaitables = [] + async for file_path in store_path.store.list_prefix(prefix): + if file_path.split("/")[-1] in metadata_files: + logger.info("Deleting metadata at %s", store_path / file_path) + awaitables.append((store_path / file_path).delete()) await asyncio.gather(*awaitables) @@ -234,5 +237,6 @@ async def _save_v3_metadata( if await zarr_json_path.exists(): raise ValueError(f"{ZARR_JSON} already exists at {zarr_v2.store_path}") + logger.info("Saving metadata to %s", zarr_json_path) to_save = metadata_v3.to_buffer_dict(default_buffer_prototype()) await zarr_json_path.set_if_not_exists(to_save[ZARR_JSON]) From 66bae0d02e9080cba34b9a4af6ac4cd475e4746c Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Tue, 8 Jul 2025 11:43:40 +0100 Subject: [PATCH 19/57] add dry run option to cli --- src/zarr/core/metadata/converter/cli.py | 52 ++++++++++++++++--- .../metadata/converter/converter_v2_v3.py | 32 ++++++++---- 2 files changed, 69 insertions(+), 15 deletions(-) diff --git a/src/zarr/core/metadata/converter/cli.py b/src/zarr/core/metadata/converter/cli.py index 8339e502c2..24a3de7937 100644 --- a/src/zarr/core/metadata/converter/cli.py +++ b/src/zarr/core/metadata/converter/cli.py @@ -8,6 +8,21 @@ app = typer.Typer() +logger = logging.getLogger(__name__) + + +def _set_logging_config(verbose: bool) -> None: + if verbose: + lvl = logging.INFO + else: + lvl = logging.WARNING + fmt = "%(message)s" + logging.basicConfig(level=lvl, format=fmt) + + +def _set_verbose_level() -> None: + logging.getLogger().setLevel(logging.INFO) + @app.command() # type: ignore[misc] def convert( @@ -18,11 +33,23 @@ def convert( ), ], path: Annotated[str | None, typer.Option(help="The path within the store to open")] = None, + dry_run: Annotated[ + bool, + typer.Option( + help="Enable a dry-run: files that would be converted are logged, but no new files are actually created." + ), + ] = False, ) -> None: """Convert all v2 metadata in a zarr hierarchy to v3. This will create a zarr.json file at each level (for every group / array). V2 files (.zarray, .zattrs etc.) will be left as-is. """ - convert_v2_to_v3(store=store, path=path) + if dry_run: + _set_verbose_level() + logger.info( + "Dry run enabled - no new files will be created. 
Log of files that would be created on a real run:" + ) + + convert_v2_to_v3(store=store, path=path, dry_run=dry_run) @app.command() # type: ignore[misc] @@ -42,11 +69,27 @@ def clear( ), ], path: Annotated[str | None, typer.Option(help="The path within the store to open")] = None, + dry_run: Annotated[ + bool, + typer.Option( + help="Enable a dry-run: files that would be deleted are logged, but no files are actually removed." + ), + ] = False, ) -> None: """Remove all v2 (.zarray, .zattrs, .zgroup, .zmetadata) or v3 (zarr.json) metadata files from the given Zarr. Note - this will remove metadata files at all levels of the hierarchy (every group and array). """ - sync(remove_metadata(store=store, zarr_format=cast(Literal[2, 3], zarr_format), path=path)) + if dry_run: + _set_verbose_level() + logger.info( + "Dry run enabled - no files will be deleted. Log of files that would be deleted on a real run:" + ) + + sync( + remove_metadata( + store=store, zarr_format=cast(Literal[2, 3], zarr_format), path=path, dry_run=dry_run + ) + ) @app.callback() # type: ignore[misc] @@ -62,10 +105,7 @@ def main( Convert metadata from v2 to v3. See available commands below - access help for individual commands with cli.py COMMAND --help. """ - if verbose: - lvl = logging.INFO - fmt = "%(message)s" - logging.basicConfig(level=lvl, format=fmt) + _set_logging_config(verbose) if __name__ == "__main__": diff --git a/src/zarr/core/metadata/converter/converter_v2_v3.py b/src/zarr/core/metadata/converter/converter_v2_v3.py index 03793c7c34..f5d0e8f010 100644 --- a/src/zarr/core/metadata/converter/converter_v2_v3.py +++ b/src/zarr/core/metadata/converter/converter_v2_v3.py @@ -35,7 +35,10 @@ def convert_v2_to_v3( - store: StoreLike, path: str | None = None, storage_options: dict[str, Any] | None = None + store: StoreLike, + path: str | None = None, + storage_options: dict[str, Any] | None = None, + dry_run: bool = False, ) -> None: """Convert all v2 metadata in a zarr hierarchy to v3. This will create a zarr.json file at each level (for every group / array). V2 files (.zarray, .zattrs etc.) will be left as-is. @@ -49,13 +52,15 @@ def convert_v2_to_v3( storage_options : dict | None, optional If the store is backed by an fsspec-based implementation, then this dict will be passed to the Store constructor for that implementation. Ignored otherwise. + dry_run : bool, optional + Enable a 'dry run' - files that would be created are logged, but no files are actually created. """ zarr_v2 = zarr.open(store=store, mode="r+", path=path, storage_options=storage_options) - convert_array_or_group(zarr_v2) + convert_array_or_group(zarr_v2, dry_run=dry_run) -def convert_array_or_group(zarr_v2: Array | Group) -> None: +def convert_array_or_group(zarr_v2: Array | Group, dry_run: bool = False) -> None: """Convert all v2 metadata in a zarr array/group to v3. Note - if a group is provided, then all arrays / groups within this group will also be converted. A zarr.json file will be created at each level, with any V2 files (.zarray, .zattrs etc.) left as-is. @@ -64,6 +69,8 @@ def convert_array_or_group(zarr_v2: Array | Group) -> None: ---------- zarr_v2 : Array | Group An array or group with zarr_format = 2 + dry_run : bool, optional + Enable a 'dry run' - files that would be created are logged, but no files are actually created. 
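+
+    Examples
+    --------
+    A rough sketch of direct use from Python (the store path here is hypothetical and
+    assumes an existing zarr v2 hierarchy)::
+
+        import zarr
+        from zarr.core.metadata.converter.converter_v2_v3 import convert_array_or_group
+
+        # open the existing v2 array / group, then write v3 metadata alongside it
+        zarr_v2 = zarr.open("data/example.zarr", mode="r+")
+        convert_array_or_group(zarr_v2, dry_run=True)  # only logs; drop dry_run to write zarr.json files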
""" if not zarr_v2.metadata.zarr_format == 2: raise TypeError("Only arrays / groups with zarr v2 metadata can be converted") @@ -71,18 +78,18 @@ def convert_array_or_group(zarr_v2: Array | Group) -> None: if isinstance(zarr_v2.metadata, GroupMetadata): # process members of the group for key in zarr_v2: - convert_array_or_group(zarr_v2[key]) + convert_array_or_group(zarr_v2[key], dry_run=dry_run) # write group's converted metadata group_metadata_v3 = GroupMetadata( attributes=zarr_v2.metadata.attributes, zarr_format=3, consolidated_metadata=None ) - sync(_save_v3_metadata(zarr_v2, group_metadata_v3)) + sync(_save_v3_metadata(zarr_v2, group_metadata_v3, dry_run=dry_run)) else: # write array's converted metadata array_metadata_v3 = _convert_array_metadata(zarr_v2.metadata) - sync(_save_v3_metadata(zarr_v2, array_metadata_v3)) + sync(_save_v3_metadata(zarr_v2, array_metadata_v3, dry_run=dry_run)) async def remove_metadata( @@ -90,6 +97,7 @@ async def remove_metadata( zarr_format: ZarrFormat, path: str | None = None, storage_options: dict[str, Any] | None = None, + dry_run: bool = False, ) -> None: """Remove all v2 (.zarray, .zattrs, .zgroup, .zmetadata) or v3 (zarr.json) metadata files from the given Zarr. Note - this will remove metadata files at all levels of the hierarchy (every group and array). @@ -105,6 +113,8 @@ async def remove_metadata( storage_options : dict | None, optional If the store is backed by an fsspec-based implementation, then this dict will be passed to the Store constructor for that implementation. Ignored otherwise. + dry_run : bool, optional + Enable a 'dry run' - files that would be deleted are logged, but no files are actually removed. """ store_path = await make_store_path(store, mode="r+", storage_options=storage_options) if not store_path.store.supports_deletes: @@ -124,7 +134,9 @@ async def remove_metadata( async for file_path in store_path.store.list_prefix(prefix): if file_path.split("/")[-1] in metadata_files: logger.info("Deleting metadata at %s", store_path / file_path) - awaitables.append((store_path / file_path).delete()) + + if not dry_run: + awaitables.append((store_path / file_path).delete()) await asyncio.gather(*awaitables) @@ -231,7 +243,7 @@ def _find_numcodecs_zarr3(numcodecs_codec: numcodecs.abc.Codec) -> Codec: async def _save_v3_metadata( - zarr_v2: Array | Group, metadata_v3: ArrayV3Metadata | GroupMetadata + zarr_v2: Array | Group, metadata_v3: ArrayV3Metadata | GroupMetadata, dry_run: bool = False ) -> None: zarr_json_path = zarr_v2.store_path / ZARR_JSON if await zarr_json_path.exists(): @@ -239,4 +251,6 @@ async def _save_v3_metadata( logger.info("Saving metadata to %s", zarr_json_path) to_save = metadata_v3.to_buffer_dict(default_buffer_prototype()) - await zarr_json_path.set_if_not_exists(to_save[ZARR_JSON]) + + if not dry_run: + await zarr_json_path.set_if_not_exists(to_save[ZARR_JSON]) From 97df9bf0e29999ddfac06bbe11391cb13f4da023 Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Tue, 8 Jul 2025 11:58:12 +0100 Subject: [PATCH 20/57] add test for dry-run --- tests/test_metadata/test_converter_v2_v3.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/tests/test_metadata/test_converter_v2_v3.py b/tests/test_metadata/test_converter_v2_v3.py index 05592b0f55..029453cfeb 100644 --- a/tests/test_metadata/test_converter_v2_v3.py +++ b/tests/test_metadata/test_converter_v2_v3.py @@ -502,3 +502,24 @@ def test_remove_metadata_after_conversion( paths = 
sorted(local_store.root.rglob("*")) expected_paths = [local_store.root / p for p in request.getfixturevalue(expected_paths)] assert paths == expected_paths + + +@pytest.mark.parametrize("cli_command", ["convert", "clear"]) +def test_dry_run( + local_store: Store, cli_command: str, expected_paths_v2_metadata: list[Path] +) -> None: + """Test that all files are un-changed after a dry run""" + + attributes = {"baz": 42, "qux": [1, 4, 7, 12]} + create_nested_zarr(local_store, attributes, ".") + + if cli_command == "convert": + result = runner.invoke(app, ["convert", str(local_store.root), "--dry-run"]) + else: + result = runner.invoke(app, ["clear", str(local_store.root), "2", "--dry-run"]) + + assert result.exit_code == 0 + + paths = sorted(local_store.root.rglob("*")) + expected_paths = [local_store.root / p for p in expected_paths_v2_metadata] + assert paths == expected_paths From 42e0435ad7c4f3321fd7797b060160ea447b79e6 Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Wed, 9 Jul 2025 12:39:58 +0100 Subject: [PATCH 21/57] add zarr-converter script and enable cli dep in tests --- pyproject.toml | 5 ++- src/zarr/core/metadata/converter/cli.py | 2 +- tests/conftest.py | 23 ++++++++++ tests/test_codec_entrypoints.py | 18 -------- tests/test_dtype_registry.py | 22 ---------- tests/test_metadata/test_converter_v2_v3.py | 47 ++++++++++++--------- 6 files changed, 54 insertions(+), 63 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index eb95c5dada..6273b3ca58 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -110,6 +110,9 @@ docs = [ 'astroid<4' ] +[project.scripts] +zarr-converter = "zarr.core.metadata.converter.cli:app" + [project.urls] "Bug Tracker" = "https://github.com/zarr-developers/zarr-python/issues" @@ -156,7 +159,7 @@ deps = ["minimal", "optional"] [tool.hatch.envs.test.overrides] matrix.deps.dependencies = [ - {value = "zarr[remote, remote_tests, test, optional]", if = ["optional"]} + {value = "zarr[remote, remote_tests, test, optional, cli]", if = ["optional"]} ] [tool.hatch.envs.test.scripts] diff --git a/src/zarr/core/metadata/converter/cli.py b/src/zarr/core/metadata/converter/cli.py index 24a3de7937..21ffaef20f 100644 --- a/src/zarr/core/metadata/converter/cli.py +++ b/src/zarr/core/metadata/converter/cli.py @@ -103,7 +103,7 @@ def main( ) -> None: """ Convert metadata from v2 to v3. See available commands below - access help for individual commands with - cli.py COMMAND --help. + zarr-converter COMMAND --help. 
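+
+    For example, with the zarr-converter entry point installed and a hypothetical store at
+    data/example.zarr::
+
+        zarr-converter --verbose convert data/example.zarr
+        zarr-converter --verbose clear data/example.zarr 2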
""" _set_logging_config(verbose) diff --git a/tests/conftest.py b/tests/conftest.py index 4d300a1fd4..d8262ab086 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2,6 +2,7 @@ import os import pathlib +import sys from dataclasses import dataclass, field from typing import TYPE_CHECKING @@ -10,6 +11,7 @@ import pytest from hypothesis import HealthCheck, Verbosity, settings +import zarr.registry from zarr import AsyncGroup, config from zarr.abc.store import Store from zarr.codecs.sharding import ShardingCodec, ShardingCodecIndexLocation @@ -175,6 +177,27 @@ def zarr_format(request: pytest.FixtureRequest) -> ZarrFormat: raise ValueError(msg) +def _clear_registries() -> None: + registries = zarr.registry._collect_entrypoints() + for registry in registries: + registry.lazy_load_list.clear() + + +@pytest.fixture +def set_path() -> Generator[None, None, None]: + tests_dir = str(pathlib.Path(__file__).parent.absolute()) + sys.path.append(tests_dir) + _clear_registries() + zarr.registry._collect_entrypoints() + + yield + + sys.path.remove(tests_dir) + _clear_registries() + zarr.registry._collect_entrypoints() + config.reset() + + def pytest_addoption(parser: Any) -> None: parser.addoption( "--run-slow-hypothesis", diff --git a/tests/test_codec_entrypoints.py b/tests/test_codec_entrypoints.py index e1ef027dd4..fc7b79fe54 100644 --- a/tests/test_codec_entrypoints.py +++ b/tests/test_codec_entrypoints.py @@ -1,26 +1,8 @@ -import os.path -import sys -from collections.abc import Generator - import pytest import zarr.registry from zarr import config -here = os.path.abspath(os.path.dirname(__file__)) - - -@pytest.fixture -def set_path() -> Generator[None, None, None]: - sys.path.append(here) - zarr.registry._collect_entrypoints() - yield - sys.path.remove(here) - registries = zarr.registry._collect_entrypoints() - for registry in registries: - registry.lazy_load_list.clear() - config.reset() - @pytest.mark.usefixtures("set_path") @pytest.mark.parametrize("codec_name", ["TestEntrypointCodec", "TestEntrypointGroup.Codec"]) diff --git a/tests/test_dtype_registry.py b/tests/test_dtype_registry.py index d4e37440a7..c40d4d98b1 100644 --- a/tests/test_dtype_registry.py +++ b/tests/test_dtype_registry.py @@ -1,16 +1,12 @@ from __future__ import annotations import re -import sys -from pathlib import Path from typing import TYPE_CHECKING, Any, get_args import numpy as np import pytest -import zarr from tests.conftest import skip_object_dtype -from zarr.core.config import config from zarr.core.dtype import ( AnyDType, Bool, @@ -29,8 +25,6 @@ ) if TYPE_CHECKING: - from collections.abc import Generator - from zarr.core.common import ZarrFormat from .test_dtype.conftest import zdtype_examples @@ -147,22 +141,6 @@ def test_match_dtype_unique( data_type_registry_fixture.match_json(instance_dict, zarr_format=zarr_format) -# this is copied from the registry tests -- we should deduplicate -here = str(Path(__file__).parent.absolute()) - - -@pytest.fixture -def set_path() -> Generator[None, None, None]: - sys.path.append(here) - zarr.registry._collect_entrypoints() - yield - sys.path.remove(here) - registries = zarr.registry._collect_entrypoints() - for registry in registries: - registry.lazy_load_list.clear() - config.reset() - - @pytest.mark.usefixtures("set_path") def test_entrypoint_dtype(zarr_format: ZarrFormat) -> None: from package_with_entrypoint import TestDataType diff --git a/tests/test_metadata/test_converter_v2_v3.py b/tests/test_metadata/test_converter_v2_v3.py index 029453cfeb..efc831ff28 100644 --- 
a/tests/test_metadata/test_converter_v2_v3.py +++ b/tests/test_metadata/test_converter_v2_v3.py @@ -6,7 +6,6 @@ import numcodecs.abc import pytest from numcodecs.zarr3 import LZMA, Delta -from typer.testing import CliRunner import zarr from zarr.abc.codec import Codec @@ -19,9 +18,15 @@ from zarr.core.chunk_grids import RegularChunkGrid from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding from zarr.core.dtype.npy.int import BaseInt, UInt8, UInt16 -from zarr.core.metadata.converter.cli import app -runner = CliRunner() +typer_testing = pytest.importorskip( + "typer.testing", reason="optional cli dependencies aren't installed" +) +cli = pytest.importorskip( + "zarr.core.metadata.converter.cli", reason="optional cli dependencies aren't installed" +) + +runner = typer_testing.CliRunner() def create_nested_zarr(store: Store, attributes: dict[str, Any], separator: str) -> list[str]: @@ -130,7 +135,7 @@ def test_convert_array(local_store: Store) -> None: attributes=attributes, ) - result = runner.invoke(app, ["convert", str(local_store.root)]) + result = runner.invoke(cli.app, ["convert", str(local_store.root)]) assert result.exit_code == 0 assert (local_store.root / "zarr.json").exists() @@ -156,7 +161,7 @@ def test_convert_group(local_store: Store) -> None: attributes = {"baz": 42, "qux": [1, 4, 7, 12]} zarr.create_group(store=local_store, zarr_format=2, attributes=attributes) - result = runner.invoke(app, ["convert", str(local_store.root)]) + result = runner.invoke(cli.app, ["convert", str(local_store.root)]) assert result.exit_code == 0 assert (local_store.root / "zarr.json").exists() @@ -176,7 +181,7 @@ def test_convert_nested_groups_and_arrays(local_store: Store, separator: str) -> attributes = {"baz": 42, "qux": [1, 4, 7, 12]} paths = create_nested_zarr(local_store, attributes, separator) - result = runner.invoke(app, ["convert", str(local_store.root)]) + result = runner.invoke(cli.app, ["convert", str(local_store.root)]) assert result.exit_code == 0 # check zarr.json were created for every group and array @@ -206,7 +211,7 @@ def test_convert_nested_with_path(local_store: Store, separator: str) -> None: create_nested_zarr(local_store, {}, separator) - result = runner.invoke(app, ["convert", str(local_store.root), "--path", "group_1"]) + result = runner.invoke(cli.app, ["convert", str(local_store.root), "--path", "group_1"]) assert result.exit_code == 0 group_path = local_store.root / "group_1" @@ -255,7 +260,7 @@ def test_convert_compressor( fill_value=0, ) - result = runner.invoke(app, ["convert", str(local_store.root)]) + result = runner.invoke(cli.app, ["convert", str(local_store.root)]) assert result.exit_code == 0 assert (local_store.root / "zarr.json").exists() @@ -283,7 +288,7 @@ def test_convert_filter(local_store: Store) -> None: fill_value=0, ) - result = runner.invoke(app, ["convert", str(local_store.root)]) + result = runner.invoke(cli.app, ["convert", str(local_store.root)]) assert result.exit_code == 0 assert (local_store.root / "zarr.json").exists() @@ -317,7 +322,7 @@ def test_convert_C_vs_F_order( order=order, ) - result = runner.invoke(app, ["convert", str(local_store.root)]) + result = runner.invoke(cli.app, ["convert", str(local_store.root)]) assert result.exit_code == 0 assert (local_store.root / "zarr.json").exists() @@ -349,7 +354,7 @@ def test_convert_endian( fill_value=0, ) - result = runner.invoke(app, ["convert", str(local_store.root)]) + result = runner.invoke(cli.app, ["convert", str(local_store.root)]) assert result.exit_code == 0 assert 
(local_store.root / "zarr.json").exists() @@ -371,7 +376,7 @@ def test_convert_v3(local_store: Store, node_type: str) -> None: else: zarr.create_group(store=local_store, zarr_format=3) - result = runner.invoke(app, ["convert", str(local_store.root)]) + result = runner.invoke(cli.app, ["convert", str(local_store.root)]) assert result.exit_code == 1 assert isinstance(result.exception, TypeError) assert str(result.exception) == "Only arrays / groups with zarr v2 metadata can be converted" @@ -390,7 +395,7 @@ def test_convert_unknown_codec(local_store: Store) -> None: fill_value=0, ) - result = runner.invoke(app, ["convert", str(local_store.root)]) + result = runner.invoke(cli.app, ["convert", str(local_store.root)]) assert result.exit_code == 1 assert isinstance(result.exception, ValueError) assert ( @@ -411,7 +416,7 @@ def test_convert_incorrect_filter(local_store: Store) -> None: fill_value=0, ) - result = runner.invoke(app, ["convert", str(local_store.root)]) + result = runner.invoke(cli.app, ["convert", str(local_store.root)]) assert result.exit_code == 1 assert isinstance(result.exception, TypeError) assert ( @@ -432,7 +437,7 @@ def test_convert_incorrect_compressor(local_store: Store) -> None: fill_value=0, ) - result = runner.invoke(app, ["convert", str(local_store.root)]) + result = runner.invoke(cli.app, ["convert", str(local_store.root)]) assert result.exit_code == 1 assert isinstance(result.exception, TypeError) assert ( @@ -447,7 +452,7 @@ def test_remove_metadata_v2(local_store: Store, expected_paths_no_metadata: list attributes = {"baz": 42, "qux": [1, 4, 7, 12]} create_nested_zarr(local_store, attributes, ".") - result = runner.invoke(app, ["clear", str(local_store.root), "2"]) + result = runner.invoke(cli.app, ["clear", str(local_store.root), "2"]) assert result.exit_code == 0 # check metadata files removed, but all groups / arrays still remain @@ -465,7 +470,7 @@ def test_remove_metadata_v2_with_path( attributes = {"baz": 42, "qux": [1, 4, 7, 12]} create_nested_zarr(local_store, attributes, ".") - result = runner.invoke(app, ["clear", str(local_store.root), "2", "--path", "group_1"]) + result = runner.invoke(cli.app, ["clear", str(local_store.root), "2", "--path", "group_1"]) assert result.exit_code == 0 # check all metadata files inside group_1 are removed (.zattrs / .zgroup / .zarray should remain only inside the top @@ -494,9 +499,9 @@ def test_remove_metadata_after_conversion( create_nested_zarr(local_store, attributes, ".") # convert v2 metadata to v3 (so now both v2 and v3 metadata present!), then remove either the v2 or v3 metadata - result = runner.invoke(app, ["convert", str(local_store.root)]) + result = runner.invoke(cli.app, ["convert", str(local_store.root)]) assert result.exit_code == 0 - result = runner.invoke(app, ["clear", str(local_store.root), zarr_format]) + result = runner.invoke(cli.app, ["clear", str(local_store.root), zarr_format]) assert result.exit_code == 0 paths = sorted(local_store.root.rglob("*")) @@ -514,9 +519,9 @@ def test_dry_run( create_nested_zarr(local_store, attributes, ".") if cli_command == "convert": - result = runner.invoke(app, ["convert", str(local_store.root), "--dry-run"]) + result = runner.invoke(cli.app, ["convert", str(local_store.root), "--dry-run"]) else: - result = runner.invoke(app, ["clear", str(local_store.root), "2", "--dry-run"]) + result = runner.invoke(cli.app, ["clear", str(local_store.root), "2", "--dry-run"]) assert result.exit_code == 0 From 9e20b3901325f6c21e3615de1a80f144581d7355 Mon Sep 17 00:00:00 2001 From: 
Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Wed, 9 Jul 2025 17:40:31 +0100 Subject: [PATCH 22/57] use v2 chunk key encoding type --- src/zarr/core/metadata/converter/converter_v2_v3.py | 4 ++-- tests/test_metadata/test_converter_v2_v3.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/zarr/core/metadata/converter/converter_v2_v3.py b/src/zarr/core/metadata/converter/converter_v2_v3.py index f5d0e8f010..e2895c757b 100644 --- a/src/zarr/core/metadata/converter/converter_v2_v3.py +++ b/src/zarr/core/metadata/converter/converter_v2_v3.py @@ -13,7 +13,7 @@ from zarr.codecs.zstd import ZstdCodec from zarr.core.array import Array from zarr.core.buffer.core import default_buffer_prototype -from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding +from zarr.core.chunk_key_encodings import V2ChunkKeyEncoding from zarr.core.common import ( ZARR_JSON, ZARRAY_JSON, @@ -142,7 +142,7 @@ async def remove_metadata( def _convert_array_metadata(metadata_v2: ArrayV2Metadata) -> ArrayV3Metadata: - chunk_key_encoding = DefaultChunkKeyEncoding(separator=metadata_v2.dimension_separator) + chunk_key_encoding = V2ChunkKeyEncoding(separator=metadata_v2.dimension_separator) codecs: list[Codec] = [] diff --git a/tests/test_metadata/test_converter_v2_v3.py b/tests/test_metadata/test_converter_v2_v3.py index efc831ff28..11083570c1 100644 --- a/tests/test_metadata/test_converter_v2_v3.py +++ b/tests/test_metadata/test_converter_v2_v3.py @@ -16,7 +16,7 @@ from zarr.codecs.transpose import TransposeCodec from zarr.codecs.zstd import ZstdCodec from zarr.core.chunk_grids import RegularChunkGrid -from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding +from zarr.core.chunk_key_encodings import V2ChunkKeyEncoding from zarr.core.dtype.npy.int import BaseInt, UInt8, UInt16 typer_testing = pytest.importorskip( @@ -145,7 +145,7 @@ def test_convert_array(local_store: Store) -> None: assert metadata.node_type == "array" assert metadata.shape == shape assert metadata.chunk_grid == RegularChunkGrid(chunk_shape=chunks) - assert metadata.chunk_key_encoding == DefaultChunkKeyEncoding(separator=".") + assert metadata.chunk_key_encoding == V2ChunkKeyEncoding(separator=".") assert metadata.data_type == UInt16("little") assert metadata.codecs == ( BytesCodec(endian="little"), From ce409a38fbc5c6aa1c2b51e5c6167abc367f0d3d Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Mon, 14 Jul 2025 10:30:28 +0100 Subject: [PATCH 23/57] update endianness of test data type --- tests/test_metadata/test_converter_v2_v3.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_metadata/test_converter_v2_v3.py b/tests/test_metadata/test_converter_v2_v3.py index 11083570c1..e47ea985bf 100644 --- a/tests/test_metadata/test_converter_v2_v3.py +++ b/tests/test_metadata/test_converter_v2_v3.py @@ -30,7 +30,7 @@ def create_nested_zarr(store: Store, attributes: dict[str, Any], separator: str) -> list[str]: - """Create a zarr with nested groups / arrays, returning the paths to all.""" + """Create a zarr with nested groups / arrays for testing, returning the paths to all.""" # 3 levels of nested groups group_0 = zarr.create_group(store=store, zarr_format=2, attributes=attributes) @@ -146,7 +146,7 @@ def test_convert_array(local_store: Store) -> None: assert metadata.shape == shape assert metadata.chunk_grid == RegularChunkGrid(chunk_shape=chunks) assert metadata.chunk_key_encoding == V2ChunkKeyEncoding(separator=".") - assert 
metadata.data_type == UInt16("little") + assert metadata.data_type == UInt16(endianness="little") assert metadata.codecs == ( BytesCodec(endian="little"), BloscCodec(typesize=2, cname="zstd", clevel=3, shuffle="shuffle", blocksize=0), From 6585f246f5115512a4f47f4ca4627b56ed078dba Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Wed, 16 Jul 2025 12:45:25 +0100 Subject: [PATCH 24/57] check converted arrays can be accessed --- tests/test_metadata/test_converter_v2_v3.py | 31 +++++++++++++++++---- 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/tests/test_metadata/test_converter_v2_v3.py b/tests/test_metadata/test_converter_v2_v3.py index e47ea985bf..3fe32100f0 100644 --- a/tests/test_metadata/test_converter_v2_v3.py +++ b/tests/test_metadata/test_converter_v2_v3.py @@ -240,9 +240,23 @@ def test_convert_nested_with_path(local_store: Store, separator: str) -> None: (numcodecs.GZip(level=3), GzipCodec(level=3)), ( numcodecs.LZMA( - format=1, check=-1, preset=None, filters=[{"id": lzma.FILTER_DELTA, "dist": 4}] + format=lzma.FORMAT_RAW, + check=-1, + preset=None, + filters=[ + {"id": lzma.FILTER_DELTA, "dist": 4}, + {"id": lzma.FILTER_LZMA2, "preset": 1}, + ], + ), + LZMA( + format=lzma.FORMAT_RAW, + check=-1, + preset=None, + filters=[ + {"id": lzma.FILTER_DELTA, "dist": 4}, + {"id": lzma.FILTER_LZMA2, "preset": 1}, + ], ), - LZMA(format=1, check=-1, preset=None, filters=[{"id": lzma.FILTER_DELTA, "dist": 4}]), ), ], ids=["blosc", "zstd", "gzip", "numcodecs-compressor"], @@ -250,7 +264,7 @@ def test_convert_nested_with_path(local_store: Store, separator: str) -> None: def test_convert_compressor( local_store: Store, compressor_v2: numcodecs.abc.Codec, compressor_v3: Codec ) -> None: - zarr.create_array( + zarr_array = zarr.create_array( store=local_store, shape=(10, 10), chunks=(10, 10), @@ -259,6 +273,7 @@ def test_convert_compressor( zarr_format=2, fill_value=0, ) + zarr_array[:] = 1 result = runner.invoke(cli.app, ["convert", str(local_store.root)]) assert result.exit_code == 0 @@ -271,6 +286,7 @@ def test_convert_compressor( BytesCodec(endian="little"), compressor_v3, ) + assert (zarr_array[:] == 1).all() def test_convert_filter(local_store: Store) -> None: @@ -311,7 +327,7 @@ def test_convert_filter(local_store: Store) -> None: def test_convert_C_vs_F_order( local_store: Store, order: str, expected_codecs: tuple[Codec] ) -> None: - zarr.create_array( + zarr_array = zarr.create_array( store=local_store, shape=(10, 10), chunks=(10, 10), @@ -321,6 +337,7 @@ def test_convert_C_vs_F_order( fill_value=0, order=order, ) + zarr_array[:] = 1 result = runner.invoke(cli.app, ["convert", str(local_store.root)]) assert result.exit_code == 0 @@ -329,8 +346,8 @@ def test_convert_C_vs_F_order( zarr_array = zarr.open(local_store.root, zarr_format=3) metadata = zarr_array.metadata assert metadata.zarr_format == 3 - assert metadata.codecs == expected_codecs + assert (zarr_array[:] == 1).all() @pytest.mark.parametrize( @@ -344,7 +361,7 @@ def test_convert_C_vs_F_order( def test_convert_endian( local_store: Store, dtype: str, expected_data_type: BaseInt, expected_codecs: tuple[Codec] ) -> None: - zarr.create_array( + zarr_array = zarr.create_array( store=local_store, shape=(10, 10), chunks=(10, 10), @@ -353,6 +370,7 @@ def test_convert_endian( zarr_format=2, fill_value=0, ) + zarr_array[:] = 1 result = runner.invoke(cli.app, ["convert", str(local_store.root)]) assert result.exit_code == 0 @@ -363,6 +381,7 @@ def test_convert_endian( assert 
metadata.zarr_format == 3 assert metadata.data_type == expected_data_type assert metadata.codecs == expected_codecs + assert (zarr_array[:] == 1).all() @pytest.mark.parametrize("node_type", ["array", "group"]) From 08fc138d4a2ac5c18042c2526e608c6642cd0f5b Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Wed, 16 Jul 2025 16:58:41 +0100 Subject: [PATCH 25/57] remove uses of pathlib walk, as it didn't exist in python 3.11 --- tests/test_metadata/test_converter_v2_v3.py | 114 +++++++++++--------- 1 file changed, 62 insertions(+), 52 deletions(-) diff --git a/tests/test_metadata/test_converter_v2_v3.py b/tests/test_metadata/test_converter_v2_v3.py index 3fe32100f0..275bb0a4ed 100644 --- a/tests/test_metadata/test_converter_v2_v3.py +++ b/tests/test_metadata/test_converter_v2_v3.py @@ -79,39 +79,57 @@ def expected_paths_no_metadata() -> list[Path]: @pytest.fixture -def expected_paths_v3_metadata(expected_paths_no_metadata: list[Path]) -> list[Path]: - """Expected paths from create_nested_zarr, with v3 metadata files""" - v3_paths = [ - Path("array_0/zarr.json"), - Path("group_1/array_1/zarr.json"), - Path("group_1/group_2/array_2/zarr.json"), - Path("zarr.json"), - Path("group_1/zarr.json"), - Path("group_1/group_2/zarr.json"), - ] - expected_paths_no_metadata.extend(v3_paths) +def expected_v3_metadata() -> list[Path]: + """Expected v3 metadata for create_nested_zarr""" + return sorted( + [ + Path("array_0/zarr.json"), + Path("group_1/array_1/zarr.json"), + Path("group_1/group_2/array_2/zarr.json"), + Path("zarr.json"), + Path("group_1/zarr.json"), + Path("group_1/group_2/zarr.json"), + ] + ) + + +@pytest.fixture +def expected_v2_metadata() -> list[Path]: + """Expected v2 metadata for create_nested_zarr""" + return sorted( + [ + Path("array_0/.zarray"), + Path("array_0/.zattrs"), + Path("group_1/array_1/.zarray"), + Path("group_1/array_1/.zattrs"), + Path("group_1/group_2/array_2/.zarray"), + Path("group_1/group_2/array_2/.zattrs"), + Path(".zgroup"), + Path(".zattrs"), + Path("group_1/.zgroup"), + Path("group_1/.zattrs"), + Path("group_1/group_2/.zgroup"), + Path("group_1/group_2/.zattrs"), + ] + ) + + +@pytest.fixture +def expected_paths_v3_metadata( + expected_paths_no_metadata: list[Path], expected_v3_metadata: list[Path] +) -> list[Path]: + """Expected paths from create_nested_zarr + v3 metadata files""" + expected_paths_no_metadata.extend(expected_v3_metadata) return sorted(expected_paths_no_metadata) @pytest.fixture -def expected_paths_v2_metadata(expected_paths_no_metadata: list[Path]) -> list[Path]: - """Expected paths from create_nested_zarr, with v2 metadata files""" - v2_paths = [ - Path("array_0/.zarray"), - Path("array_0/.zattrs"), - Path("group_1/array_1/.zarray"), - Path("group_1/array_1/.zattrs"), - Path("group_1/group_2/array_2/.zarray"), - Path("group_1/group_2/array_2/.zattrs"), - Path(".zgroup"), - Path(".zattrs"), - Path("group_1/.zgroup"), - Path("group_1/.zattrs"), - Path("group_1/group_2/.zgroup"), - Path("group_1/group_2/.zattrs"), - ] - expected_paths_no_metadata.extend(v2_paths) +def expected_paths_v2_metadata( + expected_paths_no_metadata: list[Path], expected_v2_metadata: list[Path] +) -> list[Path]: + """Expected paths from create_nested_zarr + v2 metadata files""" + expected_paths_no_metadata.extend(expected_v2_metadata) return sorted(expected_paths_no_metadata) @@ -174,7 +192,9 @@ def test_convert_group(local_store: Store) -> None: @pytest.mark.parametrize("separator", [".", "/"]) -def 
test_convert_nested_groups_and_arrays(local_store: Store, separator: str) -> None: +def test_convert_nested_groups_and_arrays( + local_store: Store, separator: str, expected_v3_metadata: list[Path] +) -> None: """Test that zarr.json are made at the correct points in a hierarchy of groups and arrays (including when there are additional dirs due to using a / separator)""" @@ -184,17 +204,9 @@ def test_convert_nested_groups_and_arrays(local_store: Store, separator: str) -> result = runner.invoke(cli.app, ["convert", str(local_store.root)]) assert result.exit_code == 0 - # check zarr.json were created for every group and array - total_zarr_jsons = 0 - for _, _, filenames in local_store.root.walk(): - # group / array directories - if ".zattrs" in filenames: - assert "zarr.json" in filenames - total_zarr_jsons += 1 - # other directories e.g. for chunks when separator is / - else: - assert "zarr.json" not in filenames - assert total_zarr_jsons == 6 + zarr_json_paths = sorted(local_store.root.rglob("zarr.json")) + expected_zarr_json_paths = [local_store.root / p for p in expected_v3_metadata] + assert zarr_json_paths == expected_zarr_json_paths # Check converted zarr can be opened + metadata accessed at all levels zarr_array = zarr.open(local_store.root, zarr_format=3) @@ -206,7 +218,9 @@ def test_convert_nested_groups_and_arrays(local_store: Store, separator: str) -> @pytest.mark.parametrize("separator", [".", "/"]) -def test_convert_nested_with_path(local_store: Store, separator: str) -> None: +def test_convert_nested_with_path( + local_store: Store, separator: str, expected_v3_metadata: list[Path] +) -> None: """Test that only arrays/groups within group_1 are converted (+ no other files in store)""" create_nested_zarr(local_store, {}, separator) @@ -216,17 +230,13 @@ def test_convert_nested_with_path(local_store: Store, separator: str) -> None: group_path = local_store.root / "group_1" - total_zarr_jsons = 0 - for dirpath, _, filenames in local_store.root.walk(): - inside_group = (dirpath == group_path) or (group_path in dirpath.parents) - if (".zattrs" in filenames) and inside_group: - # group / array directories inside the group - assert "zarr.json" in filenames - total_zarr_jsons += 1 - else: - assert "zarr.json" not in filenames - - assert total_zarr_jsons == 4 + zarr_json_paths = sorted(local_store.root.rglob("zarr.json")) + expected_zarr_json_paths = [ + local_store.root / p + for p in expected_v3_metadata + if group_path in (local_store.root / p).parents + ] + assert zarr_json_paths == expected_zarr_json_paths @pytest.mark.parametrize( From 3540434162daf08832cb366bbfbe5de16babadba Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Wed, 16 Jul 2025 17:11:04 +0100 Subject: [PATCH 26/57] include tags in checkout for gpu test, to avoid numcodecs.zarr3 requesting a zarr version greater than 3 --- .github/workflows/gpu_test.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/gpu_test.yml b/.github/workflows/gpu_test.yml index 752440719b..fdd0d27463 100644 --- a/.github/workflows/gpu_test.yml +++ b/.github/workflows/gpu_test.yml @@ -30,6 +30,8 @@ jobs: steps: - uses: actions/checkout@v4 + with: + fetch-depth: 0 # grab all branches and tags # - name: cuda-toolkit # uses: Jimver/cuda-toolkit@v0.2.16 # id: cuda-toolkit From 0889979e57d2c96e3f442f10c52849369a65200d Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Wed, 23 Jul 2025 16:00:20 +0100 Subject: [PATCH 27/57] rename cli 
commands from review comments --- pyproject.toml | 2 +- src/zarr/core/metadata/converter/cli.py | 24 ++++++++++++------- .../{converter_v2_v3.py => migrate_to_v3.py} | 12 +++++----- 3 files changed, 23 insertions(+), 15 deletions(-) rename src/zarr/core/metadata/converter/{converter_v2_v3.py => migrate_to_v3.py} (96%) diff --git a/pyproject.toml b/pyproject.toml index edc16e4e4f..82f78ff6c6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -115,7 +115,7 @@ docs = [ ] [project.scripts] -zarr-converter = "zarr.core.metadata.converter.cli:app" +zarr = "zarr.core.metadata.converter.cli:app" [project.urls] diff --git a/src/zarr/core/metadata/converter/cli.py b/src/zarr/core/metadata/converter/cli.py index 21ffaef20f..0245609d87 100644 --- a/src/zarr/core/metadata/converter/cli.py +++ b/src/zarr/core/metadata/converter/cli.py @@ -3,7 +3,7 @@ import typer -from zarr.core.metadata.converter.converter_v2_v3 import convert_v2_to_v3, remove_metadata +import zarr.core.metadata.converter.migrate_to_v3 as migrate_metadata from zarr.core.sync import sync app = typer.Typer() @@ -25,7 +25,15 @@ def _set_verbose_level() -> None: @app.command() # type: ignore[misc] -def convert( +def migrate( + zarr_format: Annotated[ + int, + typer.Argument( + help="Zarr format to migrate to. Currently only 'v3' is supported.", + min=3, + max=3, + ), + ], store: Annotated[ str, typer.Argument( @@ -40,7 +48,7 @@ def convert( ), ] = False, ) -> None: - """Convert all v2 metadata in a zarr hierarchy to v3. This will create a zarr.json file at each level + """Migrate all v2 metadata in a zarr hierarchy to v3. This will create a zarr.json file at each level (for every group / array). V2 files (.zarray, .zattrs etc.) will be left as-is. """ if dry_run: @@ -49,11 +57,11 @@ def convert( "Dry run enabled - no new files will be created. Log of files that would be created on a real run:" ) - convert_v2_to_v3(store=store, path=path, dry_run=dry_run) + migrate_metadata.migrate_to_v3(store=store, path=path, dry_run=dry_run) @app.command() # type: ignore[misc] -def clear( +def remove_metadata( store: Annotated[ str, typer.Argument( @@ -86,7 +94,7 @@ def clear( ) sync( - remove_metadata( + migrate_metadata.remove_metadata( store=store, zarr_format=cast(Literal[2, 3], zarr_format), path=path, dry_run=dry_run ) ) @@ -102,8 +110,8 @@ def main( ] = False, ) -> None: """ - Convert metadata from v2 to v3. See available commands below - access help for individual commands with - zarr-converter COMMAND --help. + Migrate metadata from v2 to v3. See available commands below - access help for individual commands with + zarr COMMAND --help. """ _set_logging_config(verbose) diff --git a/src/zarr/core/metadata/converter/converter_v2_v3.py b/src/zarr/core/metadata/converter/migrate_to_v3.py similarity index 96% rename from src/zarr/core/metadata/converter/converter_v2_v3.py rename to src/zarr/core/metadata/converter/migrate_to_v3.py index e2895c757b..b74d18cf29 100644 --- a/src/zarr/core/metadata/converter/converter_v2_v3.py +++ b/src/zarr/core/metadata/converter/migrate_to_v3.py @@ -34,13 +34,13 @@ logger = logging.getLogger(__name__) -def convert_v2_to_v3( +def migrate_to_v3( store: StoreLike, path: str | None = None, storage_options: dict[str, Any] | None = None, dry_run: bool = False, ) -> None: - """Convert all v2 metadata in a zarr hierarchy to v3. This will create a zarr.json file at each level + """Migrate all v2 metadata in a zarr hierarchy to v3. This will create a zarr.json file at each level (for every group / array). 
V2 files (.zarray, .zattrs etc.) will be left as-is. Parameters @@ -57,11 +57,11 @@ def convert_v2_to_v3( """ zarr_v2 = zarr.open(store=store, mode="r+", path=path, storage_options=storage_options) - convert_array_or_group(zarr_v2, dry_run=dry_run) + migrate_array_or_group(zarr_v2, dry_run=dry_run) -def convert_array_or_group(zarr_v2: Array | Group, dry_run: bool = False) -> None: - """Convert all v2 metadata in a zarr array/group to v3. Note - if a group is provided, then +def migrate_array_or_group(zarr_v2: Array | Group, dry_run: bool = False) -> None: + """Migrate all v2 metadata in a zarr array/group to v3. Note - if a group is provided, then all arrays / groups within this group will also be converted. A zarr.json file will be created at each level, with any V2 files (.zarray, .zattrs etc.) left as-is. @@ -78,7 +78,7 @@ def convert_array_or_group(zarr_v2: Array | Group, dry_run: bool = False) -> Non if isinstance(zarr_v2.metadata, GroupMetadata): # process members of the group for key in zarr_v2: - convert_array_or_group(zarr_v2[key], dry_run=dry_run) + migrate_array_or_group(zarr_v2[key], dry_run=dry_run) # write group's converted metadata group_metadata_v3 = GroupMetadata( From d906dba46e1dab1a88adcea8ac9452dc0a7561d9 Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Wed, 23 Jul 2025 16:05:15 +0100 Subject: [PATCH 28/57] remove path option --- src/zarr/core/metadata/converter/cli.py | 6 ++---- src/zarr/core/metadata/converter/migrate_to_v3.py | 15 ++------------- 2 files changed, 4 insertions(+), 17 deletions(-) diff --git a/src/zarr/core/metadata/converter/cli.py b/src/zarr/core/metadata/converter/cli.py index 0245609d87..a5246b703b 100644 --- a/src/zarr/core/metadata/converter/cli.py +++ b/src/zarr/core/metadata/converter/cli.py @@ -40,7 +40,6 @@ def migrate( help="Store or path to directory in file system or name of zip file e.g. 'data/example-1.zarr', 's3://example-bucket/example'..." ), ], - path: Annotated[str | None, typer.Option(help="The path within the store to open")] = None, dry_run: Annotated[ bool, typer.Option( @@ -57,7 +56,7 @@ def migrate( "Dry run enabled - no new files will be created. Log of files that would be created on a real run:" ) - migrate_metadata.migrate_to_v3(store=store, path=path, dry_run=dry_run) + migrate_metadata.migrate_to_v3(store=store, dry_run=dry_run) @app.command() # type: ignore[misc] @@ -76,7 +75,6 @@ def remove_metadata( max=3, ), ], - path: Annotated[str | None, typer.Option(help="The path within the store to open")] = None, dry_run: Annotated[ bool, typer.Option( @@ -95,7 +93,7 @@ def remove_metadata( sync( migrate_metadata.remove_metadata( - store=store, zarr_format=cast(Literal[2, 3], zarr_format), path=path, dry_run=dry_run + store=store, zarr_format=cast(Literal[2, 3], zarr_format), dry_run=dry_run ) ) diff --git a/src/zarr/core/metadata/converter/migrate_to_v3.py b/src/zarr/core/metadata/converter/migrate_to_v3.py index b74d18cf29..5234c5a1c6 100644 --- a/src/zarr/core/metadata/converter/migrate_to_v3.py +++ b/src/zarr/core/metadata/converter/migrate_to_v3.py @@ -36,7 +36,6 @@ def migrate_to_v3( store: StoreLike, - path: str | None = None, storage_options: dict[str, Any] | None = None, dry_run: bool = False, ) -> None: @@ -47,8 +46,6 @@ def migrate_to_v3( ---------- store : StoreLike Store or path to directory in file system or name of zip file. 
- path : str | None, optional - The path within the store to open, by default None storage_options : dict | None, optional If the store is backed by an fsspec-based implementation, then this dict will be passed to the Store constructor for that implementation. Ignored otherwise. @@ -56,7 +53,7 @@ def migrate_to_v3( Enable a 'dry run' - files that would be created are logged, but no files are actually created. """ - zarr_v2 = zarr.open(store=store, mode="r+", path=path, storage_options=storage_options) + zarr_v2 = zarr.open(store=store, mode="r+", storage_options=storage_options) migrate_array_or_group(zarr_v2, dry_run=dry_run) @@ -95,7 +92,6 @@ def migrate_array_or_group(zarr_v2: Array | Group, dry_run: bool = False) -> Non async def remove_metadata( store: StoreLike, zarr_format: ZarrFormat, - path: str | None = None, storage_options: dict[str, Any] | None = None, dry_run: bool = False, ) -> None: @@ -108,8 +104,6 @@ async def remove_metadata( Store or path to directory in file system or name of zip file. zarr_format : ZarrFormat Which format's metadata to remove - 2 or 3. - path : str | None, optional - The path within the store to open, by default None storage_options : dict | None, optional If the store is backed by an fsspec-based implementation, then this dict will be passed to the Store constructor for that implementation. Ignored otherwise. @@ -120,18 +114,13 @@ async def remove_metadata( if not store_path.store.supports_deletes: raise ValueError("Store must support deletes to remove metadata") - if path is None: - prefix = "" - else: - prefix = path - if zarr_format == 2: metadata_files = [ZARRAY_JSON, ZATTRS_JSON, ZGROUP_JSON, ZMETADATA_V2_JSON] else: metadata_files = [ZARR_JSON] awaitables = [] - async for file_path in store_path.store.list_prefix(prefix): + async for file_path in store_path.store.list_prefix(""): if file_path.split("/")[-1] in metadata_files: logger.info("Deleting metadata at %s", store_path / file_path) From 5e03e3cae307274b20547ca09bfb036cef7f3a08 Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Thu, 24 Jul 2025 09:38:20 +0100 Subject: [PATCH 29/57] allow metadata to be written to a separate store location --- src/zarr/core/metadata/converter/cli.py | 25 +++++++-- .../core/metadata/converter/migrate_to_v3.py | 54 +++++++++++++------ 2 files changed, 57 insertions(+), 22 deletions(-) diff --git a/src/zarr/core/metadata/converter/cli.py b/src/zarr/core/metadata/converter/cli.py index a5246b703b..db7d917e44 100644 --- a/src/zarr/core/metadata/converter/cli.py +++ b/src/zarr/core/metadata/converter/cli.py @@ -34,12 +34,25 @@ def migrate( max=3, ), ], - store: Annotated[ + input_store: Annotated[ str, typer.Argument( - help="Store or path to directory in file system or name of zip file e.g. 'data/example-1.zarr', 's3://example-bucket/example'..." + help=( + "Input Zarr to migrate - should be a store, path to directory in file system or name of zip file " + "e.g. 'data/example-1.zarr', 's3://example-bucket/example'..." + ) ), ], + output_store: Annotated[ + str | None, + typer.Argument( + help=( + "Output location to write generated metadata (no chunks will be copied). If not provided, " + "metadata will be written to input_store. Should be a store, path to directory in file system " + "or name of zip file e.g. 'data/example-1.zarr', 's3://example-bucket/example'..." 
+ ) + ), + ] = None, dry_run: Annotated[ bool, typer.Option( @@ -47,8 +60,8 @@ def migrate( ), ] = False, ) -> None: - """Migrate all v2 metadata in a zarr hierarchy to v3. This will create a zarr.json file at each level - (for every group / array). V2 files (.zarray, .zattrs etc.) will be left as-is. + """Migrate all v2 metadata in a zarr hierarchy to v3. This will create a zarr.json file for each level + (every group / array). V2 files (.zarray, .zattrs etc.) will be left as-is. """ if dry_run: _set_verbose_level() @@ -56,7 +69,9 @@ def migrate( "Dry run enabled - no new files will be created. Log of files that would be created on a real run:" ) - migrate_metadata.migrate_to_v3(store=store, dry_run=dry_run) + migrate_metadata.migrate_to_v3( + input_store=input_store, output_store=output_store, dry_run=dry_run + ) @app.command() # type: ignore[misc] diff --git a/src/zarr/core/metadata/converter/migrate_to_v3.py b/src/zarr/core/metadata/converter/migrate_to_v3.py index 5234c5a1c6..7a95fdb50c 100644 --- a/src/zarr/core/metadata/converter/migrate_to_v3.py +++ b/src/zarr/core/metadata/converter/migrate_to_v3.py @@ -29,13 +29,14 @@ from zarr.core.sync import sync from zarr.registry import get_codec_class from zarr.storage import StoreLike -from zarr.storage._common import make_store_path +from zarr.storage._common import StorePath, make_store_path logger = logging.getLogger(__name__) def migrate_to_v3( - store: StoreLike, + input_store: StoreLike, + output_store: StoreLike | None = None, storage_options: dict[str, Any] | None = None, dry_run: bool = False, ) -> None: @@ -44,28 +45,47 @@ def migrate_to_v3( Parameters ---------- - store : StoreLike - Store or path to directory in file system or name of zip file. + input_store : StoreLike + Input Zarr to migrate - should be a store, path to directory in file system or name of zip file. + output_store : StoreLike + Output location to write v3 metadata (no chunks will be copied). If not provided, v3 metadata will be + written to input_store. Should be a store, path to directory in file system or name of zip file. storage_options : dict | None, optional If the store is backed by an fsspec-based implementation, then this dict will be passed to - the Store constructor for that implementation. Ignored otherwise. + the Store constructor for that implementation. Ignored otherwise. Note - the same storage_options will + be passed to both input_store and output_store (if provided). dry_run : bool, optional Enable a 'dry run' - files that would be created are logged, but no files are actually created. """ - zarr_v2 = zarr.open(store=store, mode="r+", storage_options=storage_options) - migrate_array_or_group(zarr_v2, dry_run=dry_run) + zarr_v2 = zarr.open(store=input_store, mode="r+", storage_options=storage_options) + + if output_store is not None: + # w- access to not allow overwrite of existing data + output_path = sync( + make_store_path(output_store, mode="w-", storage_options=storage_options) + ) + else: + output_path = zarr_v2.store_path + + migrate_array_or_group(zarr_v2, output_path, dry_run=dry_run) -def migrate_array_or_group(zarr_v2: Array | Group, dry_run: bool = False) -> None: - """Migrate all v2 metadata in a zarr array/group to v3. Note - if a group is provided, then - all arrays / groups within this group will also be converted. A zarr.json file will be created - at each level, with any V2 files (.zarray, .zattrs etc.) left as-is. 
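+
+    A rough sketch of direct use (hypothetical paths; this mirrors what migrate_to_v3 does internally)::
+
+        import zarr
+        from zarr.core.sync import sync
+        from zarr.storage._common import make_store_path
+
+        zarr_v2 = zarr.open("data/example.zarr", mode="r+")
+        output_path = sync(make_store_path("data/example-v3", mode="w-"))
+        migrate_array_or_group(zarr_v2, output_path, dry_run=True)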
+def migrate_array_or_group( + zarr_v2: Array | Group, output_path: StorePath, dry_run: bool = False +) -> None: + """Migrate all v2 metadata in a zarr array/group to v3. + + Note - if a group is provided, then all arrays / groups within this group will also be converted. + A zarr.json file will be created for each level and written to output_path, with any V2 files + (.zarray, .zattrs etc.) left as-is. Parameters ---------- zarr_v2 : Array | Group An array or group with zarr_format = 2 + output_path : StorePath + The store path to write generated v3 metadata to. dry_run : bool, optional Enable a 'dry run' - files that would be created are logged, but no files are actually created. """ @@ -75,18 +95,18 @@ def migrate_array_or_group(zarr_v2: Array | Group, dry_run: bool = False) -> Non if isinstance(zarr_v2.metadata, GroupMetadata): # process members of the group for key in zarr_v2: - migrate_array_or_group(zarr_v2[key], dry_run=dry_run) + migrate_array_or_group(zarr_v2[key], output_path=output_path / key, dry_run=dry_run) # write group's converted metadata group_metadata_v3 = GroupMetadata( attributes=zarr_v2.metadata.attributes, zarr_format=3, consolidated_metadata=None ) - sync(_save_v3_metadata(zarr_v2, group_metadata_v3, dry_run=dry_run)) + sync(_save_v3_metadata(group_metadata_v3, output_path, dry_run=dry_run)) else: # write array's converted metadata array_metadata_v3 = _convert_array_metadata(zarr_v2.metadata) - sync(_save_v3_metadata(zarr_v2, array_metadata_v3, dry_run=dry_run)) + sync(_save_v3_metadata(array_metadata_v3, output_path, dry_run=dry_run)) async def remove_metadata( @@ -232,11 +252,11 @@ def _find_numcodecs_zarr3(numcodecs_codec: numcodecs.abc.Codec) -> Codec: async def _save_v3_metadata( - zarr_v2: Array | Group, metadata_v3: ArrayV3Metadata | GroupMetadata, dry_run: bool = False + metadata_v3: ArrayV3Metadata | GroupMetadata, output_path: StorePath, dry_run: bool = False ) -> None: - zarr_json_path = zarr_v2.store_path / ZARR_JSON + zarr_json_path = output_path / ZARR_JSON if await zarr_json_path.exists(): - raise ValueError(f"{ZARR_JSON} already exists at {zarr_v2.store_path}") + raise ValueError(f"{ZARR_JSON} already exists at {zarr_json_path}") logger.info("Saving metadata to %s", zarr_json_path) to_save = metadata_v3.to_buffer_dict(default_buffer_prototype()) From 89aa095fc1d4462928f7acbec835583e805a3cda Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Thu, 24 Jul 2025 10:01:19 +0100 Subject: [PATCH 30/57] add overwrite and remove-v2-metadata options --- src/zarr/core/metadata/converter/cli.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/zarr/core/metadata/converter/cli.py b/src/zarr/core/metadata/converter/cli.py index db7d917e44..0003853a38 100644 --- a/src/zarr/core/metadata/converter/cli.py +++ b/src/zarr/core/metadata/converter/cli.py @@ -53,6 +53,14 @@ def migrate( ) ), ] = None, + overwrite: Annotated[ + bool, + typer.Option(help="Overwrite any existing v3 metadata at the output location."), + ] = False, + remove_v2_metadata: Annotated[ + bool, + typer.Option(help="Remove v2 metadata (if any) from the output location."), + ] = False, dry_run: Annotated[ bool, typer.Option( @@ -69,10 +77,18 @@ def migrate( "Dry run enabled - no new files will be created. 
Log of files that would be created on a real run:" ) + write_store = output_store if output_store is not None else input_store + + if overwrite: + sync(migrate_metadata.remove_metadata(write_store, 3, dry_run=dry_run)) + migrate_metadata.migrate_to_v3( input_store=input_store, output_store=output_store, dry_run=dry_run ) + if remove_v2_metadata: + sync(migrate_metadata.remove_metadata(write_store, 2, dry_run=dry_run)) + @app.command() # type: ignore[misc] def remove_metadata( From ade9c3befb423b095f68f658fea6b61f0b3bf95e Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Thu, 24 Jul 2025 11:30:12 +0100 Subject: [PATCH 31/57] add force option --- src/zarr/core/metadata/converter/cli.py | 39 ++++++++++++--- .../core/metadata/converter/migrate_to_v3.py | 50 ++++++++++++++++--- 2 files changed, 73 insertions(+), 16 deletions(-) diff --git a/src/zarr/core/metadata/converter/cli.py b/src/zarr/core/metadata/converter/cli.py index 0003853a38..1ae94bb2e8 100644 --- a/src/zarr/core/metadata/converter/cli.py +++ b/src/zarr/core/metadata/converter/cli.py @@ -53,18 +53,31 @@ def migrate( ) ), ] = None, + dry_run: Annotated[ + bool, + typer.Option( + help="Enable a dry-run: files that would be converted are logged, but no new files are actually created." + ), + ] = False, overwrite: Annotated[ bool, - typer.Option(help="Overwrite any existing v3 metadata at the output location."), + typer.Option( + help="Remove any existing v3 metadata at the output location, before migration starts." + ), ] = False, - remove_v2_metadata: Annotated[ + force: Annotated[ bool, - typer.Option(help="Remove v2 metadata (if any) from the output location."), + typer.Option( + help=( + "Only used when --overwrite is given. Allows v3 metadata to be removed when no valid " + "v2 metadata exists at the output location." + ) + ), ] = False, - dry_run: Annotated[ + remove_v2_metadata: Annotated[ bool, typer.Option( - help="Enable a dry-run: files that would be converted are logged, but no new files are actually created." + help="Remove v2 metadata (if any) from the output location, after migration is complete." ), ] = False, ) -> None: @@ -80,14 +93,15 @@ def migrate( write_store = output_store if output_store is not None else input_store if overwrite: - sync(migrate_metadata.remove_metadata(write_store, 3, dry_run=dry_run)) + sync(migrate_metadata.remove_metadata(write_store, 3, force=force, dry_run=dry_run)) migrate_metadata.migrate_to_v3( input_store=input_store, output_store=output_store, dry_run=dry_run ) if remove_v2_metadata: - sync(migrate_metadata.remove_metadata(write_store, 2, dry_run=dry_run)) + # There should always be valid v3 metadata at the output location after migration, so force=False + sync(migrate_metadata.remove_metadata(write_store, 2, force=False, dry_run=dry_run)) @app.command() # type: ignore[misc] @@ -106,6 +120,15 @@ def remove_metadata( max=3, ), ], + force: Annotated[ + bool, + typer.Option( + help=( + "Allow metadata to be deleted when no valid alternative exists e.g. allow deletion of v2 metadata, " + "when no v3 metadata is present." 
+ ) + ), + ] = False, dry_run: Annotated[ bool, typer.Option( @@ -124,7 +147,7 @@ def remove_metadata( sync( migrate_metadata.remove_metadata( - store=store, zarr_format=cast(Literal[2, 3], zarr_format), dry_run=dry_run + store=store, zarr_format=cast(Literal[2, 3], zarr_format), force=force, dry_run=dry_run ) ) diff --git a/src/zarr/core/metadata/converter/migrate_to_v3.py b/src/zarr/core/metadata/converter/migrate_to_v3.py index 7a95fdb50c..75bd9a86ad 100644 --- a/src/zarr/core/metadata/converter/migrate_to_v3.py +++ b/src/zarr/core/metadata/converter/migrate_to_v3.py @@ -1,6 +1,6 @@ import asyncio import logging -from typing import Any, cast +from typing import Any, Literal, cast import numcodecs.abc @@ -40,8 +40,10 @@ def migrate_to_v3( storage_options: dict[str, Any] | None = None, dry_run: bool = False, ) -> None: - """Migrate all v2 metadata in a zarr hierarchy to v3. This will create a zarr.json file at each level - (for every group / array). V2 files (.zarray, .zattrs etc.) will be left as-is. + """Migrate all v2 metadata in a zarr hierarchy to v3. + + This will create a zarr.json file at each level (for every group / array). V2 files (.zarray, .zattrs etc.) + will be left as-is. Parameters ---------- @@ -113,9 +115,11 @@ async def remove_metadata( store: StoreLike, zarr_format: ZarrFormat, storage_options: dict[str, Any] | None = None, + force: bool = False, dry_run: bool = False, ) -> None: """Remove all v2 (.zarray, .zattrs, .zgroup, .zmetadata) or v3 (zarr.json) metadata files from the given Zarr. + Note - this will remove metadata files at all levels of the hierarchy (every group and array). Parameters @@ -127,6 +131,10 @@ async def remove_metadata( storage_options : dict | None, optional If the store is backed by an fsspec-based implementation, then this dict will be passed to the Store constructor for that implementation. Ignored otherwise. + force : bool, optional + When False, metadata can only be removed if a valid alternative exists e.g. deletion of v2 metadata will + only be allowed when v3 metadata is also present. When True, metadata can be removed when there is no + alternative. dry_run : bool, optional Enable a 'dry run' - files that would be deleted are logged, but no files are actually removed. """ @@ -134,22 +142,48 @@ async def remove_metadata( if not store_path.store.supports_deletes: raise ValueError("Store must support deletes to remove metadata") + metadata_files_all = { + 2: [ZARRAY_JSON, ZATTRS_JSON, ZGROUP_JSON, ZMETADATA_V2_JSON], + 3: [ZARR_JSON], + } + if zarr_format == 2: - metadata_files = [ZARRAY_JSON, ZATTRS_JSON, ZGROUP_JSON, ZMETADATA_V2_JSON] + alternative_metadata = 3 else: - metadata_files = [ZARR_JSON] + alternative_metadata = 2 awaitables = [] - async for file_path in store_path.store.list_prefix(""): - if file_path.split("/")[-1] in metadata_files: - logger.info("Deleting metadata at %s", store_path / file_path) + async for file_path in store_path.store.list(): + parent_path, _, file_name = file_path.rpartition("/") + + if file_name not in metadata_files_all[zarr_format]: + continue + if force or await _metadata_exists( + cast(Literal[2, 3], alternative_metadata), store_path / parent_path + ): + logger.info("Deleting metadata at %s", store_path / file_path) if not dry_run: awaitables.append((store_path / file_path).delete()) + else: + raise ValueError( + f"Cannot remove v{zarr_format} metadata at {store_path / file_path} - no v{alternative_metadata} " + "metadata exists. To delete anyway, use the 'force' option." 
+ ) await asyncio.gather(*awaitables) +async def _metadata_exists(zarr_format: ZarrFormat, store_path: StorePath) -> bool: + metadata_files_required = {2: [ZARRAY_JSON, ZGROUP_JSON], 3: [ZARR_JSON]} + + for metadata_file in metadata_files_required[zarr_format]: + if await (store_path / metadata_file).exists(): + return True + + return False + + def _convert_array_metadata(metadata_v2: ArrayV2Metadata) -> ArrayV3Metadata: chunk_key_encoding = V2ChunkKeyEncoding(separator=metadata_v2.dimension_separator) From 218e8a88578b3cbebc7231c45ed8c2b65025fa8b Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Thu, 24 Jul 2025 11:53:28 +0100 Subject: [PATCH 32/57] use v2, v3 format for CLI --- src/zarr/core/metadata/converter/cli.py | 32 +++++++++++++++---------- 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/src/zarr/core/metadata/converter/cli.py b/src/zarr/core/metadata/converter/cli.py index 1ae94bb2e8..785384917d 100644 --- a/src/zarr/core/metadata/converter/cli.py +++ b/src/zarr/core/metadata/converter/cli.py @@ -1,4 +1,5 @@ import logging +from enum import Enum from typing import Annotated, Literal, cast import typer @@ -24,14 +25,23 @@ def _set_verbose_level() -> None: logging.getLogger().setLevel(logging.INFO) +class ZarrFormat(str, Enum): + v2 = "v2" + v3 = "v3" + + +class ZarrFormatV3(str, Enum): + """Limit CLI choice to only V3""" + + v3 = "v3" + + @app.command() # type: ignore[misc] def migrate( zarr_format: Annotated[ - int, + ZarrFormatV3, typer.Argument( help="Zarr format to migrate to. Currently only 'v3' is supported.", - min=3, - max=3, ), ], input_store: Annotated[ @@ -113,12 +123,8 @@ def remove_metadata( ), ], zarr_format: Annotated[ - int, - typer.Argument( - help="Which format's metadata to remove - 2 or 3.", - min=2, - max=3, - ), + ZarrFormat, + typer.Argument(help="Which format's metadata to remove - v2 or v3."), ], force: Annotated[ bool, @@ -147,7 +153,10 @@ def remove_metadata( sync( migrate_metadata.remove_metadata( - store=store, zarr_format=cast(Literal[2, 3], zarr_format), force=force, dry_run=dry_run + store=store, + zarr_format=cast(Literal[2, 3], int(zarr_format[1:])), + force=force, + dry_run=dry_run, ) ) @@ -162,8 +171,7 @@ def main( ] = False, ) -> None: """ - Migrate metadata from v2 to v3. See available commands below - access help for individual commands with - zarr COMMAND --help. + See available commands below - access help for individual commands with zarr COMMAND --help. 
""" _set_logging_config(verbose) From 49787f6a04812b9af0422078a87447aaac2455b3 Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Thu, 24 Jul 2025 12:07:22 +0100 Subject: [PATCH 33/57] split into convert_group and convert_array functions --- src/zarr/core/metadata/converter/cli.py | 2 +- .../core/metadata/converter/migrate_to_v3.py | 40 ++++++++++--------- 2 files changed, 23 insertions(+), 19 deletions(-) diff --git a/src/zarr/core/metadata/converter/cli.py b/src/zarr/core/metadata/converter/cli.py index 785384917d..7495ab2803 100644 --- a/src/zarr/core/metadata/converter/cli.py +++ b/src/zarr/core/metadata/converter/cli.py @@ -105,7 +105,7 @@ def migrate( if overwrite: sync(migrate_metadata.remove_metadata(write_store, 3, force=force, dry_run=dry_run)) - migrate_metadata.migrate_to_v3( + migrate_metadata.migrate_v2_to_v3( input_store=input_store, output_store=output_store, dry_run=dry_run ) diff --git a/src/zarr/core/metadata/converter/migrate_to_v3.py b/src/zarr/core/metadata/converter/migrate_to_v3.py index 75bd9a86ad..d4cdebe3e7 100644 --- a/src/zarr/core/metadata/converter/migrate_to_v3.py +++ b/src/zarr/core/metadata/converter/migrate_to_v3.py @@ -34,7 +34,7 @@ logger = logging.getLogger(__name__) -def migrate_to_v3( +def migrate_v2_to_v3( input_store: StoreLike, output_store: StoreLike | None = None, storage_options: dict[str, Any] | None = None, @@ -70,12 +70,10 @@ def migrate_to_v3( else: output_path = zarr_v2.store_path - migrate_array_or_group(zarr_v2, output_path, dry_run=dry_run) + migrate_to_v3(zarr_v2, output_path, dry_run=dry_run) -def migrate_array_or_group( - zarr_v2: Array | Group, output_path: StorePath, dry_run: bool = False -) -> None: +def migrate_to_v3(zarr_v2: Array | Group, output_path: StorePath, dry_run: bool = False) -> None: """Migrate all v2 metadata in a zarr array/group to v3. Note - if a group is provided, then all arrays / groups within this group will also be converted. 
@@ -95,20 +93,9 @@ def migrate_array_or_group( raise TypeError("Only arrays / groups with zarr v2 metadata can be converted") if isinstance(zarr_v2.metadata, GroupMetadata): - # process members of the group - for key in zarr_v2: - migrate_array_or_group(zarr_v2[key], output_path=output_path / key, dry_run=dry_run) - - # write group's converted metadata - group_metadata_v3 = GroupMetadata( - attributes=zarr_v2.metadata.attributes, zarr_format=3, consolidated_metadata=None - ) - sync(_save_v3_metadata(group_metadata_v3, output_path, dry_run=dry_run)) - + _convert_group(zarr_v2, output_path, dry_run) else: - # write array's converted metadata - array_metadata_v3 = _convert_array_metadata(zarr_v2.metadata) - sync(_save_v3_metadata(array_metadata_v3, output_path, dry_run=dry_run)) + _convert_array(zarr_v2, output_path, dry_run) async def remove_metadata( @@ -174,6 +161,23 @@ async def remove_metadata( await asyncio.gather(*awaitables) +def _convert_group(zarr_v2: Group, output_path: StorePath, dry_run: bool) -> None: + # process members of the group + for key in zarr_v2: + migrate_to_v3(zarr_v2[key], output_path=output_path / key, dry_run=dry_run) + + # write group's converted metadata + group_metadata_v3 = GroupMetadata( + attributes=zarr_v2.metadata.attributes, zarr_format=3, consolidated_metadata=None + ) + sync(_save_v3_metadata(group_metadata_v3, output_path, dry_run=dry_run)) + + +def _convert_array(zarr_v2: Array, output_path: StorePath, dry_run: bool) -> None: + array_metadata_v3 = _convert_array_metadata(cast(ArrayV2Metadata, zarr_v2.metadata)) + sync(_save_v3_metadata(array_metadata_v3, output_path, dry_run=dry_run)) + + async def _metadata_exists(zarr_format: ZarrFormat, store_path: StorePath) -> bool: metadata_files_required = {2: [ZARRAY_JSON, ZGROUP_JSON], 3: [ZARR_JSON]} From 488485cbd7b66b7538d99b63e35e256f0b4f8fee Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Thu, 24 Jul 2025 13:15:18 +0100 Subject: [PATCH 34/57] update command names in converter tests --- src/zarr/core/metadata/converter/cli.py | 8 ++-- tests/test_metadata/test_converter_v2_v3.py | 46 +++++++++++---------- 2 files changed, 29 insertions(+), 25 deletions(-) diff --git a/src/zarr/core/metadata/converter/cli.py b/src/zarr/core/metadata/converter/cli.py index 7495ab2803..07132313c7 100644 --- a/src/zarr/core/metadata/converter/cli.py +++ b/src/zarr/core/metadata/converter/cli.py @@ -116,16 +116,16 @@ def migrate( @app.command() # type: ignore[misc] def remove_metadata( + zarr_format: Annotated[ + ZarrFormat, + typer.Argument(help="Which format's metadata to remove - v2 or v3."), + ], store: Annotated[ str, typer.Argument( help="Store or path to directory in file system or name of zip file e.g. 'data/example-1.zarr', 's3://example-bucket/example'..." 
), ], - zarr_format: Annotated[ - ZarrFormat, - typer.Argument(help="Which format's metadata to remove - v2 or v3."), - ], force: Annotated[ bool, typer.Option( diff --git a/tests/test_metadata/test_converter_v2_v3.py b/tests/test_metadata/test_converter_v2_v3.py index 275bb0a4ed..51ef95db85 100644 --- a/tests/test_metadata/test_converter_v2_v3.py +++ b/tests/test_metadata/test_converter_v2_v3.py @@ -153,7 +153,7 @@ def test_convert_array(local_store: Store) -> None: attributes=attributes, ) - result = runner.invoke(cli.app, ["convert", str(local_store.root)]) + result = runner.invoke(cli.app, ["migrate", "v3", str(local_store.root)]) assert result.exit_code == 0 assert (local_store.root / "zarr.json").exists() @@ -179,7 +179,7 @@ def test_convert_group(local_store: Store) -> None: attributes = {"baz": 42, "qux": [1, 4, 7, 12]} zarr.create_group(store=local_store, zarr_format=2, attributes=attributes) - result = runner.invoke(cli.app, ["convert", str(local_store.root)]) + result = runner.invoke(cli.app, ["migrate", "v3", str(local_store.root)]) assert result.exit_code == 0 assert (local_store.root / "zarr.json").exists() @@ -201,7 +201,7 @@ def test_convert_nested_groups_and_arrays( attributes = {"baz": 42, "qux": [1, 4, 7, 12]} paths = create_nested_zarr(local_store, attributes, separator) - result = runner.invoke(cli.app, ["convert", str(local_store.root)]) + result = runner.invoke(cli.app, ["migrate", "v3", str(local_store.root)]) assert result.exit_code == 0 zarr_json_paths = sorted(local_store.root.rglob("zarr.json")) @@ -225,7 +225,7 @@ def test_convert_nested_with_path( create_nested_zarr(local_store, {}, separator) - result = runner.invoke(cli.app, ["convert", str(local_store.root), "--path", "group_1"]) + result = runner.invoke(cli.app, ["migrate", "v3", str(local_store.root), "--path", "group_1"]) assert result.exit_code == 0 group_path = local_store.root / "group_1" @@ -285,7 +285,7 @@ def test_convert_compressor( ) zarr_array[:] = 1 - result = runner.invoke(cli.app, ["convert", str(local_store.root)]) + result = runner.invoke(cli.app, ["migrate", "v3", str(local_store.root)]) assert result.exit_code == 0 assert (local_store.root / "zarr.json").exists() @@ -314,7 +314,7 @@ def test_convert_filter(local_store: Store) -> None: fill_value=0, ) - result = runner.invoke(cli.app, ["convert", str(local_store.root)]) + result = runner.invoke(cli.app, ["migrate", "v3", str(local_store.root)]) assert result.exit_code == 0 assert (local_store.root / "zarr.json").exists() @@ -349,7 +349,7 @@ def test_convert_C_vs_F_order( ) zarr_array[:] = 1 - result = runner.invoke(cli.app, ["convert", str(local_store.root)]) + result = runner.invoke(cli.app, ["migrate", "v3", str(local_store.root)]) assert result.exit_code == 0 assert (local_store.root / "zarr.json").exists() @@ -382,7 +382,7 @@ def test_convert_endian( ) zarr_array[:] = 1 - result = runner.invoke(cli.app, ["convert", str(local_store.root)]) + result = runner.invoke(cli.app, ["migrate", "v3", str(local_store.root)]) assert result.exit_code == 0 assert (local_store.root / "zarr.json").exists() @@ -405,7 +405,7 @@ def test_convert_v3(local_store: Store, node_type: str) -> None: else: zarr.create_group(store=local_store, zarr_format=3) - result = runner.invoke(cli.app, ["convert", str(local_store.root)]) + result = runner.invoke(cli.app, ["migrate", "v3", str(local_store.root)]) assert result.exit_code == 1 assert isinstance(result.exception, TypeError) assert str(result.exception) == "Only arrays / groups with zarr v2 metadata can be 
converted" @@ -424,7 +424,7 @@ def test_convert_unknown_codec(local_store: Store) -> None: fill_value=0, ) - result = runner.invoke(cli.app, ["convert", str(local_store.root)]) + result = runner.invoke(cli.app, ["migrate", "v3", str(local_store.root)]) assert result.exit_code == 1 assert isinstance(result.exception, ValueError) assert ( @@ -445,7 +445,7 @@ def test_convert_incorrect_filter(local_store: Store) -> None: fill_value=0, ) - result = runner.invoke(cli.app, ["convert", str(local_store.root)]) + result = runner.invoke(cli.app, ["migrate", "v3", str(local_store.root)]) assert result.exit_code == 1 assert isinstance(result.exception, TypeError) assert ( @@ -466,7 +466,7 @@ def test_convert_incorrect_compressor(local_store: Store) -> None: fill_value=0, ) - result = runner.invoke(cli.app, ["convert", str(local_store.root)]) + result = runner.invoke(cli.app, ["migrate", "v3", str(local_store.root)]) assert result.exit_code == 1 assert isinstance(result.exception, TypeError) assert ( @@ -481,7 +481,7 @@ def test_remove_metadata_v2(local_store: Store, expected_paths_no_metadata: list attributes = {"baz": 42, "qux": [1, 4, 7, 12]} create_nested_zarr(local_store, attributes, ".") - result = runner.invoke(cli.app, ["clear", str(local_store.root), "2"]) + result = runner.invoke(cli.app, ["remove-metadata", "v2", str(local_store.root)]) assert result.exit_code == 0 # check metadata files removed, but all groups / arrays still remain @@ -499,7 +499,9 @@ def test_remove_metadata_v2_with_path( attributes = {"baz": 42, "qux": [1, 4, 7, 12]} create_nested_zarr(local_store, attributes, ".") - result = runner.invoke(cli.app, ["clear", str(local_store.root), "2", "--path", "group_1"]) + result = runner.invoke( + cli.app, ["remove-metadata", "v2", str(local_store.root), "--path", "group_1"] + ) assert result.exit_code == 0 # check all metadata files inside group_1 are removed (.zattrs / .zgroup / .zarray should remain only inside the top @@ -516,7 +518,7 @@ def test_remove_metadata_v2_with_path( @pytest.mark.parametrize( ("zarr_format", "expected_paths"), - [("2", "expected_paths_v3_metadata"), ("3", "expected_paths_v2_metadata")], + [("v2", "expected_paths_v3_metadata"), ("v3", "expected_paths_v2_metadata")], ) def test_remove_metadata_after_conversion( local_store: Store, request: pytest.FixtureRequest, zarr_format: str, expected_paths: list[Path] @@ -528,9 +530,9 @@ def test_remove_metadata_after_conversion( create_nested_zarr(local_store, attributes, ".") # convert v2 metadata to v3 (so now both v2 and v3 metadata present!), then remove either the v2 or v3 metadata - result = runner.invoke(cli.app, ["convert", str(local_store.root)]) + result = runner.invoke(cli.app, ["migrate", "v3", str(local_store.root)]) assert result.exit_code == 0 - result = runner.invoke(cli.app, ["clear", str(local_store.root), zarr_format]) + result = runner.invoke(cli.app, ["remove-metadata", zarr_format, str(local_store.root)]) assert result.exit_code == 0 paths = sorted(local_store.root.rglob("*")) @@ -538,7 +540,7 @@ def test_remove_metadata_after_conversion( assert paths == expected_paths -@pytest.mark.parametrize("cli_command", ["convert", "clear"]) +@pytest.mark.parametrize("cli_command", ["migrate", "remove-metadata"]) def test_dry_run( local_store: Store, cli_command: str, expected_paths_v2_metadata: list[Path] ) -> None: @@ -547,10 +549,12 @@ def test_dry_run( attributes = {"baz": 42, "qux": [1, 4, 7, 12]} create_nested_zarr(local_store, attributes, ".") - if cli_command == "convert": - result = 
runner.invoke(cli.app, ["convert", str(local_store.root), "--dry-run"]) + if cli_command == "migrate": + result = runner.invoke(cli.app, ["migrate", "v3", str(local_store.root), "--dry-run"]) else: - result = runner.invoke(cli.app, ["clear", str(local_store.root), "2", "--dry-run"]) + result = runner.invoke( + cli.app, ["remove-metadata", "v2", str(local_store.root), "--dry-run"] + ) assert result.exit_code == 0 From 18487c913ae267cb20e084ce794d3c04ff415b35 Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Thu, 24 Jul 2025 13:16:16 +0100 Subject: [PATCH 35/57] update test filename to reflect command name change --- .../{test_converter_v2_v3.py => test_migrate_to_v3.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/test_metadata/{test_converter_v2_v3.py => test_migrate_to_v3.py} (100%) diff --git a/tests/test_metadata/test_converter_v2_v3.py b/tests/test_metadata/test_migrate_to_v3.py similarity index 100% rename from tests/test_metadata/test_converter_v2_v3.py rename to tests/test_metadata/test_migrate_to_v3.py From a5cd760f325781820366cfcf967988ab75c652e7 Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Thu, 24 Jul 2025 13:28:56 +0100 Subject: [PATCH 36/57] fix tests for sub-groups --- tests/test_metadata/test_migrate_to_v3.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/tests/test_metadata/test_migrate_to_v3.py b/tests/test_metadata/test_migrate_to_v3.py index 51ef95db85..cb59f5c086 100644 --- a/tests/test_metadata/test_migrate_to_v3.py +++ b/tests/test_metadata/test_migrate_to_v3.py @@ -218,18 +218,17 @@ def test_convert_nested_groups_and_arrays( @pytest.mark.parametrize("separator", [".", "/"]) -def test_convert_nested_with_path( +def test_convert_sub_group( local_store: Store, separator: str, expected_v3_metadata: list[Path] ) -> None: """Test that only arrays/groups within group_1 are converted (+ no other files in store)""" create_nested_zarr(local_store, {}, separator) + group_path = local_store.root / "group_1" - result = runner.invoke(cli.app, ["migrate", "v3", str(local_store.root), "--path", "group_1"]) + result = runner.invoke(cli.app, ["migrate", "v3", str(group_path)]) assert result.exit_code == 0 - group_path = local_store.root / "group_1" - zarr_json_paths = sorted(local_store.root.rglob("zarr.json")) expected_zarr_json_paths = [ local_store.root / p @@ -491,16 +490,16 @@ def test_remove_metadata_v2(local_store: Store, expected_paths_no_metadata: list assert paths == expected_paths -def test_remove_metadata_v2_with_path( +def test_remove_metadata_sub_group( local_store: Store, expected_paths_no_metadata: list[Path] ) -> None: - """Test only v2 metadata within the given path (group_1) is removed""" + """Test only v2 metadata within group_1 is removed and rest remains un-changed.""" attributes = {"baz": 42, "qux": [1, 4, 7, 12]} create_nested_zarr(local_store, attributes, ".") result = runner.invoke( - cli.app, ["remove-metadata", "v2", str(local_store.root), "--path", "group_1"] + cli.app, ["remove-metadata", "v2", str(local_store.root / "group_1"), "--force"] ) assert result.exit_code == 0 @@ -553,7 +552,7 @@ def test_dry_run( result = runner.invoke(cli.app, ["migrate", "v3", str(local_store.root), "--dry-run"]) else: result = runner.invoke( - cli.app, ["remove-metadata", "v2", str(local_store.root), "--dry-run"] + cli.app, ["remove-metadata", "v2", str(local_store.root), "--force", "--dry-run"] ) assert result.exit_code 
== 0 From bde452f5d7f9de79122317b7dcb9e489778522d3 Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Thu, 24 Jul 2025 13:59:56 +0100 Subject: [PATCH 37/57] add tests for --force --- tests/test_metadata/test_migrate_to_v3.py | 35 +++++++++++++++++------ 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/tests/test_metadata/test_migrate_to_v3.py b/tests/test_metadata/test_migrate_to_v3.py index cb59f5c086..cb1f37c62a 100644 --- a/tests/test_metadata/test_migrate_to_v3.py +++ b/tests/test_metadata/test_migrate_to_v3.py @@ -17,6 +17,7 @@ from zarr.codecs.zstd import ZstdCodec from zarr.core.chunk_grids import RegularChunkGrid from zarr.core.chunk_key_encodings import V2ChunkKeyEncoding +from zarr.core.common import ZarrFormat from zarr.core.dtype.npy.int import BaseInt, UInt8, UInt16 typer_testing = pytest.importorskip( @@ -29,11 +30,13 @@ runner = typer_testing.CliRunner() -def create_nested_zarr(store: Store, attributes: dict[str, Any], separator: str) -> list[str]: +def create_nested_zarr( + store: Store, attributes: dict[str, Any], separator: str, zarr_format: ZarrFormat = 2 +) -> list[str]: """Create a zarr with nested groups / arrays for testing, returning the paths to all.""" # 3 levels of nested groups - group_0 = zarr.create_group(store=store, zarr_format=2, attributes=attributes) + group_0 = zarr.create_group(store=store, zarr_format=zarr_format, attributes=attributes) group_1 = group_0.create_group(name="group_1", attributes=attributes) group_2 = group_1.create_group(name="group_2", attributes=attributes) paths = [group_0.path, group_1.path, group_2.path] @@ -474,18 +477,34 @@ def test_convert_incorrect_compressor(local_store: Store) -> None: ) -def test_remove_metadata_v2(local_store: Store, expected_paths_no_metadata: list[Path]) -> None: - """Test all v2 metadata can be removed (leaving all groups / arrays as-is)""" +@pytest.mark.parametrize("zarr_format", [2, 3]) +def test_remove_metadata_fails_without_force(local_store: Store, zarr_format: ZarrFormat) -> None: + """Test removing metadata (when no alternate metadata is present) fails without --force.""" attributes = {"baz": 42, "qux": [1, 4, 7, 12]} - create_nested_zarr(local_store, attributes, ".") + create_nested_zarr(local_store, attributes, ".", zarr_format=zarr_format) + + result = runner.invoke(cli.app, ["remove-metadata", f"v{zarr_format}", str(local_store.root)]) + assert result.exit_code == 1 + assert isinstance(result.exception, ValueError) + assert str(result.exception).startswith(f"Cannot remove v{zarr_format} metadata at file") + + +@pytest.mark.parametrize("zarr_format", [2, 3]) +def test_remove_metadata_succeeds_with_force( + local_store: Store, zarr_format: ZarrFormat, expected_paths_no_metadata: list[Path] +) -> None: + """Test removing metadata (when no alternate metadata is present) succeeds with --force.""" + + attributes = {"baz": 42, "qux": [1, 4, 7, 12]} + create_nested_zarr(local_store, attributes, ".", zarr_format=zarr_format) - result = runner.invoke(cli.app, ["remove-metadata", "v2", str(local_store.root)]) + result = runner.invoke( + cli.app, ["remove-metadata", f"v{zarr_format}", str(local_store.root), "--force"] + ) assert result.exit_code == 0 - # check metadata files removed, but all groups / arrays still remain paths = sorted(local_store.root.rglob("*")) - expected_paths = [local_store.root / p for p in expected_paths_no_metadata] assert paths == expected_paths From 671c5e346b671a0a5a99a735977badc34ca6c028 Mon Sep 17 00:00:00 2001 From: 
Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Thu, 24 Jul 2025 15:24:12 +0100 Subject: [PATCH 38/57] add test for migrating to separate output location --- tests/test_metadata/test_migrate_to_v3.py | 66 ++++++++++++++++++----- 1 file changed, 52 insertions(+), 14 deletions(-) diff --git a/tests/test_metadata/test_migrate_to_v3.py b/tests/test_metadata/test_migrate_to_v3.py index cb1f37c62a..397a685876 100644 --- a/tests/test_metadata/test_migrate_to_v3.py +++ b/tests/test_metadata/test_migrate_to_v3.py @@ -19,6 +19,7 @@ from zarr.core.chunk_key_encodings import V2ChunkKeyEncoding from zarr.core.common import ZarrFormat from zarr.core.dtype.npy.int import BaseInt, UInt8, UInt16 +from zarr.storage._local import LocalStore typer_testing = pytest.importorskip( "typer.testing", reason="optional cli dependencies aren't installed" @@ -31,10 +32,16 @@ def create_nested_zarr( - store: Store, attributes: dict[str, Any], separator: str, zarr_format: ZarrFormat = 2 + store: Store, + attributes: dict[str, Any] | None = None, + separator: str = ".", + zarr_format: ZarrFormat = 2, ) -> list[str]: """Create a zarr with nested groups / arrays for testing, returning the paths to all.""" + if attributes is None: + attributes = {"baz": 42, "qux": [1, 4, 7, 12]} + # 3 levels of nested groups group_0 = zarr.create_group(store=store, zarr_format=zarr_format, attributes=attributes) group_1 = group_0.create_group(name="group_1", attributes=attributes) @@ -195,14 +202,14 @@ def test_convert_group(local_store: Store) -> None: @pytest.mark.parametrize("separator", [".", "/"]) -def test_convert_nested_groups_and_arrays( +def test_convert_nested_groups_and_arrays_in_place( local_store: Store, separator: str, expected_v3_metadata: list[Path] ) -> None: """Test that zarr.json are made at the correct points in a hierarchy of groups and arrays (including when there are additional dirs due to using a / separator)""" attributes = {"baz": 42, "qux": [1, 4, 7, 12]} - paths = create_nested_zarr(local_store, attributes, separator) + paths = create_nested_zarr(local_store, attributes=attributes, separator=separator) result = runner.invoke(cli.app, ["migrate", "v3", str(local_store.root)]) assert result.exit_code == 0 @@ -220,13 +227,49 @@ def test_convert_nested_groups_and_arrays( assert metadata.attributes == attributes +@pytest.mark.parametrize("separator", [".", "/"]) +async def test_convert_nested_groups_and_arrays_separate_location( + tmp_path: Path, + separator: str, + expected_v2_metadata: list[Path], + expected_v3_metadata: list[Path], +) -> None: + """Test that zarr.json are made at the correct paths, when saving to a separate output location.""" + + input_zarr_path = tmp_path / "input.zarr" + output_zarr_path = tmp_path / "output.zarr" + + local_store = await LocalStore.open(str(input_zarr_path)) + paths = create_nested_zarr(local_store, separator=separator) + + result = runner.invoke(cli.app, ["migrate", "v3", str(input_zarr_path), str(output_zarr_path)]) + assert result.exit_code == 0 + + # Files in input zarr should be unchanged i.e. 
still v2 only + zarr_json_paths = sorted(input_zarr_path.rglob("zarr.json")) + assert len(zarr_json_paths) == 0 + + paths = [ + path + for path in input_zarr_path.rglob("*") + if path.stem in [".zarray", ".zgroup", ".zattrs"] + ] + expected_paths = [input_zarr_path / p for p in expected_v2_metadata] + assert sorted(paths) == expected_paths + + # Files in output zarr should only contain v3 metadata + zarr_json_paths = sorted(output_zarr_path.rglob("zarr.json")) + expected_zarr_json_paths = [output_zarr_path / p for p in expected_v3_metadata] + assert zarr_json_paths == expected_zarr_json_paths + + @pytest.mark.parametrize("separator", [".", "/"]) def test_convert_sub_group( local_store: Store, separator: str, expected_v3_metadata: list[Path] ) -> None: """Test that only arrays/groups within group_1 are converted (+ no other files in store)""" - create_nested_zarr(local_store, {}, separator) + create_nested_zarr(local_store, separator=separator) group_path = local_store.root / "group_1" result = runner.invoke(cli.app, ["migrate", "v3", str(group_path)]) @@ -481,8 +524,7 @@ def test_convert_incorrect_compressor(local_store: Store) -> None: def test_remove_metadata_fails_without_force(local_store: Store, zarr_format: ZarrFormat) -> None: """Test removing metadata (when no alternate metadata is present) fails without --force.""" - attributes = {"baz": 42, "qux": [1, 4, 7, 12]} - create_nested_zarr(local_store, attributes, ".", zarr_format=zarr_format) + create_nested_zarr(local_store, zarr_format=zarr_format) result = runner.invoke(cli.app, ["remove-metadata", f"v{zarr_format}", str(local_store.root)]) assert result.exit_code == 1 @@ -496,8 +538,7 @@ def test_remove_metadata_succeeds_with_force( ) -> None: """Test removing metadata (when no alternate metadata is present) succeeds with --force.""" - attributes = {"baz": 42, "qux": [1, 4, 7, 12]} - create_nested_zarr(local_store, attributes, ".", zarr_format=zarr_format) + create_nested_zarr(local_store, zarr_format=zarr_format) result = runner.invoke( cli.app, ["remove-metadata", f"v{zarr_format}", str(local_store.root), "--force"] @@ -514,8 +555,7 @@ def test_remove_metadata_sub_group( ) -> None: """Test only v2 metadata within group_1 is removed and rest remains un-changed.""" - attributes = {"baz": 42, "qux": [1, 4, 7, 12]} - create_nested_zarr(local_store, attributes, ".") + create_nested_zarr(local_store) result = runner.invoke( cli.app, ["remove-metadata", "v2", str(local_store.root / "group_1"), "--force"] @@ -544,8 +584,7 @@ def test_remove_metadata_after_conversion( """Test all v2/v3 metadata can be removed after metadata conversion (all groups / arrays / metadata of other versions should remain as-is)""" - attributes = {"baz": 42, "qux": [1, 4, 7, 12]} - create_nested_zarr(local_store, attributes, ".") + create_nested_zarr(local_store) # convert v2 metadata to v3 (so now both v2 and v3 metadata present!), then remove either the v2 or v3 metadata result = runner.invoke(cli.app, ["migrate", "v3", str(local_store.root)]) @@ -564,8 +603,7 @@ def test_dry_run( ) -> None: """Test that all files are un-changed after a dry run""" - attributes = {"baz": 42, "qux": [1, 4, 7, 12]} - create_nested_zarr(local_store, attributes, ".") + create_nested_zarr(local_store) if cli_command == "migrate": result = runner.invoke(cli.app, ["migrate", "v3", str(local_store.root), "--dry-run"]) From 0281cc18defc0eebbd479d84f983bb0db4559eb4 Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Fri, 25 Jul 2025 09:42:14 
+0100 Subject: [PATCH 39/57] add test for remove-v2-metadata option --- tests/test_metadata/test_migrate_to_v3.py | 34 +++++++++++++++++------ 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/tests/test_metadata/test_migrate_to_v3.py b/tests/test_metadata/test_migrate_to_v3.py index 397a685876..26b6a1c0bf 100644 --- a/tests/test_metadata/test_migrate_to_v3.py +++ b/tests/test_metadata/test_migrate_to_v3.py @@ -93,12 +93,12 @@ def expected_v3_metadata() -> list[Path]: """Expected v3 metadata for create_nested_zarr""" return sorted( [ - Path("array_0/zarr.json"), - Path("group_1/array_1/zarr.json"), - Path("group_1/group_2/array_2/zarr.json"), Path("zarr.json"), + Path("array_0/zarr.json"), Path("group_1/zarr.json"), + Path("group_1/array_1/zarr.json"), Path("group_1/group_2/zarr.json"), + Path("group_1/group_2/array_2/zarr.json"), ] ) @@ -108,18 +108,18 @@ def expected_v2_metadata() -> list[Path]: """Expected v2 metadata for create_nested_zarr""" return sorted( [ - Path("array_0/.zarray"), - Path("array_0/.zattrs"), - Path("group_1/array_1/.zarray"), - Path("group_1/array_1/.zattrs"), - Path("group_1/group_2/array_2/.zarray"), - Path("group_1/group_2/array_2/.zattrs"), Path(".zgroup"), Path(".zattrs"), + Path("array_0/.zarray"), + Path("array_0/.zattrs"), Path("group_1/.zgroup"), Path("group_1/.zattrs"), + Path("group_1/array_1/.zarray"), + Path("group_1/array_1/.zattrs"), Path("group_1/group_2/.zgroup"), Path("group_1/group_2/.zattrs"), + Path("group_1/group_2/array_2/.zarray"), + Path("group_1/group_2/array_2/.zattrs"), ] ) @@ -263,6 +263,22 @@ async def test_convert_nested_groups_and_arrays_separate_location( assert zarr_json_paths == expected_zarr_json_paths +def test_remove_v2_metadata_option( + local_store: Store, expected_paths_v3_metadata: list[Path] +) -> None: + create_nested_zarr(local_store) + + # convert v2 metadata to v3, then remove v2 metadata + result = runner.invoke( + cli.app, ["migrate", "v3", str(local_store.root), "--remove-v2-metadata"] + ) + assert result.exit_code == 0 + + paths = sorted(local_store.root.rglob("*")) + expected_paths = [local_store.root / p for p in expected_paths_v3_metadata] + assert paths == expected_paths + + @pytest.mark.parametrize("separator", [".", "/"]) def test_convert_sub_group( local_store: Store, separator: str, expected_v3_metadata: list[Path] From 2ffe854003d6683ff86f70e9198b72c129043c20 Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Fri, 25 Jul 2025 09:46:14 +0100 Subject: [PATCH 40/57] update test names to match command name --- tests/test_metadata/test_migrate_to_v3.py | 26 +++++++++++------------ 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/tests/test_metadata/test_migrate_to_v3.py b/tests/test_metadata/test_migrate_to_v3.py index 26b6a1c0bf..870b4c44f6 100644 --- a/tests/test_metadata/test_migrate_to_v3.py +++ b/tests/test_metadata/test_migrate_to_v3.py @@ -144,7 +144,7 @@ def expected_paths_v2_metadata( return sorted(expected_paths_no_metadata) -def test_convert_array(local_store: Store) -> None: +def test_migrate_array(local_store: Store) -> None: shape = (10, 10) chunks = (10, 10) dtype = "uint16" @@ -185,7 +185,7 @@ def test_convert_array(local_store: Store) -> None: assert metadata.storage_transformers == () -def test_convert_group(local_store: Store) -> None: +def test_migrate_group(local_store: Store) -> None: attributes = {"baz": 42, "qux": [1, 4, 7, 12]} zarr.create_group(store=local_store, zarr_format=2, attributes=attributes) @@ -202,7 
+202,7 @@ def test_convert_group(local_store: Store) -> None: @pytest.mark.parametrize("separator", [".", "/"]) -def test_convert_nested_groups_and_arrays_in_place( +def test_migrate_nested_groups_and_arrays_in_place( local_store: Store, separator: str, expected_v3_metadata: list[Path] ) -> None: """Test that zarr.json are made at the correct points in a hierarchy of groups and arrays @@ -228,7 +228,7 @@ def test_convert_nested_groups_and_arrays_in_place( @pytest.mark.parametrize("separator", [".", "/"]) -async def test_convert_nested_groups_and_arrays_separate_location( +async def test_migrate_nested_groups_and_arrays_separate_location( tmp_path: Path, separator: str, expected_v2_metadata: list[Path], @@ -280,7 +280,7 @@ def test_remove_v2_metadata_option( @pytest.mark.parametrize("separator", [".", "/"]) -def test_convert_sub_group( +def test_migrate_sub_group( local_store: Store, separator: str, expected_v3_metadata: list[Path] ) -> None: """Test that only arrays/groups within group_1 are converted (+ no other files in store)""" @@ -332,7 +332,7 @@ def test_convert_sub_group( ], ids=["blosc", "zstd", "gzip", "numcodecs-compressor"], ) -def test_convert_compressor( +def test_migrate_compressor( local_store: Store, compressor_v2: numcodecs.abc.Codec, compressor_v3: Codec ) -> None: zarr_array = zarr.create_array( @@ -360,7 +360,7 @@ def test_convert_compressor( assert (zarr_array[:] == 1).all() -def test_convert_filter(local_store: Store) -> None: +def test_migrate_filter(local_store: Store) -> None: filter_v2 = numcodecs.Delta(dtype=" None: ("F", (TransposeCodec(order=(1, 0)), BytesCodec(endian="little"))), ], ) -def test_convert_C_vs_F_order( +def test_migrate_C_vs_F_order( local_store: Store, order: str, expected_codecs: tuple[Codec] ) -> None: zarr_array = zarr.create_array( @@ -429,7 +429,7 @@ def test_convert_C_vs_F_order( ], ids=["single_byte", "multi_byte"], ) -def test_convert_endian( +def test_migrate_endian( local_store: Store, dtype: str, expected_data_type: BaseInt, expected_codecs: tuple[Codec] ) -> None: zarr_array = zarr.create_array( @@ -456,7 +456,7 @@ def test_convert_endian( @pytest.mark.parametrize("node_type", ["array", "group"]) -def test_convert_v3(local_store: Store, node_type: str) -> None: +def test_migrate_v3(local_store: Store, node_type: str) -> None: """Attempting to convert a v3 array/group should always fail""" if node_type == "array": @@ -472,7 +472,7 @@ def test_convert_v3(local_store: Store, node_type: str) -> None: assert str(result.exception) == "Only arrays / groups with zarr v2 metadata can be converted" -def test_convert_unknown_codec(local_store: Store) -> None: +def test_migrate_unknown_codec(local_store: Store) -> None: """Attempting to convert a codec without a v3 equivalent should always fail""" zarr.create_array( @@ -493,7 +493,7 @@ def test_convert_unknown_codec(local_store: Store) -> None: ) -def test_convert_incorrect_filter(local_store: Store) -> None: +def test_migrate_incorrect_filter(local_store: Store) -> None: """Attempting to convert a filter (which is the wrong type of codec) should always fail""" zarr.create_array( @@ -514,7 +514,7 @@ def test_convert_incorrect_filter(local_store: Store) -> None: ) -def test_convert_incorrect_compressor(local_store: Store) -> None: +def test_migrate_incorrect_compressor(local_store: Store) -> None: """Attempting to convert a compressor (which is the wrong type of codec) should always fail""" zarr.create_array( From 432eae6a47c87ea81d2307d8b495a0d47d07d9ca Mon Sep 17 00:00:00 2001 From: Kimberly 
Meechan <24316371+K-Meech@users.noreply.github.com> Date: Fri, 25 Jul 2025 10:24:55 +0100 Subject: [PATCH 41/57] add test for --remove-v2-metadata with separate output location --- tests/test_metadata/test_migrate_to_v3.py | 28 +++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/tests/test_metadata/test_migrate_to_v3.py b/tests/test_metadata/test_migrate_to_v3.py index 870b4c44f6..1555d4e6d6 100644 --- a/tests/test_metadata/test_migrate_to_v3.py +++ b/tests/test_metadata/test_migrate_to_v3.py @@ -240,7 +240,7 @@ async def test_migrate_nested_groups_and_arrays_separate_location( output_zarr_path = tmp_path / "output.zarr" local_store = await LocalStore.open(str(input_zarr_path)) - paths = create_nested_zarr(local_store, separator=separator) + create_nested_zarr(local_store, separator=separator) result = runner.invoke(cli.app, ["migrate", "v3", str(input_zarr_path), str(output_zarr_path)]) assert result.exit_code == 0 @@ -263,7 +263,7 @@ async def test_migrate_nested_groups_and_arrays_separate_location( assert zarr_json_paths == expected_zarr_json_paths -def test_remove_v2_metadata_option( +def test_remove_v2_metadata_option_in_place( local_store: Store, expected_paths_v3_metadata: list[Path] ) -> None: create_nested_zarr(local_store) @@ -279,6 +279,30 @@ def test_remove_v2_metadata_option( assert paths == expected_paths +async def test_remove_v2_metadata_option_separate_location( + tmp_path: Path, expected_paths_v2_metadata: list[Path] +) -> None: + """Check that when using --remove-v2-metadata with a separate output location, no v2 metadata is removed from + the input location.""" + + input_zarr_path = tmp_path / "input.zarr" + output_zarr_path = tmp_path / "output.zarr" + + local_store = await LocalStore.open(str(input_zarr_path)) + create_nested_zarr(local_store) + + result = runner.invoke( + cli.app, + ["migrate", "v3", str(input_zarr_path), str(output_zarr_path), "--remove-v2-metadata"], + ) + assert result.exit_code == 0 + + # input image should be unchanged + paths = sorted(input_zarr_path.rglob("*")) + expected_paths = [input_zarr_path / p for p in expected_paths_v2_metadata] + assert paths == expected_paths + + @pytest.mark.parametrize("separator", [".", "/"]) def test_migrate_sub_group( local_store: Store, separator: str, expected_v3_metadata: list[Path] From 6e6788d3f95a61590dbc6c14706b122ba41b5d01 Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Fri, 25 Jul 2025 14:35:10 +0100 Subject: [PATCH 42/57] separate cli fixtures from the tests --- tests/test_cli/conftest.py | 165 ++++++++++++++++++ .../test_migrate_to_v3.py | 143 +++------------ 2 files changed, 191 insertions(+), 117 deletions(-) create mode 100644 tests/test_cli/conftest.py rename tests/{test_metadata => test_cli}/test_migrate_to_v3.py (83%) diff --git a/tests/test_cli/conftest.py b/tests/test_cli/conftest.py new file mode 100644 index 0000000000..e595a87391 --- /dev/null +++ b/tests/test_cli/conftest.py @@ -0,0 +1,165 @@ +from pathlib import Path +from typing import Any + +import pytest + +import zarr +from zarr.abc.store import Store +from zarr.core.common import ZarrFormat + + +def create_nested_zarr( + store: Store, + attributes: dict[str, Any] | None = None, + separator: str = ".", + zarr_format: ZarrFormat = 2, +) -> list[str]: + """Create a zarr with nested groups / arrays for testing, returning the paths to all.""" + + if attributes is None: + attributes = {"baz": 42, "qux": [1, 4, 7, 12]} + + # 3 levels of nested groups + 
group_0 = zarr.create_group(store=store, zarr_format=zarr_format, attributes=attributes) + group_1 = group_0.create_group(name="group_1", attributes=attributes) + group_2 = group_1.create_group(name="group_2", attributes=attributes) + paths = [group_0.path, group_1.path, group_2.path] + + # 1 array per group + for i, group in enumerate([group_0, group_1, group_2]): + array = group.create_array( + name=f"array_{i}", + shape=(10, 10), + chunks=(5, 5), + dtype="uint16", + attributes=attributes, + chunk_key_encoding={"name": "v2", "separator": separator}, + ) + array[:] = 1 + paths.append(array.path) + + return paths + + +@pytest.fixture +def expected_paths() -> list[Path]: + """Expected paths for create_nested_zarr, with no metadata files or chunks""" + return [ + Path("array_0"), + Path("group_1"), + Path("group_1/array_1"), + Path("group_1/group_2"), + Path("group_1/group_2/array_2"), + ] + + +@pytest.fixture +def expected_chunks() -> list[Path]: + """Expected chunks for create_nested_zarr""" + return [ + Path("array_0/0.0"), + Path("array_0/0.1"), + Path("array_0/1.0"), + Path("array_0/1.1"), + Path("group_1/array_1/0.0"), + Path("group_1/array_1/0.1"), + Path("group_1/array_1/1.0"), + Path("group_1/array_1/1.1"), + Path("group_1/group_2/array_2/0.0"), + Path("group_1/group_2/array_2/0.1"), + Path("group_1/group_2/array_2/1.0"), + Path("group_1/group_2/array_2/1.1"), + ] + + +@pytest.fixture +def expected_v3_metadata() -> list[Path]: + """Expected v3 metadata for create_nested_zarr""" + return sorted( + [ + Path("zarr.json"), + Path("array_0/zarr.json"), + Path("group_1/zarr.json"), + Path("group_1/array_1/zarr.json"), + Path("group_1/group_2/zarr.json"), + Path("group_1/group_2/array_2/zarr.json"), + ] + ) + + +@pytest.fixture +def expected_v2_metadata() -> list[Path]: + """Expected v2 metadata for create_nested_zarr""" + return sorted( + [ + Path(".zgroup"), + Path(".zattrs"), + Path("array_0/.zarray"), + Path("array_0/.zattrs"), + Path("group_1/.zgroup"), + Path("group_1/.zattrs"), + Path("group_1/array_1/.zarray"), + Path("group_1/array_1/.zattrs"), + Path("group_1/group_2/.zgroup"), + Path("group_1/group_2/.zattrs"), + Path("group_1/group_2/array_2/.zarray"), + Path("group_1/group_2/array_2/.zattrs"), + ] + ) + + +@pytest.fixture +def expected_paths_no_metadata( + expected_paths: list[Path], expected_chunks: list[Path] +) -> list[Path]: + """Expected paths from create_nested_zarr + chunks""" + expected_paths.extend(expected_chunks) + + return sorted(expected_paths) + + +@pytest.fixture +def expected_paths_v3_metadata( + expected_paths: list[Path], expected_chunks: list[Path], expected_v3_metadata: list[Path] +) -> list[Path]: + """Expected paths from create_nested_zarr + chunks + v3 metadata files""" + expected_paths.extend(expected_chunks) + expected_paths.extend(expected_v3_metadata) + + return sorted(expected_paths) + + +@pytest.fixture +def expected_paths_v3_metadata_no_chunks( + expected_paths: list[Path], expected_v3_metadata: list[Path] +) -> list[Path]: + """Expected paths from create_nested_zarr + v3 metadata files (with no chunks)""" + expected_paths.extend(expected_v3_metadata) + + return sorted(expected_paths) + + +@pytest.fixture +def expected_paths_v2_metadata( + expected_paths: list[Path], expected_chunks: list[Path], expected_v2_metadata: list[Path] +) -> list[Path]: + """Expected paths from create_nested_zarr + chunks + v2 metadata files""" + expected_paths.extend(expected_chunks) + expected_paths.extend(expected_v2_metadata) + + return sorted(expected_paths) + + 
+@pytest.fixture +def expected_paths_v2_v3_metadata( + expected_paths: list[Path], + expected_chunks: list[Path], + expected_v2_metadata: list[Path], + expected_v3_metadata: list[Path], +) -> list[Path]: + """Expected paths from create_nested_zarr + chunks + v2 metadata files + v3 metadata files""" + expected_paths.extend(expected_chunks) + expected_paths.extend(expected_v2_metadata) + expected_paths.extend(expected_v3_metadata) + + return sorted(expected_paths) diff --git a/tests/test_metadata/test_migrate_to_v3.py b/tests/test_cli/test_migrate_to_v3.py similarity index 83% rename from tests/test_metadata/test_migrate_to_v3.py rename to tests/test_cli/test_migrate_to_v3.py index 1555d4e6d6..ee9575bede 100644 --- a/tests/test_metadata/test_migrate_to_v3.py +++ b/tests/test_cli/test_migrate_to_v3.py @@ -1,6 +1,5 @@ import lzma from pathlib import Path -from typing import Any import numcodecs import numcodecs.abc @@ -8,6 +7,7 @@ from numcodecs.zarr3 import LZMA, Delta import zarr +from tests.test_cli.conftest import create_nested_zarr from zarr.abc.codec import Codec from zarr.abc.store import Store from zarr.codecs.blosc import BloscCodec @@ -31,119 +31,6 @@ runner = typer_testing.CliRunner() -def create_nested_zarr( - store: Store, - attributes: dict[str, Any] | None = None, - separator: str = ".", - zarr_format: ZarrFormat = 2, -) -> list[str]: - """Create a zarr with nested groups / arrays for testing, returning the paths to all.""" - - if attributes is None: - attributes = {"baz": 42, "qux": [1, 4, 7, 12]} - - # 3 levels of nested groups - group_0 = zarr.create_group(store=store, zarr_format=zarr_format, attributes=attributes) - group_1 = group_0.create_group(name="group_1", attributes=attributes) - group_2 = group_1.create_group(name="group_2", attributes=attributes) - paths = [group_0.path, group_1.path, group_2.path] - - # 1 array per group - for i, group in enumerate([group_0, group_1, group_2]): - array = group.create_array( - name=f"array_{i}", - shape=(10, 10), - chunks=(5, 5), - dtype="uint16", - attributes=attributes, - chunk_key_encoding={"name": "v2", "separator": separator}, - ) - array[:] = 1 - paths.append(array.path) - - return paths - - -@pytest.fixture -def expected_paths_no_metadata() -> list[Path]: - """Expected paths from create_nested_zarr, with no metadata files""" - return [ - Path("array_0"), - Path("array_0/0.0"), - Path("array_0/0.1"), - Path("array_0/1.0"), - Path("array_0/1.1"), - Path("group_1"), - Path("group_1/array_1"), - Path("group_1/array_1/0.0"), - Path("group_1/array_1/0.1"), - Path("group_1/array_1/1.0"), - Path("group_1/array_1/1.1"), - Path("group_1/group_2"), - Path("group_1/group_2/array_2"), - Path("group_1/group_2/array_2/0.0"), - Path("group_1/group_2/array_2/0.1"), - Path("group_1/group_2/array_2/1.0"), - Path("group_1/group_2/array_2/1.1"), - ] - - -@pytest.fixture -def expected_v3_metadata() -> list[Path]: - """Expected v3 metadata for create_nested_zarr""" - return sorted( - [ - Path("zarr.json"), - Path("array_0/zarr.json"), - Path("group_1/zarr.json"), - Path("group_1/array_1/zarr.json"), - Path("group_1/group_2/zarr.json"), - Path("group_1/group_2/array_2/zarr.json"), - ] - ) - - -@pytest.fixture -def expected_v2_metadata() -> list[Path]: - """Expected v2 metadata for create_nested_zarr""" - return sorted( - [ - Path(".zgroup"), - Path(".zattrs"), - Path("array_0/.zarray"), - Path("array_0/.zattrs"), - Path("group_1/.zgroup"), - Path("group_1/.zattrs"), - Path("group_1/array_1/.zarray"), - Path("group_1/array_1/.zattrs"), - 
Path("group_1/group_2/.zgroup"), - Path("group_1/group_2/.zattrs"), - Path("group_1/group_2/array_2/.zarray"), - Path("group_1/group_2/array_2/.zattrs"), - ] - ) - - -@pytest.fixture -def expected_paths_v3_metadata( - expected_paths_no_metadata: list[Path], expected_v3_metadata: list[Path] -) -> list[Path]: - """Expected paths from create_nested_zarr + v3 metadata files""" - expected_paths_no_metadata.extend(expected_v3_metadata) - - return sorted(expected_paths_no_metadata) - - -@pytest.fixture -def expected_paths_v2_metadata( - expected_paths_no_metadata: list[Path], expected_v2_metadata: list[Path] -) -> list[Path]: - """Expected paths from create_nested_zarr + v2 metadata files""" - expected_paths_no_metadata.extend(expected_v2_metadata) - - return sorted(expected_paths_no_metadata) - - def test_migrate_array(local_store: Store) -> None: shape = (10, 10) chunks = (10, 10) @@ -303,6 +190,24 @@ async def test_remove_v2_metadata_option_separate_location( assert paths == expected_paths +def test_overwrite_option_in_place( + local_store: Store, expected_paths_v2_v3_metadata: list[Path] +) -> None: + create_nested_zarr(local_store) + + # add v3 metadata in place + result = runner.invoke(cli.app, ["migrate", "v3", str(local_store.root)]) + assert result.exit_code == 0 + + # check that v3 metadata can be overwritten with --overwrite + result = runner.invoke(cli.app, ["migrate", "v3", str(local_store.root), "--overwrite"]) + assert result.exit_code == 0 + + paths = sorted(local_store.root.rglob("*")) + expected_paths = [local_store.root / p for p in expected_paths_v2_v3_metadata] + assert paths == expected_paths + + @pytest.mark.parametrize("separator", [".", "/"]) def test_migrate_sub_group( local_store: Store, separator: str, expected_v3_metadata: list[Path] @@ -615,11 +520,14 @@ def test_remove_metadata_sub_group( @pytest.mark.parametrize( - ("zarr_format", "expected_paths"), + ("zarr_format", "expected_output_paths"), [("v2", "expected_paths_v3_metadata"), ("v3", "expected_paths_v2_metadata")], ) def test_remove_metadata_after_conversion( - local_store: Store, request: pytest.FixtureRequest, zarr_format: str, expected_paths: list[Path] + local_store: Store, + request: pytest.FixtureRequest, + zarr_format: str, + expected_output_paths: str, ) -> None: """Test all v2/v3 metadata can be removed after metadata conversion (all groups / arrays / metadata of other versions should remain as-is)""" @@ -633,7 +541,8 @@ def test_remove_metadata_after_conversion( assert result.exit_code == 0 paths = sorted(local_store.root.rglob("*")) - expected_paths = [local_store.root / p for p in request.getfixturevalue(expected_paths)] + expected_paths = request.getfixturevalue(expected_output_paths) + expected_paths = [local_store.root / p for p in expected_paths] assert paths == expected_paths From 4abc84a3a0c5c9d85a9d0907e657b6ddaabcaa6b Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Fri, 25 Jul 2025 14:43:23 +0100 Subject: [PATCH 43/57] add test for overwrite option in separate location --- tests/test_cli/test_migrate_to_v3.py | 36 ++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/tests/test_cli/test_migrate_to_v3.py b/tests/test_cli/test_migrate_to_v3.py index ee9575bede..225e527727 100644 --- a/tests/test_cli/test_migrate_to_v3.py +++ b/tests/test_cli/test_migrate_to_v3.py @@ -208,6 +208,42 @@ def test_overwrite_option_in_place( assert paths == expected_paths +async def test_overwrite_option_separate_location( + tmp_path: Path, + 
expected_paths_v2_metadata: list[Path], + expected_paths_v3_metadata_no_chunks: list[Path], +) -> None: + input_zarr_path = tmp_path / "input.zarr" + output_zarr_path = tmp_path / "output.zarr" + + local_store = await LocalStore.open(str(input_zarr_path)) + create_nested_zarr(local_store) + + # create v3 metadata at output_zarr_path + result = runner.invoke( + cli.app, + ["migrate", "v3", str(input_zarr_path), str(output_zarr_path)], + ) + assert result.exit_code == 0 + + # re-run with --overwrite option + result = runner.invoke( + cli.app, + ["migrate", "v3", str(input_zarr_path), str(output_zarr_path), "--overwrite", "--force"], + ) + assert result.exit_code == 0 + + # original image should be un-changed + paths = sorted(input_zarr_path.rglob("*")) + expected_paths = [input_zarr_path / p for p in expected_paths_v2_metadata] + assert paths == expected_paths + + # output image is only v3 metadata + paths = sorted(output_zarr_path.rglob("*")) + expected_paths = [output_zarr_path / p for p in expected_paths_v3_metadata_no_chunks] + assert paths == expected_paths + + @pytest.mark.parametrize("separator", [".", "/"]) def test_migrate_sub_group( local_store: Store, separator: str, expected_v3_metadata: list[Path] From 0bdd6f8bfd4cef3faaf0fe17236cecf5556189b0 Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Fri, 25 Jul 2025 14:56:02 +0100 Subject: [PATCH 44/57] fix failing test --- tests/test_cli/conftest.py | 29 +++++------------------------ 1 file changed, 5 insertions(+), 24 deletions(-) diff --git a/tests/test_cli/conftest.py b/tests/test_cli/conftest.py index e595a87391..c04ccfcede 100644 --- a/tests/test_cli/conftest.py +++ b/tests/test_cli/conftest.py @@ -112,42 +112,28 @@ def expected_v2_metadata() -> list[Path]: def expected_paths_no_metadata( expected_paths: list[Path], expected_chunks: list[Path] ) -> list[Path]: - """Expected paths from create_nested_zarr + chunks""" - expected_paths.extend(expected_chunks) - - return sorted(expected_paths) + return sorted(expected_paths + expected_chunks) @pytest.fixture def expected_paths_v3_metadata( expected_paths: list[Path], expected_chunks: list[Path], expected_v3_metadata: list[Path] ) -> list[Path]: - """Expected paths from create_nested_zarr + chunks + v3 metadata files""" - expected_paths.extend(expected_chunks) - expected_paths.extend(expected_v3_metadata) - - return sorted(expected_paths) + return sorted(expected_paths + expected_chunks + expected_v3_metadata) @pytest.fixture def expected_paths_v3_metadata_no_chunks( expected_paths: list[Path], expected_v3_metadata: list[Path] ) -> list[Path]: - """Expected paths from create_nested_zarr + v3 metadata files (with no chunks)""" - expected_paths.extend(expected_v3_metadata) - - return sorted(expected_paths) + return sorted(expected_paths + expected_v3_metadata) @pytest.fixture def expected_paths_v2_metadata( expected_paths: list[Path], expected_chunks: list[Path], expected_v2_metadata: list[Path] ) -> list[Path]: - """Expected paths from create_nested_zarr + chunks + v2 metadata files""" - expected_paths.extend(expected_chunks) - expected_paths.extend(expected_v2_metadata) - - return sorted(expected_paths) + return sorted(expected_paths + expected_chunks + expected_v2_metadata) @pytest.fixture @@ -157,9 +143,4 @@ def expected_paths_v2_v3_metadata( expected_v2_metadata: list[Path], expected_v3_metadata: list[Path], ) -> list[Path]: - """Expected paths from create_nested_zarr + chunks + v2 metadata files + v3 metadata files""" - 
expected_paths.extend(expected_chunks) - expected_paths.extend(expected_v2_metadata) - expected_paths.extend(expected_v3_metadata) - - return sorted(expected_paths) + return sorted(expected_paths + expected_chunks + expected_v2_metadata + expected_v3_metadata) From f2fa389810b42d5556a76a9933725a252934edb4 Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Fri, 25 Jul 2025 15:09:00 +0100 Subject: [PATCH 45/57] small fixes to tests --- tests/test_cli/test_migrate_to_v3.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/tests/test_cli/test_migrate_to_v3.py b/tests/test_cli/test_migrate_to_v3.py index 225e527727..e7153af6aa 100644 --- a/tests/test_cli/test_migrate_to_v3.py +++ b/tests/test_cli/test_migrate_to_v3.py @@ -167,7 +167,9 @@ def test_remove_v2_metadata_option_in_place( async def test_remove_v2_metadata_option_separate_location( - tmp_path: Path, expected_paths_v2_metadata: list[Path] + tmp_path: Path, + expected_paths_v2_metadata: list[Path], + expected_paths_v3_metadata_no_chunks: list[Path], ) -> None: """Check that when using --remove-v2-metadata with a separate output location, no v2 metadata is removed from the input location.""" @@ -189,6 +191,11 @@ async def test_remove_v2_metadata_option_separate_location( expected_paths = [input_zarr_path / p for p in expected_paths_v2_metadata] assert paths == expected_paths + # output image should be only v3 metadata + paths = sorted(output_zarr_path.rglob("*")) + expected_paths = [output_zarr_path / p for p in expected_paths_v3_metadata_no_chunks] + assert paths == expected_paths + def test_overwrite_option_in_place( local_store: Store, expected_paths_v2_v3_metadata: list[Path] @@ -591,7 +598,9 @@ def test_dry_run( create_nested_zarr(local_store) if cli_command == "migrate": - result = runner.invoke(cli.app, ["migrate", "v3", str(local_store.root), "--dry-run"]) + result = runner.invoke( + cli.app, ["migrate", "v3", str(local_store.root), "--overwrite", "--force", "--dry-run"] + ) else: result = runner.invoke( cli.app, ["remove-metadata", "v2", str(local_store.root), "--force", "--dry-run"] From 649bb20df13539949f25f381a0c35b81cc64d1ba Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Mon, 28 Jul 2025 10:41:58 +0100 Subject: [PATCH 46/57] fix pre-commit errors --- tests/test_cli/conftest.py | 4 +- tests/test_cli/test_migrate_to_v3.py | 70 ++++++++++++++++------------ 2 files changed, 41 insertions(+), 33 deletions(-) diff --git a/tests/test_cli/conftest.py b/tests/test_cli/conftest.py index c04ccfcede..4f95f47b5e 100644 --- a/tests/test_cli/conftest.py +++ b/tests/test_cli/conftest.py @@ -1,5 +1,5 @@ from pathlib import Path -from typing import Any +from typing import Any, Literal import pytest @@ -11,7 +11,7 @@ def create_nested_zarr( store: Store, attributes: dict[str, Any] | None = None, - separator: str = ".", + separator: Literal[".", "/"] = ".", zarr_format: ZarrFormat = 2, ) -> list[str]: """Create a zarr with nested groups / arrays for testing, returning the paths to all.""" diff --git a/tests/test_cli/test_migrate_to_v3.py b/tests/test_cli/test_migrate_to_v3.py index e7153af6aa..6ce125c755 100644 --- a/tests/test_cli/test_migrate_to_v3.py +++ b/tests/test_cli/test_migrate_to_v3.py @@ -1,24 +1,27 @@ import lzma from pathlib import Path +from typing import Literal, cast import numcodecs import numcodecs.abc +import numpy as np import pytest from numcodecs.zarr3 import LZMA, Delta import zarr from 
tests.test_cli.conftest import create_nested_zarr from zarr.abc.codec import Codec -from zarr.abc.store import Store from zarr.codecs.blosc import BloscCodec from zarr.codecs.bytes import BytesCodec from zarr.codecs.gzip import GzipCodec from zarr.codecs.transpose import TransposeCodec from zarr.codecs.zstd import ZstdCodec +from zarr.core.array import Array from zarr.core.chunk_grids import RegularChunkGrid from zarr.core.chunk_key_encodings import V2ChunkKeyEncoding -from zarr.core.common import ZarrFormat -from zarr.core.dtype.npy.int import BaseInt, UInt8, UInt16 +from zarr.core.common import JSON, ZarrFormat +from zarr.core.dtype.npy.int import UInt8, UInt16 +from zarr.core.group import Group from zarr.storage._local import LocalStore typer_testing = pytest.importorskip( @@ -31,13 +34,13 @@ runner = typer_testing.CliRunner() -def test_migrate_array(local_store: Store) -> None: +def test_migrate_array(local_store: LocalStore) -> None: shape = (10, 10) chunks = (10, 10) dtype = "uint16" compressors = numcodecs.Blosc(cname="zstd", clevel=3, shuffle=1) fill_value = 2 - attributes = {"baz": 42, "qux": [1, 4, 7, 12]} + attributes = cast(dict[str, JSON], {"baz": 42, "qux": [1, 4, 7, 12]}) zarr.create_array( store=local_store, @@ -72,7 +75,7 @@ def test_migrate_array(local_store: Store) -> None: assert metadata.storage_transformers == () -def test_migrate_group(local_store: Store) -> None: +def test_migrate_group(local_store: LocalStore) -> None: attributes = {"baz": 42, "qux": [1, 4, 7, 12]} zarr.create_group(store=local_store, zarr_format=2, attributes=attributes) @@ -90,7 +93,7 @@ def test_migrate_group(local_store: Store) -> None: @pytest.mark.parametrize("separator", [".", "/"]) def test_migrate_nested_groups_and_arrays_in_place( - local_store: Store, separator: str, expected_v3_metadata: list[Path] + local_store: LocalStore, separator: str, expected_v3_metadata: list[Path] ) -> None: """Test that zarr.json are made at the correct points in a hierarchy of groups and arrays (including when there are additional dirs due to using a / separator)""" @@ -108,7 +111,7 @@ def test_migrate_nested_groups_and_arrays_in_place( # Check converted zarr can be opened + metadata accessed at all levels zarr_array = zarr.open(local_store.root, zarr_format=3) for path in paths: - zarr_v3 = zarr_array[path] + zarr_v3 = cast(Array | Group, zarr_array[path]) metadata = zarr_v3.metadata assert metadata.zarr_format == 3 assert metadata.attributes == attributes @@ -151,7 +154,7 @@ async def test_migrate_nested_groups_and_arrays_separate_location( def test_remove_v2_metadata_option_in_place( - local_store: Store, expected_paths_v3_metadata: list[Path] + local_store: LocalStore, expected_paths_v3_metadata: list[Path] ) -> None: create_nested_zarr(local_store) @@ -198,7 +201,7 @@ async def test_remove_v2_metadata_option_separate_location( def test_overwrite_option_in_place( - local_store: Store, expected_paths_v2_v3_metadata: list[Path] + local_store: LocalStore, expected_paths_v2_v3_metadata: list[Path] ) -> None: create_nested_zarr(local_store) @@ -253,7 +256,7 @@ async def test_overwrite_option_separate_location( @pytest.mark.parametrize("separator", [".", "/"]) def test_migrate_sub_group( - local_store: Store, separator: str, expected_v3_metadata: list[Path] + local_store: LocalStore, separator: str, expected_v3_metadata: list[Path] ) -> None: """Test that only arrays/groups within group_1 are converted (+ no other files in store)""" @@ -305,7 +308,7 @@ def test_migrate_sub_group( ids=["blosc", "zstd", "gzip", 
"numcodecs-compressor"], ) def test_migrate_compressor( - local_store: Store, compressor_v2: numcodecs.abc.Codec, compressor_v3: Codec + local_store: LocalStore, compressor_v2: numcodecs.abc.Codec, compressor_v3: Codec ) -> None: zarr_array = zarr.create_array( store=local_store, @@ -322,17 +325,17 @@ def test_migrate_compressor( assert result.exit_code == 0 assert (local_store.root / "zarr.json").exists() - zarr_array = zarr.open(local_store.root, zarr_format=3) + zarr_array = zarr.open_array(local_store.root, zarr_format=3) metadata = zarr_array.metadata assert metadata.zarr_format == 3 assert metadata.codecs == ( BytesCodec(endian="little"), compressor_v3, ) - assert (zarr_array[:] == 1).all() + assert np.all(zarr_array[:] == 1) -def test_migrate_filter(local_store: Store) -> None: +def test_migrate_filter(local_store: LocalStore) -> None: filter_v2 = numcodecs.Delta(dtype=" None: assert result.exit_code == 0 assert (local_store.root / "zarr.json").exists() - zarr_array = zarr.open(local_store.root, zarr_format=3) + zarr_array = zarr.open_array(local_store.root, zarr_format=3) metadata = zarr_array.metadata assert metadata.zarr_format == 3 assert metadata.codecs == ( @@ -368,7 +371,7 @@ def test_migrate_filter(local_store: Store) -> None: ], ) def test_migrate_C_vs_F_order( - local_store: Store, order: str, expected_codecs: tuple[Codec] + local_store: LocalStore, order: Literal["C", "F"], expected_codecs: tuple[Codec] ) -> None: zarr_array = zarr.create_array( store=local_store, @@ -386,11 +389,11 @@ def test_migrate_C_vs_F_order( assert result.exit_code == 0 assert (local_store.root / "zarr.json").exists() - zarr_array = zarr.open(local_store.root, zarr_format=3) + zarr_array = zarr.open_array(local_store.root, zarr_format=3) metadata = zarr_array.metadata assert metadata.zarr_format == 3 assert metadata.codecs == expected_codecs - assert (zarr_array[:] == 1).all() + assert np.all(zarr_array[:] == 1) @pytest.mark.parametrize( @@ -402,7 +405,10 @@ def test_migrate_C_vs_F_order( ids=["single_byte", "multi_byte"], ) def test_migrate_endian( - local_store: Store, dtype: str, expected_data_type: BaseInt, expected_codecs: tuple[Codec] + local_store: LocalStore, + dtype: str, + expected_data_type: UInt8 | UInt16, + expected_codecs: tuple[Codec], ) -> None: zarr_array = zarr.create_array( store=local_store, @@ -419,16 +425,16 @@ def test_migrate_endian( assert result.exit_code == 0 assert (local_store.root / "zarr.json").exists() - zarr_array = zarr.open(local_store.root, zarr_format=3) + zarr_array = zarr.open_array(local_store.root, zarr_format=3) metadata = zarr_array.metadata assert metadata.zarr_format == 3 assert metadata.data_type == expected_data_type assert metadata.codecs == expected_codecs - assert (zarr_array[:] == 1).all() + assert np.all(zarr_array[:] == 1) @pytest.mark.parametrize("node_type", ["array", "group"]) -def test_migrate_v3(local_store: Store, node_type: str) -> None: +def test_migrate_v3(local_store: LocalStore, node_type: str) -> None: """Attempting to convert a v3 array/group should always fail""" if node_type == "array": @@ -444,7 +450,7 @@ def test_migrate_v3(local_store: Store, node_type: str) -> None: assert str(result.exception) == "Only arrays / groups with zarr v2 metadata can be converted" -def test_migrate_unknown_codec(local_store: Store) -> None: +def test_migrate_unknown_codec(local_store: LocalStore) -> None: """Attempting to convert a codec without a v3 equivalent should always fail""" zarr.create_array( @@ -465,7 +471,7 @@ def 
test_migrate_unknown_codec(local_store: Store) -> None: ) -def test_migrate_incorrect_filter(local_store: Store) -> None: +def test_migrate_incorrect_filter(local_store: LocalStore) -> None: """Attempting to convert a filter (which is the wrong type of codec) should always fail""" zarr.create_array( @@ -486,7 +492,7 @@ def test_migrate_incorrect_filter(local_store: Store) -> None: ) -def test_migrate_incorrect_compressor(local_store: Store) -> None: +def test_migrate_incorrect_compressor(local_store: LocalStore) -> None: """Attempting to convert a compressor (which is the wrong type of codec) should always fail""" zarr.create_array( @@ -509,7 +515,9 @@ def test_migrate_incorrect_compressor(local_store: Store) -> None: @pytest.mark.parametrize("zarr_format", [2, 3]) -def test_remove_metadata_fails_without_force(local_store: Store, zarr_format: ZarrFormat) -> None: +def test_remove_metadata_fails_without_force( + local_store: LocalStore, zarr_format: ZarrFormat +) -> None: """Test removing metadata (when no alternate metadata is present) fails without --force.""" create_nested_zarr(local_store, zarr_format=zarr_format) @@ -522,7 +530,7 @@ def test_remove_metadata_fails_without_force(local_store: Store, zarr_format: Za @pytest.mark.parametrize("zarr_format", [2, 3]) def test_remove_metadata_succeeds_with_force( - local_store: Store, zarr_format: ZarrFormat, expected_paths_no_metadata: list[Path] + local_store: LocalStore, zarr_format: ZarrFormat, expected_paths_no_metadata: list[Path] ) -> None: """Test removing metadata (when no alternate metadata is present) succeeds with --force.""" @@ -539,7 +547,7 @@ def test_remove_metadata_succeeds_with_force( def test_remove_metadata_sub_group( - local_store: Store, expected_paths_no_metadata: list[Path] + local_store: LocalStore, expected_paths_no_metadata: list[Path] ) -> None: """Test only v2 metadata within group_1 is removed and rest remains un-changed.""" @@ -567,7 +575,7 @@ def test_remove_metadata_sub_group( [("v2", "expected_paths_v3_metadata"), ("v3", "expected_paths_v2_metadata")], ) def test_remove_metadata_after_conversion( - local_store: Store, + local_store: LocalStore, request: pytest.FixtureRequest, zarr_format: str, expected_output_paths: str, @@ -591,7 +599,7 @@ def test_remove_metadata_after_conversion( @pytest.mark.parametrize("cli_command", ["migrate", "remove-metadata"]) def test_dry_run( - local_store: Store, cli_command: str, expected_paths_v2_metadata: list[Path] + local_store: LocalStore, cli_command: str, expected_paths_v2_metadata: list[Path] ) -> None: """Test that all files are un-changed after a dry run""" From dba40739bb9069c01a7ea7e1e62fc28a3f6f3a6f Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Fri, 1 Aug 2025 09:34:20 +0100 Subject: [PATCH 47/57] update docstrings with review comments --- src/zarr/core/metadata/converter/cli.py | 20 +++++++++---------- .../core/metadata/converter/migrate_to_v3.py | 12 +++++------ 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/zarr/core/metadata/converter/cli.py b/src/zarr/core/metadata/converter/cli.py index 07132313c7..5aa61953e4 100644 --- a/src/zarr/core/metadata/converter/cli.py +++ b/src/zarr/core/metadata/converter/cli.py @@ -12,7 +12,7 @@ logger = logging.getLogger(__name__) -def _set_logging_config(verbose: bool) -> None: +def _set_logging_config(*, verbose: bool) -> None: if verbose: lvl = logging.INFO else: @@ -31,7 +31,7 @@ class ZarrFormat(str, Enum): class ZarrFormatV3(str, Enum): - """Limit CLI 
choice to only V3""" + """Limit CLI choice to only v3""" v3 = "v3" @@ -57,7 +57,7 @@ def migrate( str | None, typer.Argument( help=( - "Output location to write generated metadata (no chunks will be copied). If not provided, " + "Output location to write generated metadata (no array data will be copied). If not provided, " "metadata will be written to input_store. Should be a store, path to directory in file system " "or name of zip file e.g. 'data/example-1.zarr', 's3://example-bucket/example'..." ) @@ -66,7 +66,7 @@ def migrate( dry_run: Annotated[ bool, typer.Option( - help="Enable a dry-run: files that would be converted are logged, but no new files are actually created." + help="Enable a dry-run: files that would be converted are logged, but no new files are created or changed." ), ] = False, overwrite: Annotated[ @@ -92,15 +92,15 @@ def migrate( ] = False, ) -> None: """Migrate all v2 metadata in a zarr hierarchy to v3. This will create a zarr.json file for each level - (every group / array). V2 files (.zarray, .zattrs etc.) will be left as-is. + (every group / array). v2 files (.zarray, .zattrs etc.) will be left as-is. """ if dry_run: _set_verbose_level() logger.info( - "Dry run enabled - no new files will be created. Log of files that would be created on a real run:" + "Dry run enabled - no new files will be created or changed. Log of files that would be created on a real run:" ) - write_store = output_store if output_store is not None else input_store + write_store = input_store if output_store is None else output_store if overwrite: sync(migrate_metadata.remove_metadata(write_store, 3, force=force, dry_run=dry_run)) @@ -138,7 +138,7 @@ def remove_metadata( dry_run: Annotated[ bool, typer.Option( - help="Enable a dry-run: files that would be deleted are logged, but no files are actually removed." + help="Enable a dry-run: files that would be deleted are logged, but no files are removed or changed." ), ] = False, ) -> None: @@ -148,7 +148,7 @@ def remove_metadata( if dry_run: _set_verbose_level() logger.info( - "Dry run enabled - no files will be deleted. Log of files that would be deleted on a real run:" + "Dry run enabled - no files will be deleted or changed. Log of files that would be deleted on a real run:" ) sync( @@ -173,7 +173,7 @@ def main( """ See available commands below - access help for individual commands with zarr COMMAND --help. """ - _set_logging_config(verbose) + _set_logging_config(verbose=verbose) if __name__ == "__main__": diff --git a/src/zarr/core/metadata/converter/migrate_to_v3.py b/src/zarr/core/metadata/converter/migrate_to_v3.py index d4cdebe3e7..e850d6e5a4 100644 --- a/src/zarr/core/metadata/converter/migrate_to_v3.py +++ b/src/zarr/core/metadata/converter/migrate_to_v3.py @@ -42,7 +42,7 @@ def migrate_v2_to_v3( ) -> None: """Migrate all v2 metadata in a zarr hierarchy to v3. - This will create a zarr.json file at each level (for every group / array). V2 files (.zarray, .zattrs etc.) + This will create a zarr.json file at each level (for every group / array). v2 files (.zarray, .zattrs etc.) will be left as-is. Parameters @@ -50,14 +50,14 @@ def migrate_v2_to_v3( input_store : StoreLike Input Zarr to migrate - should be a store, path to directory in file system or name of zip file. output_store : StoreLike - Output location to write v3 metadata (no chunks will be copied). If not provided, v3 metadata will be + Output location to write v3 metadata (no array data will be copied). If not provided, v3 metadata will be written to input_store. 
Should be a store, path to directory in file system or name of zip file. storage_options : dict | None, optional If the store is backed by an fsspec-based implementation, then this dict will be passed to the Store constructor for that implementation. Ignored otherwise. Note - the same storage_options will be passed to both input_store and output_store (if provided). dry_run : bool, optional - Enable a 'dry run' - files that would be created are logged, but no files are actually created. + Enable a 'dry run' - files that would be created are logged, but no files are created or changed. """ zarr_v2 = zarr.open(store=input_store, mode="r+", storage_options=storage_options) @@ -77,7 +77,7 @@ def migrate_to_v3(zarr_v2: Array | Group, output_path: StorePath, dry_run: bool """Migrate all v2 metadata in a zarr array/group to v3. Note - if a group is provided, then all arrays / groups within this group will also be converted. - A zarr.json file will be created for each level and written to output_path, with any V2 files + A zarr.json file will be created for each level and written to output_path, with any v2 files (.zarray, .zattrs etc.) left as-is. Parameters @@ -87,7 +87,7 @@ def migrate_to_v3(zarr_v2: Array | Group, output_path: StorePath, dry_run: bool output_path : StorePath The store path to write generated v3 metadata to. dry_run : bool, optional - Enable a 'dry run' - files that would be created are logged, but no files are actually created. + Enable a 'dry run' - files that would be created are logged, but no files are created or changed. """ if not zarr_v2.metadata.zarr_format == 2: raise TypeError("Only arrays / groups with zarr v2 metadata can be converted") @@ -123,7 +123,7 @@ async def remove_metadata( only be allowed when v3 metadata is also present. When True, metadata can be removed when there is no alternative. dry_run : bool, optional - Enable a 'dry run' - files that would be deleted are logged, but no files are actually removed. + Enable a 'dry run' - files that would be deleted are logged, but no files are removed or changed. """ store_path = await make_store_path(store, mode="r+", storage_options=storage_options) if not store_path.store.supports_deletes: From b702060f7d38f773f7a205070792c9486baeabac Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Fri, 1 Aug 2025 10:05:02 +0100 Subject: [PATCH 48/57] pass filters and compressors to processing functions, rather than full metadata --- .../core/metadata/converter/migrate_to_v3.py | 46 +++++++++---------- 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/src/zarr/core/metadata/converter/migrate_to_v3.py b/src/zarr/core/metadata/converter/migrate_to_v3.py index e850d6e5a4..10c8012e9d 100644 --- a/src/zarr/core/metadata/converter/migrate_to_v3.py +++ b/src/zarr/core/metadata/converter/migrate_to_v3.py @@ -23,6 +23,7 @@ ZarrFormat, ) from zarr.core.dtype.common import HasEndianness +from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType from zarr.core.group import Group, GroupMetadata from zarr.core.metadata.v2 import ArrayV2Metadata from zarr.core.metadata.v3 import ArrayV3Metadata @@ -35,6 +36,7 @@ def migrate_v2_to_v3( + *, input_store: StoreLike, output_store: StoreLike | None = None, storage_options: dict[str, Any] | None = None, @@ -197,7 +199,9 @@ def _convert_array_metadata(metadata_v2: ArrayV2Metadata) -> ArrayV3Metadata: if metadata_v2.order == "F": # F is equivalent to order: n-1, ... 
1, 0 codecs.append(TransposeCodec(order=list(range(len(metadata_v2.shape) - 1, -1, -1)))) - codecs.extend(_convert_filters(metadata_v2)) + + if metadata_v2.filters is not None: + codecs.extend(_convert_filters(metadata_v2.filters)) # array-bytes codecs if not isinstance(metadata_v2.dtype, HasEndianness): @@ -206,8 +210,8 @@ def _convert_array_metadata(metadata_v2: ArrayV2Metadata) -> ArrayV3Metadata: codecs.append(BytesCodec(endian=metadata_v2.dtype.endianness)) # bytes-bytes codecs - bytes_bytes_codec = _convert_compressor(metadata_v2) - if bytes_bytes_codec is not None: + if metadata_v2.compressor is not None: + bytes_bytes_codec = _convert_compressor(metadata_v2.compressor, metadata_v2.dtype) codecs.append(bytes_bytes_codec) return ArrayV3Metadata( @@ -223,11 +227,8 @@ def _convert_array_metadata(metadata_v2: ArrayV2Metadata) -> ArrayV3Metadata: ) -def _convert_filters(metadata_v2: ArrayV2Metadata) -> list[ArrayArrayCodec]: - if metadata_v2.filters is None: - return [] - - filters_codecs = [_find_numcodecs_zarr3(filter) for filter in metadata_v2.filters] +def _convert_filters(filters: tuple[numcodecs.abc.Codec, ...]) -> list[ArrayArrayCodec]: + filters_codecs = [_find_numcodecs_zarr3(filter) for filter in filters] for codec in filters_codecs: if not isinstance(codec, ArrayArrayCodec): raise TypeError(f"Filter {type(codec)} is not an ArrayArrayCodec") @@ -235,34 +236,31 @@ def _convert_filters(metadata_v2: ArrayV2Metadata) -> list[ArrayArrayCodec]: return cast(list[ArrayArrayCodec], filters_codecs) -def _convert_compressor(metadata_v2: ArrayV2Metadata) -> BytesBytesCodec | None: - if metadata_v2.compressor is None: - return None - - compressor_name = metadata_v2.compressor.codec_id - - match compressor_name: +def _convert_compressor( + compressor: numcodecs.abc.Codec, dtype: ZDType[TBaseDType, TBaseScalar] +) -> BytesBytesCodec: + match compressor.codec_id: case "blosc": return BloscCodec( - typesize=metadata_v2.dtype.to_native_dtype().itemsize, - cname=metadata_v2.compressor.cname, - clevel=metadata_v2.compressor.clevel, - shuffle=BloscShuffle.from_int(metadata_v2.compressor.shuffle), - blocksize=metadata_v2.compressor.blocksize, + typesize=dtype.to_native_dtype().itemsize, + cname=compressor.cname, + clevel=compressor.clevel, + shuffle=BloscShuffle.from_int(compressor.shuffle), + blocksize=compressor.blocksize, ) case "zstd": return ZstdCodec( - level=metadata_v2.compressor.level, - checksum=metadata_v2.compressor.checksum, + level=compressor.level, + checksum=compressor.checksum, ) case "gzip": - return GzipCodec(level=metadata_v2.compressor.level) + return GzipCodec(level=compressor.level) case _: # If possible, find matching numcodecs.zarr3 codec - compressor_codec = _find_numcodecs_zarr3(metadata_v2.compressor) + compressor_codec = _find_numcodecs_zarr3(compressor) if not isinstance(compressor_codec, BytesBytesCodec): raise TypeError(f"Compressor {type(compressor_codec)} is not a BytesBytesCodec") From b900a0e4657505c02569e42f0bff935944db8b1e Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Fri, 1 Aug 2025 12:06:56 +0100 Subject: [PATCH 49/57] use Store as input rather than StoreLike --- src/zarr/core/metadata/converter/cli.py | 14 +- .../core/metadata/converter/migrate_to_v3.py | 46 ++--- src/zarr/storage/_common.py | 174 +++++++++++------- 3 files changed, 136 insertions(+), 98 deletions(-) diff --git a/src/zarr/core/metadata/converter/cli.py b/src/zarr/core/metadata/converter/cli.py index 5aa61953e4..f59a925789 100644 --- 
a/src/zarr/core/metadata/converter/cli.py +++ b/src/zarr/core/metadata/converter/cli.py @@ -6,6 +6,7 @@ import zarr.core.metadata.converter.migrate_to_v3 as migrate_metadata from zarr.core.sync import sync +from zarr.storage._common import make_store app = typer.Typer() @@ -100,13 +101,19 @@ def migrate( "Dry run enabled - no new files will be created or changed. Log of files that would be created on a real run:" ) - write_store = input_store if output_store is None else output_store + input_zarr_store = sync(make_store(input_store, mode="r+")) + + if output_store is not None: + output_zarr_store = sync(make_store(output_store, mode="w-")) + write_store = output_zarr_store + else: + write_store = input_zarr_store if overwrite: sync(migrate_metadata.remove_metadata(write_store, 3, force=force, dry_run=dry_run)) migrate_metadata.migrate_v2_to_v3( - input_store=input_store, output_store=output_store, dry_run=dry_run + input_store=input_zarr_store, output_store=output_zarr_store, dry_run=dry_run ) if remove_v2_metadata: @@ -150,10 +157,11 @@ def remove_metadata( logger.info( "Dry run enabled - no files will be deleted or changed. Log of files that would be deleted on a real run:" ) + input_zarr_store = sync(make_store(store, mode="r+")) sync( migrate_metadata.remove_metadata( - store=store, + store=input_zarr_store, zarr_format=cast(Literal[2, 3], int(zarr_format[1:])), force=force, dry_run=dry_run, diff --git a/src/zarr/core/metadata/converter/migrate_to_v3.py b/src/zarr/core/metadata/converter/migrate_to_v3.py index 10c8012e9d..3d1d754db4 100644 --- a/src/zarr/core/metadata/converter/migrate_to_v3.py +++ b/src/zarr/core/metadata/converter/migrate_to_v3.py @@ -1,11 +1,12 @@ import asyncio import logging -from typing import Any, Literal, cast +from typing import Literal, cast import numcodecs.abc import zarr from zarr.abc.codec import ArrayArrayCodec, BytesBytesCodec, Codec +from zarr.abc.store import Store from zarr.codecs.blosc import BloscCodec, BloscShuffle from zarr.codecs.bytes import BytesCodec from zarr.codecs.gzip import GzipCodec @@ -29,17 +30,15 @@ from zarr.core.metadata.v3 import ArrayV3Metadata from zarr.core.sync import sync from zarr.registry import get_codec_class -from zarr.storage import StoreLike -from zarr.storage._common import StorePath, make_store_path +from zarr.storage import StorePath logger = logging.getLogger(__name__) def migrate_v2_to_v3( *, - input_store: StoreLike, - output_store: StoreLike | None = None, - storage_options: dict[str, Any] | None = None, + input_store: Store, + output_store: Store | None = None, dry_run: bool = False, ) -> None: """Migrate all v2 metadata in a zarr hierarchy to v3. @@ -49,26 +48,20 @@ def migrate_v2_to_v3( Parameters ---------- - input_store : StoreLike - Input Zarr to migrate - should be a store, path to directory in file system or name of zip file. - output_store : StoreLike + input_store : Store + Input Zarr to migrate. + output_store : Store, optional Output location to write v3 metadata (no array data will be copied). If not provided, v3 metadata will be - written to input_store. Should be a store, path to directory in file system or name of zip file. - storage_options : dict | None, optional - If the store is backed by an fsspec-based implementation, then this dict will be passed to - the Store constructor for that implementation. Ignored otherwise. Note - the same storage_options will - be passed to both input_store and output_store (if provided). + written to input_store. 
dry_run : bool, optional Enable a 'dry run' - files that would be created are logged, but no files are created or changed. """ - zarr_v2 = zarr.open(store=input_store, mode="r+", storage_options=storage_options) + zarr_v2 = zarr.open(store=input_store, mode="r+") if output_store is not None: # w- access to not allow overwrite of existing data - output_path = sync( - make_store_path(output_store, mode="w-", storage_options=storage_options) - ) + output_path = sync(StorePath.open(output_store, path="", mode="w-")) else: output_path = zarr_v2.store_path @@ -101,9 +94,8 @@ def migrate_to_v3(zarr_v2: Array | Group, output_path: StorePath, dry_run: bool async def remove_metadata( - store: StoreLike, + store: Store, zarr_format: ZarrFormat, - storage_options: dict[str, Any] | None = None, force: bool = False, dry_run: bool = False, ) -> None: @@ -113,13 +105,10 @@ async def remove_metadata( Parameters ---------- - store : StoreLike - Store or path to directory in file system or name of zip file. + store : Store + Zarr to remove metadata from. zarr_format : ZarrFormat Which format's metadata to remove - 2 or 3. - storage_options : dict | None, optional - If the store is backed by an fsspec-based implementation, then this dict will be passed to - the Store constructor for that implementation. Ignored otherwise. force : bool, optional When False, metadata can only be removed if a valid alternative exists e.g. deletion of v2 metadata will only be allowed when v3 metadata is also present. When True, metadata can be removed when there is no @@ -127,9 +116,10 @@ async def remove_metadata( dry_run : bool, optional Enable a 'dry run' - files that would be deleted are logged, but no files are removed or changed. """ - store_path = await make_store_path(store, mode="r+", storage_options=storage_options) - if not store_path.store.supports_deletes: + + if not store.supports_deletes: raise ValueError("Store must support deletes to remove metadata") + store_path = await StorePath.open(store, path="", mode="r+") metadata_files_all = { 2: [ZARRAY_JSON, ZATTRS_JSON, ZGROUP_JSON, ZMETADATA_V2_JSON], @@ -142,7 +132,7 @@ async def remove_metadata( alternative_metadata = 2 awaitables = [] - async for file_path in store_path.store.list(): + async for file_path in store.list(): parent_path, _, file_name = file_path.rpartition("/") if file_name not in metadata_files_all[zarr_format]: diff --git a/src/zarr/storage/_common.py b/src/zarr/storage/_common.py index e25fa28424..cf0ab45ca8 100644 --- a/src/zarr/storage/_common.py +++ b/src/zarr/storage/_common.py @@ -267,35 +267,105 @@ def __eq__(self, other: object) -> bool: StoreLike: TypeAlias = Store | StorePath | FSMap | Path | str | dict[str, Buffer] -async def make_store_path( +async def make_store( store_like: StoreLike | None, *, - path: str | None = "", mode: AccessModeLiteral | None = None, storage_options: dict[str, Any] | None = None, -) -> StorePath: +) -> Store: """ - Convert a `StoreLike` object into a StorePath object. + Convert a `StoreLike` object into a Store object. + + `StoreLike` objects are converted to `Store` as follows: + + - `Store` or `StorePath` = `Store` object. + - `Path` or `str` = `LocalStore` object. + - `str` that starts with a protocol = `FsspecStore` object. + - `dict[str, Buffer]` = `MemoryStore` object. + - `None` = `MemoryStore` object. + - `FSMap` = `FsspecStore` object. + + Parameters + ---------- + store_like : StoreLike | None + The object to convert to a `Store` object. 
+ mode : StoreAccessMode | None, optional + The mode to use when creating the `Store` object. If None, the + default mode is 'r'. + storage_options : dict[str, Any] | None, optional + The storage options to use when creating the `RemoteStore` object. If + None, the default storage options are used. + + Returns + ------- + Store + The converted Store object. + + Raises + ------ + TypeError + If the StoreLike object is not one of the supported types, or if storage_options is provided but not used. + ValueError + If storage_options is provided for a store that does not support it. + """ + from zarr.storage._fsspec import FsspecStore # circular import - This function takes a `StoreLike` object and returns a `StorePath` object. The - `StoreLike` object can be a `Store`, `StorePath`, `Path`, `str`, or `dict[str, Buffer]`. - If the `StoreLike` object is a Store or `StorePath`, it is converted to a - `StorePath` object. If the `StoreLike` object is a Path or str, it is converted - to a LocalStore object and then to a `StorePath` object. If the `StoreLike` - object is a dict[str, Buffer], it is converted to a `MemoryStore` object and - then to a `StorePath` object. + used_storage_options = False + assert mode in (None, "r", "r+", "a", "w", "w-") + + # if mode 'r' was provided, we'll open any new stores as read-only + _read_only = mode == "r" - If the `StoreLike` object is None, a `MemoryStore` object is created and - converted to a `StorePath` object. + if isinstance(store_like, StorePath): + store = store_like.store + elif isinstance(store_like, Store): + store = store_like + elif store_like is None: + store = await MemoryStore.open(read_only=_read_only) + elif isinstance(store_like, Path): + store = await LocalStore.open(root=store_like, read_only=_read_only) + elif isinstance(store_like, str): + storage_options = storage_options or {} + + if _is_fsspec_uri(store_like): + used_storage_options = True + store = FsspecStore.from_url( + store_like, storage_options=storage_options, read_only=_read_only + ) + else: + store = await LocalStore.open(root=Path(store_like), read_only=_read_only) + elif isinstance(store_like, dict): + # We deliberate only consider dict[str, Buffer] here, and not arbitrary mutable mappings. + # By only allowing dictionaries, which are in-memory, we know that MemoryStore appropriate. + store = await MemoryStore.open(store_dict=store_like, read_only=_read_only) + elif _has_fsspec and isinstance(store_like, FSMap): + if storage_options: + raise ValueError( + "'storage_options was provided but is not used for FSMap store_like objects. Specify the storage options when creating the FSMap instance instead." + ) + store = FsspecStore.from_mapper(store_like, read_only=_read_only) + else: + raise TypeError(f"Unsupported type for store_like: '{type(store_like).__name__}'") - If the `StoreLike` object is a str and starts with a protocol, it is - converted to a RemoteStore object and then to a `StorePath` object. + if storage_options and not used_storage_options: + msg = "'storage_options' was provided but unused. 'storage_options' is only used for fsspec filesystem stores." + raise TypeError(msg) - If the `StoreLike` object is a dict[str, Buffer] and the mode is not None, - the `MemoryStore` object is created with the given mode. + return store - If the `StoreLike` object is a str and starts with a protocol, the - RemoteStore object is created with the given mode and storage options. 
+ +async def make_store_path( + store_like: StoreLike | None, + *, + path: str | None = "", + mode: AccessModeLiteral | None = None, + storage_options: dict[str, Any] | None = None, +) -> StorePath: + """ + Convert a `StoreLike` object into a StorePath object. + + This function takes a `StoreLike` object and returns a `StorePath` object. See `make_store` for details + of which `Store` is used for each type of `store_like` object. Parameters ---------- @@ -319,58 +389,28 @@ async def make_store_path( Raises ------ TypeError - If the StoreLike object is not one of the supported types. - """ - from zarr.storage._fsspec import FsspecStore # circular import + If the StoreLike object is not one of the supported types, or if storage_options is provided but not used. + ValueError + If storage_options is provided for a store that does not support it. - used_storage_options = False + See Also + -------- + make_store + """ path_normalized = normalize_path(path) + if isinstance(store_like, StorePath): - result = store_like / path_normalized + if storage_options: + msg = "'storage_options' was provided but unused. 'storage_options' is only used for fsspec filesystem stores." + raise TypeError(msg) + return store_like / path_normalized + elif _has_fsspec and isinstance(store_like, FSMap) and path: + raise ValueError( + "'path' was provided but is not used for FSMap store_like objects. Specify the path when creating the FSMap instance instead." + ) else: - assert mode in (None, "r", "r+", "a", "w", "w-") - # if mode 'r' was provided, we'll open any new stores as read-only - _read_only = mode == "r" - if isinstance(store_like, Store): - store = store_like - elif store_like is None: - store = await MemoryStore.open(read_only=_read_only) - elif isinstance(store_like, Path): - store = await LocalStore.open(root=store_like, read_only=_read_only) - elif isinstance(store_like, str): - storage_options = storage_options or {} - - if _is_fsspec_uri(store_like): - used_storage_options = True - store = FsspecStore.from_url( - store_like, storage_options=storage_options, read_only=_read_only - ) - else: - store = await LocalStore.open(root=Path(store_like), read_only=_read_only) - elif isinstance(store_like, dict): - # We deliberate only consider dict[str, Buffer] here, and not arbitrary mutable mappings. - # By only allowing dictionaries, which are in-memory, we know that MemoryStore appropriate. - store = await MemoryStore.open(store_dict=store_like, read_only=_read_only) - elif _has_fsspec and isinstance(store_like, FSMap): - if path: - raise ValueError( - "'path' was provided but is not used for FSMap store_like objects. Specify the path when creating the FSMap instance instead." - ) - if storage_options: - raise ValueError( - "'storage_options was provided but is not used for FSMap store_like objects. Specify the storage options when creating the FSMap instance instead." - ) - store = FsspecStore.from_mapper(store_like, read_only=_read_only) - else: - raise TypeError(f"Unsupported type for store_like: '{type(store_like).__name__}'") - - result = await StorePath.open(store, path=path_normalized, mode=mode) - - if storage_options and not used_storage_options: - msg = "'storage_options' was provided but unused. 'storage_options' is only used for fsspec filesystem stores." 
- raise TypeError(msg) - - return result + store = await make_store(store_like, mode=mode, storage_options=storage_options) + return await StorePath.open(store, path=path_normalized, mode=mode) def _is_fsspec_uri(uri: str) -> bool: From 42aa7dbde52a6a58108daf8b587f96df132cd7c0 Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Fri, 1 Aug 2025 13:16:45 +0100 Subject: [PATCH 50/57] move conversion functions into public api --- pyproject.toml | 2 +- src/zarr/{core/metadata/converter => _cli}/__init__.py | 0 src/zarr/{core/metadata/converter => _cli}/cli.py | 3 ++- src/zarr/metadata/__init__.py | 3 +++ .../converter/migrate_to_v3.py => metadata/migrate_v3.py} | 0 tests/test_cli/{test_migrate_to_v3.py => test_migrate_v3.py} | 4 +--- 6 files changed, 7 insertions(+), 5 deletions(-) rename src/zarr/{core/metadata/converter => _cli}/__init__.py (100%) rename src/zarr/{core/metadata/converter => _cli}/cli.py (98%) create mode 100644 src/zarr/metadata/__init__.py rename src/zarr/{core/metadata/converter/migrate_to_v3.py => metadata/migrate_v3.py} (100%) rename tests/test_cli/{test_migrate_to_v3.py => test_migrate_v3.py} (99%) diff --git a/pyproject.toml b/pyproject.toml index df131e7a21..88a53a8089 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -115,7 +115,7 @@ docs = [ ] [project.scripts] -zarr = "zarr.core.metadata.converter.cli:app" +zarr = "zarr._cli.cli:app" [project.urls] diff --git a/src/zarr/core/metadata/converter/__init__.py b/src/zarr/_cli/__init__.py similarity index 100% rename from src/zarr/core/metadata/converter/__init__.py rename to src/zarr/_cli/__init__.py diff --git a/src/zarr/core/metadata/converter/cli.py b/src/zarr/_cli/cli.py similarity index 98% rename from src/zarr/core/metadata/converter/cli.py rename to src/zarr/_cli/cli.py index f59a925789..a228d0998f 100644 --- a/src/zarr/core/metadata/converter/cli.py +++ b/src/zarr/_cli/cli.py @@ -4,7 +4,7 @@ import typer -import zarr.core.metadata.converter.migrate_to_v3 as migrate_metadata +import zarr.metadata.migrate_v3 as migrate_metadata from zarr.core.sync import sync from zarr.storage._common import make_store @@ -107,6 +107,7 @@ def migrate( output_zarr_store = sync(make_store(output_store, mode="w-")) write_store = output_zarr_store else: + output_zarr_store = None write_store = input_zarr_store if overwrite: diff --git a/src/zarr/metadata/__init__.py b/src/zarr/metadata/__init__.py new file mode 100644 index 0000000000..9c4e834458 --- /dev/null +++ b/src/zarr/metadata/__init__.py @@ -0,0 +1,3 @@ +from zarr.metadata.migrate_v3 import migrate_to_v3, migrate_v2_to_v3, remove_metadata + +__all__ = ["migrate_to_v3", "migrate_v2_to_v3", "remove_metadata"] diff --git a/src/zarr/core/metadata/converter/migrate_to_v3.py b/src/zarr/metadata/migrate_v3.py similarity index 100% rename from src/zarr/core/metadata/converter/migrate_to_v3.py rename to src/zarr/metadata/migrate_v3.py diff --git a/tests/test_cli/test_migrate_to_v3.py b/tests/test_cli/test_migrate_v3.py similarity index 99% rename from tests/test_cli/test_migrate_to_v3.py rename to tests/test_cli/test_migrate_v3.py index 6ce125c755..c3ccd02601 100644 --- a/tests/test_cli/test_migrate_to_v3.py +++ b/tests/test_cli/test_migrate_v3.py @@ -27,9 +27,7 @@ typer_testing = pytest.importorskip( "typer.testing", reason="optional cli dependencies aren't installed" ) -cli = pytest.importorskip( - "zarr.core.metadata.converter.cli", reason="optional cli dependencies aren't installed" -) +cli = pytest.importorskip("zarr._cli.cli", 
reason="optional cli dependencies aren't installed") runner = typer_testing.CliRunner() From f62fe3171d332d48c29afd4808773e443a83b1d5 Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Mon, 4 Aug 2025 13:42:13 +0100 Subject: [PATCH 51/57] fail on discovery of consolidated metadata --- src/zarr/metadata/migrate_v3.py | 3 +++ tests/test_cli/test_migrate_v3.py | 13 +++++++++++++ 2 files changed, 16 insertions(+) diff --git a/src/zarr/metadata/migrate_v3.py b/src/zarr/metadata/migrate_v3.py index 3d1d754db4..d5c18acf85 100644 --- a/src/zarr/metadata/migrate_v3.py +++ b/src/zarr/metadata/migrate_v3.py @@ -154,6 +154,9 @@ async def remove_metadata( def _convert_group(zarr_v2: Group, output_path: StorePath, dry_run: bool) -> None: + if zarr_v2.metadata.consolidated_metadata is not None: + raise NotImplementedError("Migration of consolidated metadata isn't supported.") + # process members of the group for key in zarr_v2: migrate_to_v3(zarr_v2[key], output_path=output_path / key, dry_run=dry_run) diff --git a/tests/test_cli/test_migrate_v3.py b/tests/test_cli/test_migrate_v3.py index c3ccd02601..187330323e 100644 --- a/tests/test_cli/test_migrate_v3.py +++ b/tests/test_cli/test_migrate_v3.py @@ -448,6 +448,19 @@ def test_migrate_v3(local_store: LocalStore, node_type: str) -> None: assert str(result.exception) == "Only arrays / groups with zarr v2 metadata can be converted" +def test_migrate_consolidated_metadata(local_store: LocalStore) -> None: + """Attempting to convert a group with consolidated metadata should always fail""" + + group = zarr.create_group(store=local_store, zarr_format=2) + group.create_array(shape=(1,), name="a", dtype="uint8") + zarr.consolidate_metadata(local_store) + + result = runner.invoke(cli.app, ["migrate", "v3", str(local_store.root)]) + assert result.exit_code == 1 + assert isinstance(result.exception, NotImplementedError) + assert str(result.exception) == "Migration of consolidated metadata isn't supported." 
+ + def test_migrate_unknown_codec(local_store: LocalStore) -> None: """Attempting to convert a codec without a v3 equivalent should always fail""" From 71067bad8fc5b552a40f5600f65dd21514605bee Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Wed, 6 Aug 2025 11:15:18 +0100 Subject: [PATCH 52/57] minor changes from review --- src/zarr/storage/_common.py | 2 +- tests/test_cli/test_migrate_v3.py | 41 ++++++++++++++++--------------- 2 files changed, 22 insertions(+), 21 deletions(-) diff --git a/src/zarr/storage/_common.py b/src/zarr/storage/_common.py index 99ca98965c..d3c0ade804 100644 --- a/src/zarr/storage/_common.py +++ b/src/zarr/storage/_common.py @@ -321,7 +321,7 @@ async def make_store( _read_only = mode == "r" if isinstance(store_like, StorePath): - # Already a StorePath + # Get underlying store return store_like.store elif isinstance(store_like, Store): diff --git a/tests/test_cli/test_migrate_v3.py b/tests/test_cli/test_migrate_v3.py index 187330323e..ef974d8b54 100644 --- a/tests/test_cli/test_migrate_v3.py +++ b/tests/test_cli/test_migrate_v3.py @@ -21,7 +21,8 @@ from zarr.core.chunk_key_encodings import V2ChunkKeyEncoding from zarr.core.common import JSON, ZarrFormat from zarr.core.dtype.npy.int import UInt8, UInt16 -from zarr.core.group import Group +from zarr.core.group import Group, GroupMetadata +from zarr.core.metadata.v3 import ArrayV3Metadata from zarr.storage._local import LocalStore typer_testing = pytest.importorskip( @@ -56,21 +57,22 @@ def test_migrate_array(local_store: LocalStore) -> None: assert (local_store.root / "zarr.json").exists() zarr_array = zarr.open(local_store.root, zarr_format=3) - metadata = zarr_array.metadata - assert metadata.zarr_format == 3 - assert metadata.node_type == "array" - assert metadata.shape == shape - assert metadata.chunk_grid == RegularChunkGrid(chunk_shape=chunks) - assert metadata.chunk_key_encoding == V2ChunkKeyEncoding(separator=".") - assert metadata.data_type == UInt16(endianness="little") - assert metadata.codecs == ( - BytesCodec(endian="little"), - BloscCodec(typesize=2, cname="zstd", clevel=3, shuffle="shuffle", blocksize=0), + + expected_metadata = ArrayV3Metadata( + shape=shape, + data_type=UInt16(endianness="little"), + chunk_grid=RegularChunkGrid(chunk_shape=chunks), + chunk_key_encoding=V2ChunkKeyEncoding(separator="."), + fill_value=fill_value, + codecs=( + BytesCodec(endian="little"), + BloscCodec(typesize=2, cname="zstd", clevel=3, shuffle="shuffle", blocksize=0), + ), + attributes=attributes, + dimension_names=None, + storage_transformers=None, ) - assert metadata.fill_value == fill_value - assert metadata.attributes == attributes - assert metadata.dimension_names is None - assert metadata.storage_transformers == () + assert zarr_array.metadata == expected_metadata def test_migrate_group(local_store: LocalStore) -> None: @@ -82,11 +84,10 @@ def test_migrate_group(local_store: LocalStore) -> None: assert (local_store.root / "zarr.json").exists() zarr_array = zarr.open(local_store.root, zarr_format=3) - metadata = zarr_array.metadata - assert metadata.zarr_format == 3 - assert metadata.node_type == "group" - assert metadata.attributes == attributes - assert metadata.consolidated_metadata is None + expected_metadata = GroupMetadata( + attributes=attributes, zarr_format=3, consolidated_metadata=None + ) + assert zarr_array.metadata == expected_metadata @pytest.mark.parametrize("separator", [".", "/"]) From 34e97f0fb4cfbaaa22248ff6004681a08693b72c Mon Sep 17 00:00:00 
2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Wed, 6 Aug 2025 13:18:37 +0100 Subject: [PATCH 53/57] use same logger throughout zarr-python --- src/zarr/__init__.py | 58 ++++++++++++++++++++++++++++++++++++++++++++ src/zarr/_cli/cli.py | 21 +++++++--------- 2 files changed, 67 insertions(+), 12 deletions(-) diff --git a/src/zarr/__init__.py b/src/zarr/__init__.py index 0d58ecf8e8..3c6195c28f 100644 --- a/src/zarr/__init__.py +++ b/src/zarr/__init__.py @@ -1,3 +1,7 @@ +import functools +import logging +from typing import Literal + from zarr._version import version as __version__ from zarr.api.synchronous import ( array, @@ -37,6 +41,8 @@ # in case setuptools scm screw up and find version to be 0.0.0 assert not __version__.startswith("0.0.0") +_logger = logging.getLogger(__name__) + def print_debug_info() -> None: """ @@ -85,6 +91,58 @@ def print_packages(packages: list[str]) -> None: print_packages(optional) +# The decorator ensures this always returns the same handler (and it is only +# attached once). +@functools.cache +def _ensure_handler() -> logging.Handler: + """ + The first time this function is called, attach a `StreamHandler` using the + same format as `logging.basicConfig` to the Zarr-Python root logger. + + Return this handler every time this function is called. + """ + handler = logging.StreamHandler() + handler.setFormatter(logging.Formatter(logging.BASIC_FORMAT)) + _logger.addHandler(handler) + return handler + + +def set_log_level( + level: Literal["NOTSET", "DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], +) -> None: + """Set the logging level for Zarr-Python. + + Zarr-Python uses the standard library `logging` framework under the root + logger 'zarr'. This is a helper function to: + + - set Zarr-Python's root logger level + - set the root logger handler's level, creating the handler + if it does not exist yet + + Parameters + ---------- + level : str + The logging level to set. + """ + _logger.setLevel(level) + _ensure_handler().setLevel(level) + + +def set_format(log_format: str) -> None: + """Set the format of logging messages from Zarr-Python. + + Zarr-Python uses the standard library `logging` framework under the root + logger 'zarr'. This sets the format of log messages from the root logger's StreamHandler. + + Parameters + ---------- + log_format : str + A string determining the log format (as defined in the standard library's `logging` module + for logging.Formatter) + """ + _ensure_handler().setFormatter(logging.Formatter(fmt=log_format)) + + __all__ = [ "Array", "AsyncArray", diff --git a/src/zarr/_cli/cli.py b/src/zarr/_cli/cli.py index a228d0998f..785efe505b 100644 --- a/src/zarr/_cli/cli.py +++ b/src/zarr/_cli/cli.py @@ -4,6 +4,7 @@ import typer +import zarr import zarr.metadata.migrate_v3 as migrate_metadata from zarr.core.sync import sync from zarr.storage._common import make_store @@ -13,17 +14,13 @@ logger = logging.getLogger(__name__) -def _set_logging_config(*, verbose: bool) -> None: +def _set_logging_level(*, verbose: bool) -> None: if verbose: - lvl = logging.INFO + lvl = "INFO" else: - lvl = logging.WARNING - fmt = "%(message)s" - logging.basicConfig(level=lvl, format=fmt) - - -def _set_verbose_level() -> None: - logging.getLogger().setLevel(logging.INFO) + lvl = "WARNING" + zarr.set_log_level(cast(Literal["INFO", "WARNING"], lvl)) + zarr.set_format("%(message)s") class ZarrFormat(str, Enum): @@ -96,7 +93,7 @@ def migrate( (every group / array). v2 files (.zarray, .zattrs etc.) will be left as-is. 
""" if dry_run: - _set_verbose_level() + _set_logging_level(verbose=True) logger.info( "Dry run enabled - no new files will be created or changed. Log of files that would be created on a real run:" ) @@ -154,7 +151,7 @@ def remove_metadata( Note - this will remove metadata files at all levels of the hierarchy (every group and array). """ if dry_run: - _set_verbose_level() + _set_logging_level(verbose=True) logger.info( "Dry run enabled - no files will be deleted or changed. Log of files that would be deleted on a real run:" ) @@ -182,7 +179,7 @@ def main( """ See available commands below - access help for individual commands with zarr COMMAND --help. """ - _set_logging_config(verbose=verbose) + _set_logging_level(verbose=verbose) if __name__ == "__main__": From 9f6b87513fed2289c01d28595876468ddcda3b72 Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Wed, 6 Aug 2025 15:27:49 +0100 Subject: [PATCH 54/57] add release notes and docs for the cli --- changes/1798.feature.rst | 2 + docs/user-guide/cli.rst | 114 ++++++++++++++++++++++++++++++++++++++ docs/user-guide/index.rst | 1 + 3 files changed, 117 insertions(+) create mode 100644 changes/1798.feature.rst create mode 100644 docs/user-guide/cli.rst diff --git a/changes/1798.feature.rst b/changes/1798.feature.rst new file mode 100644 index 0000000000..64d4efdf08 --- /dev/null +++ b/changes/1798.feature.rst @@ -0,0 +1,2 @@ +Add a command-line interface to migrate v2 Zarr metadata to v3. Corresponding functions are also +provided under zarr.metadata. diff --git a/docs/user-guide/cli.rst b/docs/user-guide/cli.rst new file mode 100644 index 0000000000..90d06e2d83 --- /dev/null +++ b/docs/user-guide/cli.rst @@ -0,0 +1,114 @@ +.. _user-guide-cli: + +Command-line interface +======================== + +Zarr-Python provides a command-line interface that enables: + +- migration of Zarr v2 metadata to v3 +- removal of v2 or v3 metadata + +To see available commands run the following in a terminal: + +.. code-block:: bash + + $ zarr --help + +or to get help on individual commands: + +.. code-block:: bash + + $ zarr migrate --help + + $ zarr remove-metadata --help + + +Migrate metadata from v2 to v3 +------------------------------ + +Migrate to a separate location +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To migrate a Zarr array/group's metadata from v2 to v3 run: + +.. code-block:: bash + + $ zarr migrate v3 path/to/input.zarr path/to/output.zarr + +This will write new ``zarr.json`` files to ``output.zarr``, leaving ``input.zarr`` un-touched. +Note - this will migrate the entire Zarr hierarchy, so if ``input.zarr`` contains multiple groups/arrays, +new ``zarr.json`` will be made for all of them. + +Migrate in-place +~~~~~~~~~~~~~~~~ + +If you'd prefer to migrate the metadata in-place run: + +.. code-block:: bash + + $ zarr migrate v3 path/to/input.zarr + +This will write new ``zarr.json`` files to ``input.zarr``, leaving the existing v2 metadata un-touched. + +To open the array/group using the new metadata use: + +.. code-block:: python + + >>> import zarr + >>> zarr_with_v3_metadata = zarr.open('path/to/input.zarr', zarr_format=3) + +Once you are happy with the conversion, you can run the following to remove the old v2 metadata: + +.. code-block:: bash + + $ zarr remove-metadata v2 path/to/input.zarr + +Note there is also a shortcut to migrate and remove v2 metadata in one step: + +.. 
code-block:: bash + + $ zarr migrate v3 path/to/input.zarr --remove-v2-metadata + + +Dry run +-------- +All commands provide a ``--dry-run`` option that will log changes that would be made on a real run, without creating +or modifying any files. + +.. code-block:: bash + + $ zarr migrate v3 path/to/input.zarr --dry-run + + Dry run enabled - no new files will be created or changed. Log of files that would be created on a real run: + Saving metadata to path/to/input.zarr/zarr.json + +Remove metadata +---------------- + +Remove v2 metadata using: + +.. code-block:: bash + + $ zarr remove-metadata v2 path/to/input.zarr + +or v3 with: + +.. code-block:: bash + + $ zarr remove-metadata v3 path/to/input.zarr + +By default, this will only allow removal of metadata if a valid alternative exists. For example, you can't +remove v2 metadata unless v3 metadata exists at that location. + +To override this behaviour use ``--force``: + +.. code-block:: bash + + $ zarr remove-metadata v3 path/to/input.zarr --force + +Equivalent functions +-------------------- +All features of the command-line interface are also available via functions under +:mod:`zarr.metadata`. + + diff --git a/docs/user-guide/index.rst b/docs/user-guide/index.rst index f92c576f32..a83a30172b 100644 --- a/docs/user-guide/index.rst +++ b/docs/user-guide/index.rst @@ -13,6 +13,7 @@ User guide storage config v3_migration + cli Advanced Topics --------------- From 1362cc6edd275fabb0a555c07528750e6808fc76 Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Wed, 6 Aug 2025 15:52:28 +0100 Subject: [PATCH 55/57] tidy up formatting of zarr.metadata api docs --- src/zarr/metadata/__init__.py | 3 --- src/zarr/metadata/migrate_v3.py | 18 +++++++++--------- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/src/zarr/metadata/__init__.py b/src/zarr/metadata/__init__.py index 9c4e834458..e69de29bb2 100644 --- a/src/zarr/metadata/__init__.py +++ b/src/zarr/metadata/__init__.py @@ -1,3 +0,0 @@ -from zarr.metadata.migrate_v3 import migrate_to_v3, migrate_v2_to_v3, remove_metadata - -__all__ = ["migrate_to_v3", "migrate_v2_to_v3", "remove_metadata"] diff --git a/src/zarr/metadata/migrate_v3.py b/src/zarr/metadata/migrate_v3.py index d5c18acf85..c2a2f4494a 100644 --- a/src/zarr/metadata/migrate_v3.py +++ b/src/zarr/metadata/migrate_v3.py @@ -5,6 +5,7 @@ import numcodecs.abc import zarr +from zarr import Array, Group from zarr.abc.codec import ArrayArrayCodec, BytesBytesCodec, Codec from zarr.abc.store import Store from zarr.codecs.blosc import BloscCodec, BloscShuffle @@ -12,7 +13,6 @@ from zarr.codecs.gzip import GzipCodec from zarr.codecs.transpose import TransposeCodec from zarr.codecs.zstd import ZstdCodec -from zarr.core.array import Array from zarr.core.buffer.core import default_buffer_prototype from zarr.core.chunk_key_encodings import V2ChunkKeyEncoding from zarr.core.common import ( @@ -25,14 +25,14 @@ ) from zarr.core.dtype.common import HasEndianness from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType -from zarr.core.group import Group, GroupMetadata +from zarr.core.group import GroupMetadata from zarr.core.metadata.v2 import ArrayV2Metadata from zarr.core.metadata.v3 import ArrayV3Metadata from zarr.core.sync import sync from zarr.registry import get_codec_class from zarr.storage import StorePath -logger = logging.getLogger(__name__) +_logger = logging.getLogger(__name__) def migrate_v2_to_v3( @@ -41,10 +41,10 @@ def migrate_v2_to_v3( output_store: Store | None = None, 
dry_run: bool = False, ) -> None: - """Migrate all v2 metadata in a zarr hierarchy to v3. + """Migrate all v2 metadata in a Zarr store to v3. - This will create a zarr.json file at each level (for every group / array). v2 files (.zarray, .zattrs etc.) - will be left as-is. + This will create a zarr.json file at each level of a Zarr hierarchy (for every group / array). + v2 files (.zarray, .zattrs etc.) will be left as-is. Parameters ---------- @@ -69,7 +69,7 @@ def migrate_v2_to_v3( def migrate_to_v3(zarr_v2: Array | Group, output_path: StorePath, dry_run: bool = False) -> None: - """Migrate all v2 metadata in a zarr array/group to v3. + """Migrate all v2 metadata in a Zarr array/group to v3. Note - if a group is provided, then all arrays / groups within this group will also be converted. A zarr.json file will be created for each level and written to output_path, with any v2 files @@ -141,7 +141,7 @@ async def remove_metadata( if force or await _metadata_exists( cast(Literal[2, 3], alternative_metadata), store_path / parent_path ): - logger.info("Deleting metadata at %s", store_path / file_path) + _logger.info("Deleting metadata at %s", store_path / file_path) if not dry_run: awaitables.append((store_path / file_path).delete()) else: @@ -287,7 +287,7 @@ async def _save_v3_metadata( if await zarr_json_path.exists(): raise ValueError(f"{ZARR_JSON} already exists at {zarr_json_path}") - logger.info("Saving metadata to %s", zarr_json_path) + _logger.info("Saving metadata to %s", zarr_json_path) to_save = metadata_v3.to_buffer_dict(default_buffer_prototype()) if not dry_run: From f30117241a1a9582864b83ae6feb6f4f6e8e16a1 Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Wed, 6 Aug 2025 17:53:27 +0100 Subject: [PATCH 56/57] fix failing tests --- tests/test_cli/test_migrate_v3.py | 80 +++++++++++++++++++++---------- 1 file changed, 55 insertions(+), 25 deletions(-) diff --git a/tests/test_cli/test_migrate_v3.py b/tests/test_cli/test_migrate_v3.py index ef974d8b54..b72562e5a3 100644 --- a/tests/test_cli/test_migrate_v3.py +++ b/tests/test_cli/test_migrate_v3.py @@ -6,7 +6,6 @@ import numcodecs.abc import numpy as np import pytest -from numcodecs.zarr3 import LZMA, Delta import zarr from tests.test_cli.conftest import create_nested_zarr @@ -32,6 +31,8 @@ runner = typer_testing.CliRunner() +NUMCODECS_USER_WARNING = "Numcodecs codecs are not in the Zarr version 3 specification and may not be supported by other zarr implementations." 
+ def test_migrate_array(local_store: LocalStore) -> None: shape = (10, 10) @@ -283,28 +284,8 @@ def test_migrate_sub_group( ), (numcodecs.Zstd(level=3), ZstdCodec(level=3)), (numcodecs.GZip(level=3), GzipCodec(level=3)), - ( - numcodecs.LZMA( - format=lzma.FORMAT_RAW, - check=-1, - preset=None, - filters=[ - {"id": lzma.FILTER_DELTA, "dist": 4}, - {"id": lzma.FILTER_LZMA2, "preset": 1}, - ], - ), - LZMA( - format=lzma.FORMAT_RAW, - check=-1, - preset=None, - filters=[ - {"id": lzma.FILTER_DELTA, "dist": 4}, - {"id": lzma.FILTER_LZMA2, "preset": 1}, - ], - ), - ), ], - ids=["blosc", "zstd", "gzip", "numcodecs-compressor"], + ids=["blosc", "zstd", "gzip"], ) def test_migrate_compressor( local_store: LocalStore, compressor_v2: numcodecs.abc.Codec, compressor_v3: Codec @@ -334,9 +315,54 @@ def test_migrate_compressor( assert np.all(zarr_array[:] == 1) +@pytest.mark.filterwarnings(f"ignore:{NUMCODECS_USER_WARNING}:UserWarning") +def test_migrate_numcodecs_compressor(local_store: LocalStore) -> None: + """Test migration of a numcodecs compressor without a zarr.codecs equivalent.""" + + lzma_settings = { + "format": lzma.FORMAT_RAW, + "check": -1, + "preset": None, + "filters": [ + {"id": lzma.FILTER_DELTA, "dist": 4}, + {"id": lzma.FILTER_LZMA2, "preset": 1}, + ], + } + + zarr_array = zarr.create_array( + store=local_store, + shape=(10, 10), + chunks=(10, 10), + dtype="uint16", + compressors=numcodecs.LZMA.from_config(lzma_settings), + zarr_format=2, + fill_value=0, + ) + zarr_array[:] = 1 + + result = runner.invoke(cli.app, ["migrate", "v3", str(local_store.root)]) + assert result.exit_code == 0 + assert (local_store.root / "zarr.json").exists() + + zarr_array = zarr.open_array(local_store.root, zarr_format=3) + metadata = zarr_array.metadata + assert metadata.zarr_format == 3 + assert metadata.codecs == ( + BytesCodec(endian="little"), + numcodecs.zarr3.LZMA( + format=lzma_settings["format"], + check=lzma_settings["check"], + preset=lzma_settings["preset"], + filters=lzma_settings["filters"], + ), + ) + assert np.all(zarr_array[:] == 1) + + +@pytest.mark.filterwarnings(f"ignore:{NUMCODECS_USER_WARNING}:UserWarning") def test_migrate_filter(local_store: LocalStore) -> None: filter_v2 = numcodecs.Delta(dtype=" None: fill_value=0, ) - result = runner.invoke(cli.app, ["migrate", "v3", str(local_store.root)]) + with pytest.warns(UserWarning, match=NUMCODECS_USER_WARNING): + result = runner.invoke(cli.app, ["migrate", "v3", str(local_store.root)]) + assert result.exit_code == 1 assert isinstance(result.exception, TypeError) assert ( @@ -517,7 +545,9 @@ def test_migrate_incorrect_compressor(local_store: LocalStore) -> None: fill_value=0, ) - result = runner.invoke(cli.app, ["migrate", "v3", str(local_store.root)]) + with pytest.warns(UserWarning, match=NUMCODECS_USER_WARNING): + result = runner.invoke(cli.app, ["migrate", "v3", str(local_store.root)]) + assert result.exit_code == 1 assert isinstance(result.exception, TypeError) assert ( From 0449ef70e9aa042b34814167c643ec6023adf09a Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Thu, 7 Aug 2025 09:13:21 +0100 Subject: [PATCH 57/57] add a section about --verbose to the docs --- docs/user-guide/cli.rst | 37 +++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/docs/user-guide/cli.rst b/docs/user-guide/cli.rst index 90d06e2d83..822b60d389 100644 --- a/docs/user-guide/cli.rst +++ b/docs/user-guide/cli.rst @@ -70,18 +70,6 @@ Note there is also a shortcut to 
migrate and remove v2 metadata in one step:
 
    $ zarr migrate v3 path/to/input.zarr --remove-v2-metadata
 
 
-Dry run
---------
-All commands provide a ``--dry-run`` option that will log changes that would be made on a real run, without creating
-or modifying any files.
-
-.. code-block:: bash
-
-   $ zarr migrate v3 path/to/input.zarr --dry-run
-
-   Dry run enabled - no new files will be created or changed. Log of files that would be created on a real run:
-   Saving metadata to path/to/input.zarr/zarr.json
-
 Remove metadata
 ----------------
@@ -106,6 +94,31 @@ To override this behaviour use ``--force``:
 
    $ zarr remove-metadata v3 path/to/input.zarr --force
 
+
+Dry run
+--------
+All commands provide a ``--dry-run`` option that will log changes that would be made on a real run, without creating
+or modifying any files.
+
+.. code-block:: bash
+
+   $ zarr migrate v3 path/to/input.zarr --dry-run
+
+   Dry run enabled - no new files will be created or changed. Log of files that would be created on a real run:
+   Saving metadata to path/to/input.zarr/zarr.json
+
+
+Verbose
+--------
+You can also add ``--verbose`` **before** any command, to see a full log of its actions:
+
+.. code-block:: bash
+
+   $ zarr --verbose migrate v3 path/to/input.zarr
+
+   $ zarr --verbose remove-metadata v2 path/to/input.zarr
+
+
 Equivalent functions
 --------------------
 All features of the command-line interface are also available via functions under
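For completeness, the snippet below is a minimal sketch of how the programmatic equivalents of the CLI might be used together, based on the helpers introduced in this series (``zarr.set_log_level`` / ``zarr.set_format``) and the ``zarr.metadata.migrate_v3`` module. The exact type of the first argument to ``migrate_v2_to_v3`` (a ``Store`` object versus a plain path) is an assumption inferred from the CLI wrapper, which builds a store via ``make_store`` before delegating; treat this as a sketch under those assumptions, not the final API.

.. code-block:: python

   # Illustrative sketch only - argument types for migrate_v2_to_v3 are
   # inferred from the CLI wrapper in this patch series, not confirmed API.
   import zarr
   import zarr.metadata.migrate_v3 as migrate_metadata
   from zarr.storage import LocalStore

   # Mirror the CLI's --verbose flag: set the 'zarr' logger to INFO and
   # print bare messages, using the helpers added in this series.
   zarr.set_log_level("INFO")
   zarr.set_format("%(message)s")

   store = LocalStore("path/to/input.zarr")

   # Equivalent of `zarr migrate v3 path/to/input.zarr --dry-run`:
   # log the zarr.json files that would be written, without touching the store.
   migrate_metadata.migrate_v2_to_v3(store, dry_run=True)

   # Equivalent of `zarr migrate v3 path/to/input.zarr` (in-place migration),
   # then re-open the hierarchy using the freshly written v3 metadata.
   migrate_metadata.migrate_v2_to_v3(store)
   migrated = zarr.open("path/to/input.zarr", zarr_format=3)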