Skip to content

Commit 80b5a10

Browse files
committed
Merge branch 'feat/read-funcs' of github.com:d-v-b/zarr-python into feat/read-funcs
2 parents df35d13 + fb286a7 commit 80b5a10

File tree

16 files changed

+221
-100
lines changed

16 files changed

+221
-100
lines changed

src/zarr/core/array.py

Lines changed: 81 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,8 @@
8989
from zarr.core.metadata.v2 import (
9090
_default_compressor,
9191
_default_filters,
92+
parse_compressor,
93+
parse_filters,
9294
)
9395
from zarr.core.metadata.v3 import DataType, parse_node_type_array
9496
from zarr.core.sync import sync
@@ -164,7 +166,7 @@ async def get_array_metadata(
164166
)
165167
if zarr_json_bytes is not None and zarray_bytes is not None:
166168
# warn and favor v3
167-
msg = f"Both zarr.json (zarr v3) and .zarray (zarr v2) metadata objects exist at {store_path}."
169+
msg = f"Both zarr.json (Zarr v3) and .zarray (Zarr v2) metadata objects exist at {store_path}. Zarr v3 will be used."
168170
warnings.warn(msg, stacklevel=1)
169171
if zarr_json_bytes is None and zarray_bytes is None:
170172
raise FileNotFoundError(store_path)
@@ -667,8 +669,8 @@ async def _create_v2(
667669
config: ArrayConfig,
668670
dimension_separator: Literal[".", "/"] | None = None,
669671
fill_value: float | None = None,
670-
filters: list[dict[str, JSON]] | None = None,
671-
compressor: dict[str, JSON] | None = None,
672+
filters: Iterable[dict[str, JSON] | numcodecs.abc.Codec] | None = None,
673+
compressor: dict[str, JSON] | numcodecs.abc.Codec | None = None,
672674
attributes: dict[str, JSON] | None = None,
673675
overwrite: bool = False,
674676
) -> AsyncArray[ArrayV2Metadata]:
@@ -803,6 +805,7 @@ def shape(self) -> ChunkCoords:
803805
@property
804806
def chunks(self) -> ChunkCoords:
805807
"""Returns the chunk shape of the Array.
808+
If sharding is used the inner chunk shape is returned.
806809
807810
Only defined for arrays using `RegularChunkGrid`.
808811
If array doesn't use `RegularChunkGrid`, `NotImplementedError` is raised.
@@ -812,14 +815,22 @@ def chunks(self) -> ChunkCoords:
812815
ChunkCoords:
813816
The chunk shape of the Array.
814817
"""
815-
if isinstance(self.metadata.chunk_grid, RegularChunkGrid):
816-
return self.metadata.chunk_grid.chunk_shape
818+
return self.metadata.chunks
817819

818-
msg = (
819-
f"The `chunks` attribute is only defined for arrays using `RegularChunkGrid`."
820-
f"This array has a {self.metadata.chunk_grid} instead."
821-
)
822-
raise NotImplementedError(msg)
820+
@property
821+
def shards(self) -> ChunkCoords | None:
822+
"""Returns the shard shape of the Array.
823+
Returns None if sharding is not used.
824+
825+
Only defined for arrays using `RegularChunkGrid`.
826+
If array doesn't use `RegularChunkGrid`, `NotImplementedError` is raised.
827+
828+
Returns
829+
-------
830+
ChunkCoords:
831+
The shard shape of the Array.
832+
"""
833+
return self.metadata.shards
823834

824835
@property
825836
def size(self) -> int:
@@ -1733,6 +1744,10 @@ def shape(self, value: ChunkCoords) -> None:
17331744
@property
17341745
def chunks(self) -> ChunkCoords:
17351746
"""Returns a tuple of integers describing the length of each dimension of a chunk of the array.
1747+
If sharding is used the inner chunk shape is returned.
1748+
1749+
Only defined for arrays using `RegularChunkGrid`.
1750+
If array doesn't use `RegularChunkGrid`, `NotImplementedError` is raised.
17361751
17371752
Returns
17381753
-------
@@ -1741,6 +1756,21 @@ def chunks(self) -> ChunkCoords:
17411756
"""
17421757
return self._async_array.chunks
17431758

1759+
@property
1760+
def shards(self) -> ChunkCoords | None:
1761+
"""Returns a tuple of integers describing the length of each dimension of a shard of the array.
1762+
Returns None if sharding is not used.
1763+
1764+
Only defined for arrays using `RegularChunkGrid`.
1765+
If array doesn't use `RegularChunkGrid`, `NotImplementedError` is raised.
1766+
1767+
Returns
1768+
-------
1769+
tuple | None
1770+
A tuple of integers representing the length of each dimension of a shard or None if sharding is not used.
1771+
"""
1772+
return self._async_array.shards
1773+
17441774
@property
17451775
def size(self) -> int:
17461776
"""Returns the total number of elements in the array.
@@ -3464,7 +3494,7 @@ def _get_default_codecs(
34643494
else:
34653495
dtype_key = "numeric"
34663496

3467-
return default_codecs[dtype_key]
3497+
return cast(list[dict[str, JSON]], default_codecs[dtype_key])
34683498

34693499

34703500
FiltersParam: TypeAlias = (
@@ -3473,7 +3503,7 @@ def _get_default_codecs(
34733503
| Iterable[numcodecs.abc.Codec]
34743504
| Literal["auto"]
34753505
)
3476-
CompressionParam: TypeAlias = (
3506+
CompressorsParam: TypeAlias = (
34773507
Iterable[dict[str, JSON] | BytesBytesCodec]
34783508
| BytesBytesCodec
34793509
| numcodecs.abc.Codec
@@ -3490,7 +3520,7 @@ async def create_array(
34903520
chunks: ChunkCoords | Literal["auto"] = "auto",
34913521
shards: ChunkCoords | Literal["auto"] | None = None,
34923522
filters: FiltersParam = "auto",
3493-
compressors: CompressionParam = "auto",
3523+
compressors: CompressorsParam = "auto",
34943524
fill_value: Any | None = 0,
34953525
order: MemoryOrder | None = None,
34963526
zarr_format: ZarrFormat | None = 3,
@@ -3522,16 +3552,16 @@ async def create_array(
35223552
filters : Iterable[Codec], optional
35233553
Iterable of filters to apply to each chunk of the array, in order, before serializing that
35243554
chunk to bytes.
3525-
For Zarr v3, a "filter" is a transformation that takes an array and returns an array,
3555+
For Zarr v3, a "filter" is a codec that takes an array and returns an array,
35263556
and these values must be instances of ``ArrayArrayCodec``, or dict representations
35273557
of ``ArrayArrayCodec``.
35283558
For Zarr v2, a "filter" can be any numcodecs codec; you should ensure that the
35293559
order of your filters is consistent with the behavior of each filter.
35303560
compressors : Iterable[Codec], optional
35313561
List of compressors to apply to the array. Compressors are applied in order, and after any
35323562
filters are applied (if any are specified).
3533-
For Zarr v3, a "compressor" is a transformation that takes a string of bytes and
3534-
returns another string of bytes.
3563+
For Zarr v3, a "compressor" is a codec that takes a bytestream and
3564+
returns another bytestream.
35353565
For Zarr v2, a "compressor" can be any numcodecs codec.
35363566
fill_value : Any, optional
35373567
Fill value for the array.
@@ -3589,13 +3619,8 @@ async def create_array(
35893619
)
35903620

35913621
raise ValueError(msg)
3592-
if filters != "auto" and not all(isinstance(f, numcodecs.abc.Codec) for f in filters):
3593-
raise TypeError(
3594-
"For Zarr v2 arrays, all elements of `filters` must be numcodecs codecs."
3595-
)
3596-
filters = cast(Iterable[numcodecs.abc.Codec] | Literal["auto"], filters)
35973622
filters_parsed, compressor_parsed = _parse_chunk_encoding_v2(
3598-
compression=compressors, filters=filters, dtype=dtype_parsed
3623+
compressor=compressors, filters=filters, dtype=dtype_parsed
35993624
)
36003625
if dimension_names is not None:
36013626
raise ValueError("Zarr v2 arrays do not support dimension names.")
@@ -3622,7 +3647,7 @@ async def create_array(
36223647
array_array, array_bytes, bytes_bytes = _parse_chunk_encoding_v3(
36233648
compressors=compressors, filters=filters, dtype=dtype_parsed
36243649
)
3625-
sub_codecs = (*array_array, array_bytes, *bytes_bytes)
3650+
sub_codecs = cast(tuple[Codec, ...], (*array_array, array_bytes, *bytes_bytes))
36263651
codecs_out: tuple[Codec, ...]
36273652
if shard_shape_parsed is not None:
36283653
sharding_codec = ShardingCodec(chunk_shape=chunk_shape_parsed, codecs=sub_codecs)
@@ -3666,7 +3691,7 @@ def _parse_chunk_key_encoding(
36663691
"""
36673692
if data is None:
36683693
if zarr_format == 2:
3669-
result = ChunkKeyEncoding.from_dict({"name": "v2", "separator": "/"})
3694+
result = ChunkKeyEncoding.from_dict({"name": "v2", "separator": "."})
36703695
else:
36713696
result = ChunkKeyEncoding.from_dict({"name": "default", "separator": "/"})
36723697
elif isinstance(data, ChunkKeyEncoding):
@@ -3726,61 +3751,77 @@ def _get_default_encoding_v3(
37263751

37273752
def _get_default_chunk_encoding_v2(
37283753
dtype: np.dtype[Any],
3729-
) -> tuple[tuple[numcodecs.abc.Codec, ...], numcodecs.abc.Codec | None]:
3754+
) -> tuple[tuple[numcodecs.abc.Codec, ...] | None, numcodecs.abc.Codec | None]:
37303755
"""
37313756
Get the default chunk encoding for zarr v2 arrays, given a dtype
37323757
"""
37333758

37343759
compressor_dict = _default_compressor(dtype)
37353760
filter_dicts = _default_filters(dtype)
37363761

3737-
compressor = numcodecs.get_codec(compressor_dict)
3738-
filters = tuple(numcodecs.get_codec(f) for f in filter_dicts)
3762+
compressor = None
3763+
if compressor_dict is not None:
3764+
compressor = numcodecs.get_codec(compressor_dict)
3765+
3766+
filters = None
3767+
if filter_dicts is not None:
3768+
filters = tuple(numcodecs.get_codec(f) for f in filter_dicts)
3769+
37393770
return filters, compressor
37403771

37413772

37423773
def _parse_chunk_encoding_v2(
37433774
*,
3744-
compression: numcodecs.abc.Codec | Literal["auto"],
3745-
filters: tuple[numcodecs.abc.Codec, ...] | Literal["auto"],
3775+
compressor: CompressorsParam,
3776+
filters: FiltersParam,
37463777
dtype: np.dtype[Any],
3747-
) -> tuple[tuple[numcodecs.abc.Codec, ...], numcodecs.abc.Codec]:
3778+
) -> tuple[tuple[numcodecs.abc.Codec, ...] | None, numcodecs.abc.Codec | None]:
37483779
"""
37493780
Generate chunk encoding classes for v2 arrays with optional defaults.
37503781
"""
37513782
default_filters, default_compressor = _get_default_chunk_encoding_v2(dtype)
3752-
_filters: tuple[numcodecs.abc.Codec, ...] = ()
3753-
if compression == "auto":
3783+
3784+
_filters: tuple[numcodecs.abc.Codec, ...] | None = None
3785+
_compressor: numcodecs.abc.Codec | None = None
3786+
3787+
if compressor == "auto":
37543788
_compressor = default_compressor
37553789
else:
3756-
_compressor = compression
3790+
if isinstance(compressor, Iterable):
3791+
raise TypeError("For Zarr v2 arrays, the `compressor` must be a single codec.")
3792+
_compressor = parse_compressor(compressor)
37573793
if filters == "auto":
37583794
_filters = default_filters
37593795
else:
3760-
_filters = filters
3796+
if not all(isinstance(f, numcodecs.abc.Codec) for f in filters):
3797+
raise TypeError(
3798+
"For Zarr v2 arrays, all elements of `filters` must be numcodecs codecs."
3799+
)
3800+
_filters = parse_filters(filters)
3801+
37613802
return _filters, _compressor
37623803

37633804

37643805
def _parse_chunk_encoding_v3(
37653806
*,
3766-
compressors: Iterable[BytesBytesCodec | dict[str, JSON]] | Literal["auto"],
3767-
filters: Iterable[ArrayArrayCodec | dict[str, JSON]] | Literal["auto"],
3807+
compressors: CompressorsParam,
3808+
filters: FiltersParam,
37683809
dtype: np.dtype[Any],
37693810
) -> tuple[tuple[ArrayArrayCodec, ...], ArrayBytesCodec, tuple[BytesBytesCodec, ...]]:
37703811
"""
37713812
Generate chunk encoding classes for v3 arrays with optional defaults.
37723813
"""
37733814
default_array_array, default_array_bytes, default_bytes_bytes = _get_default_encoding_v3(dtype)
3774-
maybe_bytes_bytes: Iterable[BytesBytesCodec | dict[str, JSON]]
3775-
maybe_array_array: Iterable[ArrayArrayCodec | dict[str, JSON]]
3815+
maybe_bytes_bytes: Iterable[Codec | dict[str, JSON]]
3816+
maybe_array_array: Iterable[Codec | dict[str, JSON]]
37763817

37773818
if compressors == "auto":
37783819
out_bytes_bytes = default_bytes_bytes
37793820
else:
37803821
if isinstance(compressors, dict | Codec):
37813822
maybe_bytes_bytes = (compressors,)
37823823
else:
3783-
maybe_bytes_bytes = compressors
3824+
maybe_bytes_bytes = cast(Iterable[Codec | dict[str, JSON]], compressors)
37843825

37853826
out_bytes_bytes = tuple(_parse_bytes_bytes_codec(c) for c in maybe_bytes_bytes)
37863827

@@ -3790,7 +3831,7 @@ def _parse_chunk_encoding_v3(
37903831
if isinstance(filters, dict | Codec):
37913832
maybe_array_array = (filters,)
37923833
else:
3793-
maybe_array_array = filters
3834+
maybe_array_array = cast(Iterable[Codec | dict[str, JSON]], filters)
37943835
out_array_array = tuple(_parse_array_array_codec(c) for c in maybe_array_array)
37953836

37963837
return out_array_array, default_array_bytes, out_bytes_bytes

src/zarr/core/buffer/core.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,6 @@
1616
import numpy as np
1717
import numpy.typing as npt
1818

19-
from zarr.registry import (
20-
get_buffer_class,
21-
get_ndbuffer_class,
22-
)
23-
2419
if TYPE_CHECKING:
2520
from collections.abc import Iterable, Sequence
2621
from typing import Self
@@ -507,4 +502,9 @@ class BufferPrototype(NamedTuple):
507502

508503
# The default buffer prototype used throughout the Zarr codebase.
509504
def default_buffer_prototype() -> BufferPrototype:
505+
from zarr.registry import (
506+
get_buffer_class,
507+
get_ndbuffer_class,
508+
)
509+
510510
return BufferPrototype(buffer=get_buffer_class(), nd_buffer=get_ndbuffer_class())

src/zarr/core/chunk_key_encodings.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,10 +36,16 @@ def __init__(self, *, separator: SeparatorLiteral) -> None:
3636
object.__setattr__(self, "separator", separator_parsed)
3737

3838
@classmethod
39-
def from_dict(cls, data: dict[str, JSON] | ChunkKeyEncoding) -> ChunkKeyEncoding:
39+
def from_dict(
40+
cls, data: dict[str, JSON] | ChunkKeyEncoding | ChunkKeyEncodingParams
41+
) -> ChunkKeyEncoding:
4042
if isinstance(data, ChunkKeyEncoding):
4143
return data
4244

45+
# handle ChunkKeyEncodingParams
46+
if "name" in data and "separator" in data:
47+
data = {"name": data["name"], "configuration": {"separator": data["separator"]}}
48+
4349
# configuration is optional for chunk key encodings
4450
name_parsed, config_parsed = parse_named_configuration(data, require_configuration=False)
4551
if name_parsed == "default":

src/zarr/core/config.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -67,27 +67,27 @@ def reset(self) -> None:
6767
"order": "C",
6868
"write_empty_chunks": False,
6969
"v2_default_compressor": {
70-
"numeric": {"id": "zstd", "level": 0, "checksum": True},
71-
"string": {"id": "zstd", "level": 0, "checksum": True},
72-
"bytes": {"id": "zstd", "level": 0, "checksum": True},
70+
"numeric": {"id": "zstd", "level": 0, "checksum": False},
71+
"string": {"id": "zstd", "level": 0, "checksum": False},
72+
"bytes": {"id": "zstd", "level": 0, "checksum": False},
7373
},
7474
"v2_default_filters": {
75-
"numeric": [],
75+
"numeric": None,
7676
"string": [{"id": "vlen-utf8"}],
7777
"bytes": [{"id": "vlen-bytes"}],
7878
},
7979
"v3_default_codecs": {
8080
"numeric": [
8181
{"name": "bytes", "configuration": {"endian": "little"}},
82-
{"name": "zstd", "configuration": {"level": 0, "checksum": True}},
82+
{"name": "zstd", "configuration": {"level": 0, "checksum": False}},
8383
],
8484
"string": [
8585
{"name": "vlen-utf8"},
86-
{"name": "zstd", "configuration": {"level": 0, "checksum": True}},
86+
{"name": "zstd", "configuration": {"level": 0, "checksum": False}},
8787
],
8888
"bytes": [
8989
{"name": "vlen-bytes"},
90-
{"name": "zstd", "configuration": {"level": 0, "checksum": True}},
90+
{"name": "zstd", "configuration": {"level": 0, "checksum": False}},
9191
],
9292
},
9393
},

0 commit comments

Comments
 (0)