Skip to content

Commit f44601b

Browse files
authored
Merge branch 'v3' into doc/storage
2 parents f2137fb + 9bce890 commit f44601b

31 files changed

+831
-219
lines changed

.pre-commit-config.yaml

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,7 @@ default_language_version:
77
python: python3
88
repos:
99
- repo: https://github.com/astral-sh/ruff-pre-commit
10-
rev: v0.6.8
10+
rev: v0.6.9
1111
hooks:
1212
- id: ruff
1313
args: ["--fix", "--show-fixes"]
@@ -18,7 +18,7 @@ repos:
1818
- id: codespell
1919
args: ["-L", "ba,ihs,kake,nd,noe,nwo,te,fo,zar", "-S", "fixture"]
2020
- repo: https://github.com/pre-commit/pre-commit-hooks
21-
rev: v4.6.0
21+
rev: v5.0.0
2222
hooks:
2323
- id: check-yaml
2424
- repo: https://github.com/pre-commit/mirrors-mypy
@@ -49,3 +49,7 @@ repos:
4949
hooks:
5050
- id: rst-directive-colons
5151
- id: rst-inline-touching-normal
52+
- repo: https://github.com/numpy/numpydoc
53+
rev: v1.8.0
54+
hooks:
55+
- id: numpydoc-validation

pyproject.toml

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -319,3 +319,7 @@ ignore = [
319319
"PC111", # fix Python code in documentation - enable later
320320
"PC180", # for JavaScript - not interested
321321
]
322+
323+
[tool.numpydoc_validation]
324+
# See https://numpydoc.readthedocs.io/en/latest/validation.html#built-in-validation-checks for list of checks
325+
checks = ["GL06", "GL07", "GL10", "PR03", "PR05", "PR06"]

src/zarr/abc/codec.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -20,11 +20,11 @@
2020
from zarr.core.indexing import SelectorTuple
2121

2222
__all__ = [
23-
"BaseCodec",
2423
"ArrayArrayCodec",
2524
"ArrayBytesCodec",
2625
"ArrayBytesCodecPartialDecodeMixin",
2726
"ArrayBytesCodecPartialEncodeMixin",
27+
"BaseCodec",
2828
"BytesBytesCodec",
2929
"CodecInput",
3030
"CodecOutput",

src/zarr/abc/metadata.py

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -22,7 +22,6 @@ def to_dict(self) -> dict[str, JSON]:
2222
are instances of `Metadata`. Sequences of `Metadata` are similarly recursed into, and
2323
the output of that recursion is collected in a list.
2424
"""
25-
...
2625
out_dict = {}
2726
for field in fields(self):
2827
key = field.name

src/zarr/abc/store.py

Lines changed: 5 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -74,7 +74,7 @@ class Store(ABC):
7474
_mode: AccessMode
7575
_is_open: bool
7676

77-
def __init__(self, mode: AccessModeLiteral = "r", *args: Any, **kwargs: Any) -> None:
77+
def __init__(self, *args: Any, mode: AccessModeLiteral = "r", **kwargs: Any) -> None:
7878
self._is_open = False
7979
self._mode = AccessMode.from_literal(mode)
8080

@@ -129,13 +129,10 @@ async def _open(self) -> None:
129129
"""
130130
if self._is_open:
131131
raise ValueError("store is already open")
132-
if not await self.empty():
133-
if self.mode.update or self.mode.readonly:
134-
pass
135-
elif self.mode.overwrite:
136-
await self.clear()
137-
else:
138-
raise FileExistsError("Store already exists")
132+
if self.mode.str == "w":
133+
await self.clear()
134+
elif self.mode.str == "w-" and not await self.empty():
135+
raise FileExistsError("Store already exists")
139136
self._is_open = True
140137

141138
async def _ensure_open(self) -> None:

src/zarr/api/asynchronous.py

Lines changed: 15 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -159,7 +159,7 @@ async def load(
159159
160160
Parameters
161161
----------
162-
store : Store or string
162+
store : Store or str
163163
Store or path to directory in file system or name of zip file.
164164
path : str or None, optional
165165
The path within the store from which to load.
@@ -203,7 +203,7 @@ async def open(
203203
204204
Parameters
205205
----------
206-
store : Store or string, optional
206+
store : Store or str, optional
207207
Store or path to directory in file system or name of zip file.
208208
mode : {'r', 'r+', 'a', 'w', 'w-'}, optional
209209
Persistence mode: 'r' means read only (must exist); 'r+' means
@@ -267,7 +267,7 @@ async def save(
267267
268268
Parameters
269269
----------
270-
store : Store or string
270+
store : Store or str
271271
Store or path to directory in file system or name of zip file.
272272
args : ndarray
273273
NumPy arrays with data to save.
@@ -303,7 +303,7 @@ async def save_array(
303303
304304
Parameters
305305
----------
306-
store : Store or string
306+
store : Store or str
307307
Store or path to directory in file system or name of zip file.
308308
arr : ndarray
309309
NumPy array with data to save.
@@ -351,7 +351,7 @@ async def save_group(
351351
352352
Parameters
353353
----------
354-
store : Store or string
354+
store : Store or str
355355
Store or path to directory in file system or name of zip file.
356356
args : ndarray
357357
NumPy arrays with data to save.
@@ -467,7 +467,7 @@ async def group(
467467
468468
Parameters
469469
----------
470-
store : Store or string, optional
470+
store : Store or str, optional
471471
Store or path to directory in file system.
472472
overwrite : bool, optional
473473
If True, delete any pre-existing data in `store` at `path` before
@@ -481,7 +481,7 @@ async def group(
481481
to all attribute read operations.
482482
synchronizer : object, optional
483483
Array synchronizer.
484-
path : string, optional
484+
path : str, optional
485485
Group path within store.
486486
meta_array : array-like, optional
487487
An array instance to use for determining arrays to create and return
@@ -547,7 +547,7 @@ async def open_group(
547547
548548
Parameters
549549
----------
550-
store : Store, string, or mapping, optional
550+
store : Store, str, or mapping, optional
551551
Store or path to directory in file system or name of zip file.
552552
553553
Strings are interpreted as paths on the local file system
@@ -570,9 +570,9 @@ async def open_group(
570570
to all attribute read operations.
571571
synchronizer : object, optional
572572
Array synchronizer.
573-
path : string, optional
573+
path : str, optional
574574
Group path within store.
575-
chunk_store : Store or string, optional
575+
chunk_store : Store or str, optional
576576
Store or path to directory in file system or name of zip file.
577577
storage_options : dict
578578
If using an fsspec URL to create the store, these will be passed to
@@ -664,22 +664,22 @@ async def create(
664664
False, will be set to `shape`, i.e., single chunk for the whole array.
665665
If an int, the chunk size in each dimension will be given by the value
666666
of `chunks`. Default is True.
667-
dtype : string or dtype, optional
667+
dtype : str or dtype, optional
668668
NumPy dtype.
669669
compressor : Codec, optional
670670
Primary compressor.
671671
fill_value : object
672672
Default value to use for uninitialized portions of the array.
673673
order : {'C', 'F'}, optional
674674
Memory layout to be used within each chunk.
675-
store : Store or string
675+
store : Store or str
676676
Store or path to directory in file system or name of zip file.
677677
synchronizer : object, optional
678678
Array synchronizer.
679679
overwrite : bool, optional
680680
If True, delete all pre-existing data in `store` at `path` before
681681
creating the array.
682-
path : string, optional
682+
path : str, optional
683683
Path under which array is stored.
684684
chunk_store : MutableMapping, optional
685685
Separate storage for chunks. If not provided, `store` will be used
@@ -937,11 +937,11 @@ async def open_array(
937937
938938
Parameters
939939
----------
940-
store : Store or string
940+
store : Store or str
941941
Store or path to directory in file system or name of zip file.
942942
zarr_format : {2, 3, None}, optional
943943
The zarr format to use when saving.
944-
path : string, optional
944+
path : str, optional
945945
Path in store to array.
946946
storage_options : dict
947947
If using an fsspec URL to create the store, these will be passed to

src/zarr/codecs/__init__.py

Lines changed: 21 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,13 +1,20 @@
11
from __future__ import annotations
22

3+
from typing import TYPE_CHECKING, Any
4+
5+
if TYPE_CHECKING:
6+
import numpy as np
7+
38
from zarr.codecs.blosc import BloscCname, BloscCodec, BloscShuffle
49
from zarr.codecs.bytes import BytesCodec, Endian
510
from zarr.codecs.crc32c_ import Crc32cCodec
611
from zarr.codecs.gzip import GzipCodec
712
from zarr.codecs.pipeline import BatchedCodecPipeline
813
from zarr.codecs.sharding import ShardingCodec, ShardingCodecIndexLocation
914
from zarr.codecs.transpose import TransposeCodec
15+
from zarr.codecs.vlen_utf8 import VLenBytesCodec, VLenUTF8Codec
1016
from zarr.codecs.zstd import ZstdCodec
17+
from zarr.core.metadata.v3 import DataType
1118

1219
__all__ = [
1320
"BatchedCodecPipeline",
@@ -21,5 +28,19 @@
2128
"ShardingCodec",
2229
"ShardingCodecIndexLocation",
2330
"TransposeCodec",
31+
"VLenBytesCodec",
32+
"VLenUTF8Codec",
2433
"ZstdCodec",
2534
]
35+
36+
37+
def _get_default_array_bytes_codec(
38+
np_dtype: np.dtype[Any],
39+
) -> BytesCodec | VLenUTF8Codec | VLenBytesCodec:
40+
dtype = DataType.from_numpy(np_dtype)
41+
if dtype == DataType.string:
42+
return VLenUTF8Codec()
43+
elif dtype == DataType.bytes:
44+
return VLenBytesCodec()
45+
else:
46+
return BytesCodec()

src/zarr/codecs/vlen_utf8.py

Lines changed: 117 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,117 @@
1+
from __future__ import annotations
2+
3+
from dataclasses import dataclass
4+
from typing import TYPE_CHECKING
5+
6+
import numpy as np
7+
from numcodecs.vlen import VLenBytes, VLenUTF8
8+
9+
from zarr.abc.codec import ArrayBytesCodec
10+
from zarr.core.buffer import Buffer, NDBuffer
11+
from zarr.core.common import JSON, parse_named_configuration
12+
from zarr.core.strings import cast_to_string_dtype
13+
from zarr.registry import register_codec
14+
15+
if TYPE_CHECKING:
16+
from typing import Self
17+
18+
from zarr.core.array_spec import ArraySpec
19+
20+
21+
# can use a global because there are no parameters
22+
_vlen_utf8_codec = VLenUTF8()
23+
_vlen_bytes_codec = VLenBytes()
24+
25+
26+
@dataclass(frozen=True)
27+
class VLenUTF8Codec(ArrayBytesCodec):
28+
@classmethod
29+
def from_dict(cls, data: dict[str, JSON]) -> Self:
30+
_, configuration_parsed = parse_named_configuration(
31+
data, "vlen-utf8", require_configuration=False
32+
)
33+
configuration_parsed = configuration_parsed or {}
34+
return cls(**configuration_parsed)
35+
36+
def to_dict(self) -> dict[str, JSON]:
37+
return {"name": "vlen-utf8", "configuration": {}}
38+
39+
def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
40+
return self
41+
42+
async def _decode_single(
43+
self,
44+
chunk_bytes: Buffer,
45+
chunk_spec: ArraySpec,
46+
) -> NDBuffer:
47+
assert isinstance(chunk_bytes, Buffer)
48+
49+
raw_bytes = chunk_bytes.as_array_like()
50+
decoded = _vlen_utf8_codec.decode(raw_bytes)
51+
assert decoded.dtype == np.object_
52+
decoded.shape = chunk_spec.shape
53+
# coming out of the code, we know this is safe, so don't issue a warning
54+
as_string_dtype = cast_to_string_dtype(decoded, safe=True)
55+
return chunk_spec.prototype.nd_buffer.from_numpy_array(as_string_dtype)
56+
57+
async def _encode_single(
58+
self,
59+
chunk_array: NDBuffer,
60+
chunk_spec: ArraySpec,
61+
) -> Buffer | None:
62+
assert isinstance(chunk_array, NDBuffer)
63+
return chunk_spec.prototype.buffer.from_bytes(
64+
_vlen_utf8_codec.encode(chunk_array.as_numpy_array())
65+
)
66+
67+
def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) -> int:
68+
# what is input_byte_length for an object dtype?
69+
raise NotImplementedError("compute_encoded_size is not implemented for VLen codecs")
70+
71+
72+
@dataclass(frozen=True)
73+
class VLenBytesCodec(ArrayBytesCodec):
74+
@classmethod
75+
def from_dict(cls, data: dict[str, JSON]) -> Self:
76+
_, configuration_parsed = parse_named_configuration(
77+
data, "vlen-bytes", require_configuration=False
78+
)
79+
configuration_parsed = configuration_parsed or {}
80+
return cls(**configuration_parsed)
81+
82+
def to_dict(self) -> dict[str, JSON]:
83+
return {"name": "vlen-bytes", "configuration": {}}
84+
85+
def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
86+
return self
87+
88+
async def _decode_single(
89+
self,
90+
chunk_bytes: Buffer,
91+
chunk_spec: ArraySpec,
92+
) -> NDBuffer:
93+
assert isinstance(chunk_bytes, Buffer)
94+
95+
raw_bytes = chunk_bytes.as_array_like()
96+
decoded = _vlen_bytes_codec.decode(raw_bytes)
97+
assert decoded.dtype == np.object_
98+
decoded.shape = chunk_spec.shape
99+
return chunk_spec.prototype.nd_buffer.from_numpy_array(decoded)
100+
101+
async def _encode_single(
102+
self,
103+
chunk_array: NDBuffer,
104+
chunk_spec: ArraySpec,
105+
) -> Buffer | None:
106+
assert isinstance(chunk_array, NDBuffer)
107+
return chunk_spec.prototype.buffer.from_bytes(
108+
_vlen_bytes_codec.encode(chunk_array.as_numpy_array())
109+
)
110+
111+
def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) -> int:
112+
# what is input_byte_length for an object dtype?
113+
raise NotImplementedError("compute_encoded_size is not implemented for VLen codecs")
114+
115+
116+
register_codec("vlen-utf8", VLenUTF8Codec)
117+
register_codec("vlen-bytes", VLenBytesCodec)

0 commit comments

Comments (0)