Skip to content

Commit 0406ea1

Browse files
committed
add vlen bytes
1 parent b90d8f3 commit 0406ea1

File tree

6 files changed

+92
-4
lines changed

6 files changed

+92
-4
lines changed

src/zarr/codecs/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from zarr.codecs.pipeline import BatchedCodecPipeline
88
from zarr.codecs.sharding import ShardingCodec, ShardingCodecIndexLocation
99
from zarr.codecs.transpose import TransposeCodec
10-
from zarr.codecs.vlen_utf8 import VLenUTF8Codec
10+
from zarr.codecs.vlen_utf8 import VLenBytesCodec, VLenUTF8Codec
1111
from zarr.codecs.zstd import ZstdCodec
1212

1313
__all__ = [
@@ -23,5 +23,6 @@
2323
"ShardingCodecIndexLocation",
2424
"TransposeCodec",
2525
"VLenUTF8Codec",
26+
"VLenBytesCodec",
2627
"ZstdCodec",
2728
]

src/zarr/codecs/vlen_utf8.py

Lines changed: 47 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from typing import TYPE_CHECKING
55

66
import numpy as np
7-
from numcodecs.vlen import VLenUTF8
7+
from numcodecs.vlen import VLenBytes, VLenUTF8
88

99
from zarr.abc.codec import ArrayBytesCodec
1010
from zarr.core.buffer import Buffer, NDBuffer
@@ -20,6 +20,7 @@
2020

2121
# module-level codec instances can be shared because these codecs take no parameters
2222
vlen_utf8_codec = VLenUTF8()
23+
vlen_bytes_codec = VLenBytes()
2324

2425

2526
@dataclass(frozen=True)
@@ -68,4 +69,49 @@ def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) -
6869
raise NotImplementedError("compute_encoded_size is not implemented for VLen codecs")
6970

7071

72+
@dataclass(frozen=True)
class VLenBytesCodec(ArrayBytesCodec):
    """Array-to-bytes codec registered under the name ``"vlen-bytes"``.

    The codec declares no fields of its own; encoding and decoding are
    delegated to the shared module-level ``vlen_bytes_codec`` instance.
    """

    @classmethod
    def from_dict(cls, data: dict[str, JSON]) -> Self:
        """Build the codec from its named-configuration dictionary.

        The ``configuration`` entry is optional; a missing or empty one is
        treated as an empty mapping.
        """
        _name, config = parse_named_configuration(data, "vlen-bytes", require_configuration=False)
        if not config:
            config = {}
        return cls(**config)

    def to_dict(self) -> dict[str, JSON]:
        """Return the named-configuration dictionary describing this codec."""
        serialized: dict[str, JSON] = {"name": "vlen-bytes", "configuration": {}}
        return serialized

    def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
        """Return ``self`` unchanged; the array spec is not consulted."""
        return self

    async def _decode_single(self, chunk_bytes: Buffer, chunk_spec: ArraySpec) -> NDBuffer:
        """Decode one encoded chunk into an object-dtype array buffer.

        The decoded array is given the chunk's shape in place before being
        wrapped in the prototype's nd-buffer type.
        """
        assert isinstance(chunk_bytes, Buffer)

        objects = vlen_bytes_codec.decode(chunk_bytes.as_array_like())
        assert objects.dtype == np.object_
        objects.shape = chunk_spec.shape
        nd_buffer_type = chunk_spec.prototype.nd_buffer
        return nd_buffer_type.from_numpy_array(objects)

    async def _encode_single(self, chunk_array: NDBuffer, chunk_spec: ArraySpec) -> Buffer | None:
        """Encode one chunk array and wrap the result in the prototype's buffer type."""
        assert isinstance(chunk_array, NDBuffer)
        wrap_bytes = chunk_spec.prototype.buffer.from_bytes
        return wrap_bytes(vlen_bytes_codec.encode(chunk_array.as_numpy_array()))

    def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) -> int:
        """Always raise: the encoded size is not computed for VLen codecs."""
        # Open question from the original author: what is input_byte_length
        # for an object dtype?
        raise NotImplementedError("compute_encoded_size is not implemented for VLen codecs")
114+
115+
71116
register_codec("vlen-utf8", VLenUTF8Codec)
117+
register_codec("vlen-bytes", VLenBytesCodec)

src/zarr/core/config.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ def reset(self) -> None:
5959
"sharding_indexed": "zarr.codecs.sharding.ShardingCodec",
6060
"transpose": "zarr.codecs.transpose.TransposeCodec",
6161
"vlen-utf8": "zarr.codecs.vlen_utf8.VLenUTF8Codec",
62+
"vlen-bytes": "zarr.codecs.vlen_utf8.VLenBytesCodec",
6263
},
6364
"buffer": "zarr.core.buffer.cpu.Buffer",
6465
"ndbuffer": "zarr.core.buffer.cpu.NDBuffer",

src/zarr/core/metadata/v3.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -386,6 +386,8 @@ def parse_fill_value(
386386
"""
387387
if fill_value is None:
388388
return dtype.type(0)
389+
if dtype.kind == "O":
390+
return fill_value
389391
if isinstance(fill_value, Sequence) and not isinstance(fill_value, str):
390392
if dtype.type in (np.complex64, np.complex128):
391393
dtype = cast(COMPLEX_DTYPE, dtype)
@@ -451,6 +453,7 @@ class DataType(Enum):
451453
complex64 = "complex64"
452454
complex128 = "complex128"
453455
string = "string"
456+
bytes = "bytes"
454457

455458
@property
456459
def byte_count(self) -> int:
@@ -499,13 +502,19 @@ def to_numpy_shortname(self) -> str:
499502
def to_numpy(self) -> np.dtype[Any]:
    """Return the NumPy dtype used for this data type.

    ``string`` maps to ``STRING_DTYPE`` and ``bytes`` to the object dtype;
    every other member is resolved through ``to_numpy_shortname()``.
    """
    if self == DataType.string:
        return STRING_DTYPE
    if self == DataType.bytes:
        # For now byte strings always use the object dtype.
        # TODO: consider whether we can use fixed-width types (e.g. '|S5') instead
        return np.dtype("O")
    shortname = self.to_numpy_shortname()
    return np.dtype(shortname)
504511

505512
@classmethod
506513
def from_numpy(cls, dtype: np.dtype[Any]) -> DataType:
507-
if np.issubdtype(np.str_, dtype):
514+
if dtype.kind in "UT":
508515
return DataType.string
516+
elif dtype.kind == "S":
517+
return DataType.bytes
509518
dtype_to_data_type = {
510519
"|b1": "bool",
511520
"bool": "bool",

tests/v3/test_codecs/test_vlen.py

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
from zarr import Array
77
from zarr.abc.store import Store
8-
from zarr.codecs import VLenUTF8Codec
8+
from zarr.codecs import VLenBytesCodec, VLenUTF8Codec
99
from zarr.core.metadata.v3 import ArrayV3Metadata, DataType
1010
from zarr.storage.common import StorePath
1111
from zarr.strings import NUMPY_SUPPORTS_VLEN_STRING
@@ -49,3 +49,33 @@ async def test_vlen_string(store: Store, dtype: None | np.dtype[Any]) -> None:
4949
assert np.array_equal(data, b[:, :])
5050
assert b.metadata.data_type == DataType.string
5151
assert a.dtype == expected_zarr_string_dtype
52+
53+
54+
@pytest.mark.parametrize("store", ["memory", "local"], indirect=["store"])
async def test_vlen_bytes(store: Store) -> None:
    """Write a 2x3 array of byte strings through ``VLenBytesCodec`` and read it back."""
    expected = np.array([b"hello", b"world", b"this", b"is", b"a", b"test"]).reshape((2, 3))
    assert expected.dtype == "|S5"

    store_path = StorePath(store, path="string")
    written = Array.create(
        store_path,
        shape=expected.shape,
        chunk_shape=expected.shape,
        dtype=expected.dtype,
        fill_value=b"",
        codecs=[VLenBytesCodec()],
    )
    assert isinstance(written.metadata, ArrayV3Metadata)  # needed for mypy

    written[:, :] = expected
    assert np.array_equal(expected, written[:, :])
    # TODO: assert on written.metadata.data_type and written.dtype. The
    # string-dtype assertions carried over from test_vlen_string were left
    # disabled here; presumably they need bytes-specific expectations - confirm.

    # Round trip: reopen the array from the same store path and compare again.
    reopened = Array.open(store_path)
    assert isinstance(reopened.metadata, ArrayV3Metadata)  # needed for mypy
    assert np.array_equal(expected, reopened[:, :])
    # TODO: same data_type / dtype assertions for the reopened array.

tests/v3/test_config.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ def test_config_defaults_set() -> None:
5959
"sharding_indexed": "zarr.codecs.sharding.ShardingCodec",
6060
"transpose": "zarr.codecs.transpose.TransposeCodec",
6161
"vlen-utf8": "zarr.codecs.vlen_utf8.VLenUTF8Codec",
62+
"vlen-bytes": "zarr.codecs.vlen_utf8.VLenBytesCodec",
6263
},
6364
}
6465
]

0 commit comments

Comments
 (0)