|
4 | 4 | from typing import TYPE_CHECKING |
5 | 5 |
|
6 | 6 | import numpy as np |
7 | | -from numcodecs.vlen import VLenUTF8 |
| 7 | +from numcodecs.vlen import VLenBytes, VLenUTF8 |
8 | 8 |
|
9 | 9 | from zarr.abc.codec import ArrayBytesCodec |
10 | 10 | from zarr.core.buffer import Buffer, NDBuffer |
|
20 | 20 |
|
# Neither numcodecs codec takes any parameters, so a single shared
# module-level instance of each one is sufficient.
vlen_utf8_codec = VLenUTF8()
vlen_bytes_codec = VLenBytes()
23 | 24 |
|
24 | 25 |
|
25 | 26 | @dataclass(frozen=True) |
@@ -68,4 +69,49 @@ def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) - |
68 | 69 | raise NotImplementedError("compute_encoded_size is not implemented for VLen codecs") |
69 | 70 |
|
70 | 71 |
|
@dataclass(frozen=True)
class VLenBytesCodec(ArrayBytesCodec):
    """Array-to-bytes codec registered under the name ``"vlen-bytes"``.

    The codec declares no fields; all encoding and decoding is delegated to
    the shared module-level ``vlen_bytes_codec`` (a numcodecs ``VLenBytes``
    instance).
    """

    @classmethod
    def from_dict(cls, data: dict[str, JSON]) -> Self:
        """Build the codec from its ``"vlen-bytes"`` metadata dictionary.

        The configuration entry is optional; a missing or empty one is
        treated as an empty mapping before being passed to the constructor.
        """
        _name, config = parse_named_configuration(
            data, "vlen-bytes", require_configuration=False
        )
        if not config:
            config = {}
        return cls(**config)

    def to_dict(self) -> dict[str, JSON]:
        """Return the metadata dictionary for this codec (empty configuration)."""
        metadata = {"name": "vlen-bytes", "configuration": {}}
        return metadata

    def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
        """Return this codec unchanged; ``array_spec`` is not consulted."""
        return self

    async def _decode_single(
        self,
        chunk_bytes: Buffer,
        chunk_spec: ArraySpec,
    ) -> NDBuffer:
        """Decode one chunk into an object-dtype array shaped like the chunk.

        ``chunk_bytes`` is decoded by ``vlen_bytes_codec``; the result is
        reshaped in place to ``chunk_spec.shape`` and wrapped in the
        prototype's ND buffer type.
        """
        assert isinstance(chunk_bytes, Buffer)

        values = vlen_bytes_codec.decode(chunk_bytes.as_array_like())
        assert values.dtype == np.object_
        # Assign the shape in place rather than calling reshape(), so the
        # decoded array is reinterpreted without creating a new array object.
        values.shape = chunk_spec.shape
        nd_buffer_type = chunk_spec.prototype.nd_buffer
        return nd_buffer_type.from_numpy_array(values)

    async def _encode_single(
        self,
        chunk_array: NDBuffer,
        chunk_spec: ArraySpec,
    ) -> Buffer | None:
        """Encode one chunk array into a buffer of the prototype's buffer type."""
        assert isinstance(chunk_array, NDBuffer)
        wrap_bytes = chunk_spec.prototype.buffer.from_bytes
        encoded = vlen_bytes_codec.encode(chunk_array.as_numpy_array())
        return wrap_bytes(encoded)

    def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) -> int:
        """Always raise: the encoded size is not implemented for VLen codecs."""
        # what is input_byte_length for an object dtype?
        raise NotImplementedError("compute_encoded_size is not implemented for VLen codecs")
| 114 | + |
| 115 | + |
# Register each variable-length codec class under its string name.
register_codec("vlen-utf8", VLenUTF8Codec)
register_codec("vlen-bytes", VLenBytesCodec)
0 commit comments