-
-
Notifications
You must be signed in to change notification settings - Fork 364
Add string and bytes dtypes plus vlen-utf8 and vlen-bytes codecs #2036
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 21 commits
c05b9d1
c86ddc6
a322124
2a1e2e3
1d3d7a5
cd40b08
988f9df
507161a
1ae5e63
94ecdb5
2c7d638
b1717d8
79b7d43
a5c2a37
717f0c7
b90d8f3
0406ea1
8e61a18
6cf7dde
28d58fa
c6de878
4f026db
e427c7a
7d9d897
0c21994
c12ac41
3aeea1e
cae7055
1aeb49a
6714bad
2edf3b8
12a0d65
7ba7077
1e828b4
ba0f093
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
from __future__ import annotations | ||
|
||
from dataclasses import dataclass | ||
from typing import TYPE_CHECKING | ||
|
||
import numpy as np | ||
from numcodecs.vlen import VLenBytes, VLenUTF8 | ||
|
||
from zarr.abc.codec import ArrayBytesCodec | ||
from zarr.core.buffer import Buffer, NDBuffer | ||
from zarr.core.common import JSON, parse_named_configuration | ||
from zarr.registry import register_codec | ||
from zarr.strings import cast_to_string_dtype | ||
|
||
if TYPE_CHECKING: | ||
from typing import Self | ||
|
||
from zarr.core.array_spec import ArraySpec | ||
|
||
|
||
# The numcodecs vlen codecs take no parameters, so a single shared
# module-level instance of each is safe to reuse across all chunks.
vlen_utf8_codec = VLenUTF8()
vlen_bytes_codec = VLenBytes()
rabernat marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
||
|
||
@dataclass(frozen=True)
class VLenUTF8Codec(ArrayBytesCodec):
    """Array-to-bytes codec for variable-length UTF-8 strings.

    Wraps numcodecs' ``VLenUTF8`` codec: chunks are stored as a packed
    byte stream and decoded back into a string-dtype numpy array.
    """

    @classmethod
    def from_dict(cls, data: dict[str, JSON]) -> Self:
        """Build a codec instance from its JSON metadata representation."""
        _, parsed_config = parse_named_configuration(
            data, "vlen-utf8", require_configuration=False
        )
        return cls(**(parsed_config or {}))

    def to_dict(self) -> dict[str, JSON]:
        """Return the JSON metadata representation of this codec."""
        return {"name": "vlen-utf8", "configuration": {}}

    def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
        # Stateless codec: nothing to adapt based on the array spec.
        return self

    async def _decode_single(
        self,
        chunk_bytes: Buffer,
        chunk_spec: ArraySpec,
    ) -> NDBuffer:
        """Decode one packed vlen-utf8 chunk into an NDBuffer of strings."""
        assert isinstance(chunk_bytes, Buffer)

        decoded_objects = vlen_utf8_codec.decode(chunk_bytes.as_array_like())
        assert decoded_objects.dtype == np.object_
        decoded_objects.shape = chunk_spec.shape
        # coming out of the code, we know this is safe, so don't issue a warning
        as_string_dtype = cast_to_string_dtype(decoded_objects, safe=True)
        return chunk_spec.prototype.nd_buffer.from_numpy_array(as_string_dtype)

    async def _encode_single(
        self,
        chunk_array: NDBuffer,
        chunk_spec: ArraySpec,
    ) -> Buffer | None:
        """Encode one NDBuffer of strings into a packed vlen-utf8 byte stream."""
        assert isinstance(chunk_array, NDBuffer)
        encoded = vlen_utf8_codec.encode(chunk_array.as_numpy_array())
        return chunk_spec.prototype.buffer.from_bytes(encoded)

    def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) -> int:
        # Variable-length data has no predictable encoded size
        # (what is input_byte_length for an object dtype?).
        raise NotImplementedError("compute_encoded_size is not implemented for VLen codecs")
|
||
|
||
@dataclass(frozen=True)
class VLenBytesCodec(ArrayBytesCodec):
    """Array-to-bytes codec for variable-length byte strings.

    Wraps numcodecs' ``VLenBytes`` codec: chunks are stored as a packed
    byte stream and decoded back into an object-dtype numpy array.
    """

    @classmethod
    def from_dict(cls, data: dict[str, JSON]) -> Self:
        """Build a codec instance from its JSON metadata representation."""
        _, parsed_config = parse_named_configuration(
            data, "vlen-bytes", require_configuration=False
        )
        return cls(**(parsed_config or {}))

    def to_dict(self) -> dict[str, JSON]:
        """Return the JSON metadata representation of this codec."""
        return {"name": "vlen-bytes", "configuration": {}}

    def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
        # Stateless codec: nothing to adapt based on the array spec.
        return self

    async def _decode_single(
        self,
        chunk_bytes: Buffer,
        chunk_spec: ArraySpec,
    ) -> NDBuffer:
        """Decode one packed vlen-bytes chunk into an object-dtype NDBuffer."""
        assert isinstance(chunk_bytes, Buffer)

        decoded_objects = vlen_bytes_codec.decode(chunk_bytes.as_array_like())
        assert decoded_objects.dtype == np.object_
        decoded_objects.shape = chunk_spec.shape
        return chunk_spec.prototype.nd_buffer.from_numpy_array(decoded_objects)

    async def _encode_single(
        self,
        chunk_array: NDBuffer,
        chunk_spec: ArraySpec,
    ) -> Buffer | None:
        """Encode one NDBuffer of byte strings into a packed vlen-bytes stream."""
        assert isinstance(chunk_array, NDBuffer)
        encoded = vlen_bytes_codec.encode(chunk_array.as_numpy_array())
        return chunk_spec.prototype.buffer.from_bytes(encoded)

    def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) -> int:
        # Variable-length data has no predictable encoded size
        # (what is input_byte_length for an object dtype?).
        raise NotImplementedError("compute_encoded_size is not implemented for VLen codecs")
|
||
|
||
# Make both codecs discoverable in the codec registry under their spec names.
register_codec("vlen-utf8", VLenUTF8Codec)
register_codec("vlen-bytes", VLenBytesCodec)
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -313,8 +313,7 @@ class NDBuffer: | |
""" | ||
|
||
def __init__(self, array: NDArrayLike) -> None:
    # NOTE(review): this diff relaxes the former `assert array.dtype != object`
    # guard so NDBuffer can hold object-dtype arrays for the vlen codecs;
    # the old checks are kept below as commented history.
    # assert array.ndim > 0
    # assert array.dtype != object
    self._data = array
|
||
@classmethod | ||
|
@@ -467,9 +466,12 @@ def all_equal(self, other: Any, equal_nan: bool = True) -> bool: | |
# Handle None fill_value for Zarr V2 | ||
return False | ||
# use array_equal to obtain equal_nan=True functionality | ||
# Note from Ryan: doesn't this lead to a huge amount of unnecessary memory allocation on every single chunk? | ||
|
||
# Since fill-value is a scalar, isn't there a faster path than allocating a new array for fill value | ||
# every single time we have to write data? | ||
_data, other = np.broadcast_arrays(self._data, other) | ||
return np.array_equal( | ||
self._data, other, equal_nan=equal_nan if self._data.dtype.kind not in "US" else False | ||
self._data, other, equal_nan=equal_nan if self._data.dtype.kind not in "USTO" else False | ||
TomAugspurger marked this conversation as resolved.
Show resolved
Hide resolved
|
||
) | ||
|
||
def fill(self, value: Any) -> None: | ||
|
Uh oh!
There was an error while loading. Please reload this page.