Commit 1e30a28

Merge branch 'main' into ci/expand-ci-matrix

2 parents f4f74fc + f4af51c

File tree

14 files changed: +485 −109 lines changed

src/zarr/api/asynchronous.py

Lines changed: 6 additions & 2 deletions
@@ -396,12 +396,16 @@ async def save_array(

     mode = kwargs.pop("mode", None)
     store_path = await make_store_path(store, path=path, mode=mode, storage_options=storage_options)
+    if np.isscalar(arr):
+        arr = np.array(arr)
+    shape = arr.shape
+    chunks = getattr(arr, "chunks", None)  # for array-likes with chunks attribute
     new = await AsyncArray.create(
         store_path,
         zarr_format=zarr_format,
-        shape=arr.shape,
+        shape=shape,
         dtype=arr.dtype,
-        chunks=arr.shape,
+        chunks=chunks,
         **kwargs,
     )
     await new.setitem(slice(None), arr)
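Note: with this change, `save_array` accepts scalars (wrapped into a 0-d array) and picks up a `chunks` attribute from chunked array-likes instead of writing one chunk spanning the whole shape. A minimal usage sketch, not part of the diff; it assumes the public `zarr.save_array` wrapper forwards to this coroutine and that local paths are valid stores:

```python
import numpy as np
import zarr

# a scalar is wrapped via np.array(...) before AsyncArray.create is called
zarr.save_array("data/scalar.zarr", np.float64(42.0))

# a plain ndarray has no `chunks` attribute, so chunks falls back to None
# and default chunking is used instead of one whole-array chunk
zarr.save_array("data/grid.zarr", np.arange(10_000, dtype="i4").reshape(100, 100))
```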

src/zarr/codecs/_v2.py

Lines changed: 48 additions & 67 deletions
@@ -5,20 +5,21 @@
 from typing import TYPE_CHECKING

 import numcodecs
-from numcodecs.compat import ensure_bytes, ensure_ndarray
+from numcodecs.compat import ensure_ndarray_like

-from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec
-from zarr.core.buffer import Buffer, NDBuffer, default_buffer_prototype
+from zarr.abc.codec import ArrayBytesCodec
 from zarr.registry import get_ndbuffer_class

 if TYPE_CHECKING:
     import numcodecs.abc

     from zarr.core.array_spec import ArraySpec
+    from zarr.core.buffer import Buffer, NDBuffer


 @dataclass(frozen=True)
-class V2Compressor(ArrayBytesCodec):
+class V2Codec(ArrayBytesCodec):
+    filters: tuple[numcodecs.abc.Codec, ...] | None
     compressor: numcodecs.abc.Codec | None

     is_fixed_size = False
@@ -28,81 +29,61 @@ async def _decode_single(
         chunk_bytes: Buffer,
         chunk_spec: ArraySpec,
     ) -> NDBuffer:
-        if self.compressor is not None:
-            chunk_numpy_array = ensure_ndarray(
-                await asyncio.to_thread(self.compressor.decode, chunk_bytes.as_array_like())
-            )
+        cdata = chunk_bytes.as_array_like()
+        # decompress
+        if self.compressor:
+            chunk = await asyncio.to_thread(self.compressor.decode, cdata)
         else:
-            chunk_numpy_array = ensure_ndarray(chunk_bytes.as_array_like())
+            chunk = cdata
+
+        # apply filters
+        if self.filters:
+            for f in reversed(self.filters):
+                chunk = await asyncio.to_thread(f.decode, chunk)
+
+        # view as numpy array with correct dtype
+        chunk = ensure_ndarray_like(chunk)
+        # special case object dtype, because incorrect handling can lead to
+        # segfaults and other bad things happening
+        if chunk_spec.dtype != object:
+            chunk = chunk.view(chunk_spec.dtype)
+        elif chunk.dtype != object:
+            # If we end up here, someone must have hacked around with the filters.
+            # We cannot deal with object arrays unless there is an object
+            # codec in the filter chain, i.e., a filter that converts from object
+            # array to something else during encoding, and converts back to object
+            # array during decoding.
+            raise RuntimeError("cannot read object array without object codec")

-        # ensure correct dtype
-        if str(chunk_numpy_array.dtype) != chunk_spec.dtype and not chunk_spec.dtype.hasobject:
-            chunk_numpy_array = chunk_numpy_array.view(chunk_spec.dtype)
+        # ensure correct chunk shape
+        chunk = chunk.reshape(-1, order="A")
+        chunk = chunk.reshape(chunk_spec.shape, order=chunk_spec.order)

-        return get_ndbuffer_class().from_numpy_array(chunk_numpy_array)
+        return get_ndbuffer_class().from_ndarray_like(chunk)

     async def _encode_single(
-        self,
-        chunk_array: NDBuffer,
-        _chunk_spec: ArraySpec,
-    ) -> Buffer | None:
-        chunk_numpy_array = chunk_array.as_numpy_array()
-        if self.compressor is not None:
-            if (
-                not chunk_numpy_array.flags.c_contiguous
-                and not chunk_numpy_array.flags.f_contiguous
-            ):
-                chunk_numpy_array = chunk_numpy_array.copy(order="A")
-            encoded_chunk_bytes = ensure_bytes(
-                await asyncio.to_thread(self.compressor.encode, chunk_numpy_array)
-            )
-        else:
-            encoded_chunk_bytes = ensure_bytes(chunk_numpy_array)
-
-        return default_buffer_prototype().buffer.from_bytes(encoded_chunk_bytes)
-
-    def compute_encoded_size(self, _input_byte_length: int, _chunk_spec: ArraySpec) -> int:
-        raise NotImplementedError
-
-
-@dataclass(frozen=True)
-class V2Filters(ArrayArrayCodec):
-    filters: tuple[numcodecs.abc.Codec, ...] | None
-
-    is_fixed_size = False
-
-    async def _decode_single(
         self,
         chunk_array: NDBuffer,
         chunk_spec: ArraySpec,
-    ) -> NDBuffer:
-        chunk_ndarray = chunk_array.as_ndarray_like()
-        # apply filters in reverse order
-        if self.filters is not None:
-            for filter in self.filters[::-1]:
-                chunk_ndarray = await asyncio.to_thread(filter.decode, chunk_ndarray)
-
-        # ensure correct chunk shape
-        if chunk_ndarray.shape != chunk_spec.shape:
-            chunk_ndarray = chunk_ndarray.reshape(
-                chunk_spec.shape,
-                order=chunk_spec.order,
-            )
+    ) -> Buffer | None:
+        chunk = chunk_array.as_ndarray_like()

-        return get_ndbuffer_class().from_ndarray_like(chunk_ndarray)
+        # apply filters
+        if self.filters:
+            for f in self.filters:
+                chunk = await asyncio.to_thread(f.encode, chunk)

-    async def _encode_single(
-        self,
-        chunk_array: NDBuffer,
-        chunk_spec: ArraySpec,
-    ) -> NDBuffer | None:
-        chunk_ndarray = chunk_array.as_ndarray_like().ravel(order=chunk_spec.order)
+        # check object encoding
+        if ensure_ndarray_like(chunk).dtype == object:
+            raise RuntimeError("cannot write object array without object codec")

-        if self.filters is not None:
-            for filter in self.filters:
-                chunk_ndarray = await asyncio.to_thread(filter.encode, chunk_ndarray)
+        # compress
+        if self.compressor:
+            cdata = await asyncio.to_thread(self.compressor.encode, chunk)
+        else:
+            cdata = chunk

-        return get_ndbuffer_class().from_ndarray_like(chunk_ndarray)
+        return chunk_spec.prototype.buffer.from_bytes(cdata)

     def compute_encoded_size(self, _input_byte_length: int, _chunk_spec: ArraySpec) -> int:
         raise NotImplementedError
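Note: `V2Codec` folds the former `V2Filters` and `V2Compressor` into one `ArrayBytesCodec`, preserving the zarr v2 ordering: encode applies filters first and compresses last; decode decompresses first and applies filters in reverse. A standalone sketch of that ordering using only numcodecs; the Delta and Zlib codecs are illustrative choices, not something this diff prescribes:

```python
import numpy as np
from numcodecs import Delta, Zlib
from numcodecs.compat import ensure_ndarray_like

filters = [Delta(dtype="i4")]
compressor = Zlib(level=1)
arr = np.arange(16, dtype="i4")

# encode path: filters in declaration order, then the compressor
chunk = arr
for f in filters:
    chunk = f.encode(chunk)
cdata = compressor.encode(chunk)

# decode path: decompress, then filters in reverse order
chunk = compressor.decode(cdata)
for f in reversed(filters):
    chunk = f.decode(chunk)
out = ensure_ndarray_like(chunk).view("i4").reshape(arr.shape)
assert np.array_equal(out, arr)
```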

src/zarr/core/array.py

Lines changed: 115 additions & 22 deletions
@@ -2,7 +2,7 @@

 import json
 from asyncio import gather
-from dataclasses import dataclass, field, replace
+from dataclasses import dataclass, field
 from itertools import starmap
 from logging import getLogger
 from typing import TYPE_CHECKING, Any, Generic, Literal, cast, overload
@@ -13,7 +13,7 @@
 from zarr._compat import _deprecate_positional_args
 from zarr.abc.store import Store, set_or_delete
 from zarr.codecs import _get_default_array_bytes_codec
-from zarr.codecs._v2 import V2Compressor, V2Filters
+from zarr.codecs._v2 import V2Codec
 from zarr.core.attributes import Attributes
 from zarr.core.buffer import (
     BufferPrototype,
@@ -118,9 +118,8 @@ def create_codec_pipeline(metadata: ArrayMetadata) -> CodecPipeline:
     if isinstance(metadata, ArrayV3Metadata):
         return get_pipeline_class().from_codecs(metadata.codecs)
     elif isinstance(metadata, ArrayV2Metadata):
-        return get_pipeline_class().from_codecs(
-            [V2Filters(metadata.filters), V2Compressor(metadata.compressor)]
-        )
+        v2_codec = V2Codec(filters=metadata.filters, compressor=metadata.compressor)
+        return get_pipeline_class().from_codecs([v2_codec])
     else:
         raise TypeError

@@ -1104,15 +1103,15 @@ async def setitem(
         )
         return await self._set_selection(indexer, value, prototype=prototype)

-    async def resize(self, new_shape: ChunkCoords, delete_outside_chunks: bool = True) -> Self:
+    async def resize(self, new_shape: ShapeLike, delete_outside_chunks: bool = True) -> None:
+        new_shape = parse_shapelike(new_shape)
         assert len(new_shape) == len(self.metadata.shape)
         new_metadata = self.metadata.update_shape(new_shape)

-        # Remove all chunks outside of the new shape
-        old_chunk_coords = set(self.metadata.chunk_grid.all_chunk_coords(self.metadata.shape))
-        new_chunk_coords = set(self.metadata.chunk_grid.all_chunk_coords(new_shape))
-
         if delete_outside_chunks:
+            # Remove all chunks outside of the new shape
+            old_chunk_coords = set(self.metadata.chunk_grid.all_chunk_coords(self.metadata.shape))
+            new_chunk_coords = set(self.metadata.chunk_grid.all_chunk_coords(new_shape))

             async def _delete_key(key: str) -> None:
                 await (self.store_path / key).delete()
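Note: hoisting the chunk-coordinate sets under `delete_outside_chunks` skips that work entirely when deletion is disabled. The deletion itself is a set difference over chunk grid coordinates; a toy illustration with made-up grid sizes:

```python
# a 4x4 chunk grid shrunk to 2x2: every coordinate outside the new grid
# identifies a chunk key that gets deleted from the store
old_chunk_coords = {(i, j) for i in range(4) for j in range(4)}
new_chunk_coords = {(i, j) for i in range(2) for j in range(2)}

to_delete = old_chunk_coords - new_chunk_coords
assert len(to_delete) == 12
```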
@@ -1128,7 +1127,63 @@ async def _delete_key(key: str) -> None:

         # Write new metadata
         await self._save_metadata(new_metadata)
-        return replace(self, metadata=new_metadata)
+
+        # Update metadata (in place)
+        object.__setattr__(self, "metadata", new_metadata)
+
+    async def append(self, data: npt.ArrayLike, axis: int = 0) -> ChunkCoords:
+        """Append `data` to `axis`.
+
+        Parameters
+        ----------
+        data : array-like
+            Data to be appended.
+        axis : int
+            Axis along which to append.
+
+        Returns
+        -------
+        new_shape : tuple
+
+        Notes
+        -----
+        The size of all dimensions other than `axis` must match between this
+        array and `data`.
+        """
+        # ensure data is array-like
+        if not hasattr(data, "shape"):
+            data = np.asanyarray(data)
+
+        self_shape_preserved = tuple(s for i, s in enumerate(self.shape) if i != axis)
+        data_shape_preserved = tuple(s for i, s in enumerate(data.shape) if i != axis)
+        if self_shape_preserved != data_shape_preserved:
+            raise ValueError(
+                f"shape of data to append is not compatible with the array. "
+                f"The shape of the data is ({data_shape_preserved}) "
+                f"and the shape of the array is ({self_shape_preserved}). "
+                "All dimensions must match except for the dimension being "
+                "appended."
+            )
+        # remember old shape
+        old_shape = self.shape
+
+        # determine new shape
+        new_shape = tuple(
+            self.shape[i] if i != axis else self.shape[i] + data.shape[i]
+            for i in range(len(self.shape))
+        )
+
+        # resize
+        await self.resize(new_shape)
+
+        # store data
+        append_selection = tuple(
+            slice(None) if i != axis else slice(old_shape[i], new_shape[i])
+            for i in range(len(self.shape))
+        )
+        await self.setitem(append_selection, data)
+
+        return new_shape

     async def update_attributes(self, new_attributes: dict[str, JSON]) -> Self:
         # metadata.attributes is "frozen" so we simply clear and update the dict
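Note: the shape arithmetic in `append` is plain tuple manipulation; only `axis` grows, and the write targets exactly the region the resize added. A worked example with assumed shapes:

```python
old_shape = (10, 5)   # existing array
data_shape = (3, 5)   # data being appended
axis = 0

new_shape = tuple(
    old_shape[i] if i != axis else old_shape[i] + data_shape[i]
    for i in range(len(old_shape))
)
assert new_shape == (13, 5)

# the selection covers only the freshly grown region along `axis`
append_selection = tuple(
    slice(None) if i != axis else slice(old_shape[i], new_shape[i])
    for i in range(len(old_shape))
)
assert append_selection == (slice(None), slice(10, 13))
```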
@@ -1147,7 +1202,8 @@ async def info(self) -> None:
         raise NotImplementedError


-@dataclass(frozen=True)
+# TODO: Array can be a frozen data class again once property setters (e.g. shape) are removed
+@dataclass(frozen=False)
 class Array:
     """Instantiate an array from an initialized store."""
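Note: the TODO exists because a frozen dataclass rejects ordinary attribute assignment, which the new `shape` setter needs; `AsyncArray` stays frozen and mutates its metadata through `object.__setattr__`. A small sketch of the distinction:

```python
from dataclasses import FrozenInstanceError, dataclass

@dataclass(frozen=True)
class Frozen:
    x: int

f = Frozen(1)
try:
    f.x = 2  # blocked by the frozen dataclass machinery
except FrozenInstanceError:
    pass
object.__setattr__(f, "x", 2)  # the escape hatch resize uses on AsyncArray
assert f.x == 2
```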

@@ -1297,6 +1353,11 @@ def shape(self) -> ChunkCoords:
         """
         return self._async_array.shape

+    @shape.setter
+    def shape(self, value: ChunkCoords) -> None:
+        """Sets the shape of the array by calling resize."""
+        self.resize(value)
+
     @property
     def chunks(self) -> ChunkCoords:
         """Returns a tuple of integers describing the length of each dimension of a chunk of the array.
@@ -2754,18 +2815,18 @@ def blocks(self) -> BlockIndex:
         :func:`set_block_selection` for documentation and examples."""
         return BlockIndex(self)

-    def resize(self, new_shape: ChunkCoords) -> Array:
+    def resize(self, new_shape: ShapeLike) -> None:
         """
         Change the shape of the array by growing or shrinking one or more
         dimensions.

-        This method does not modify the original Array object. Instead, it returns a new Array
-        with the specified shape.
+        Parameters
+        ----------
+        new_shape : tuple
+            New shape of the array.

         Notes
         -----
-        When resizing an array, the data are not rearranged in any way.
-
         If one or more dimensions are shrunk, any chunks falling outside the
         new array shape will be deleted from the underlying store.
         However, it is noteworthy that the chunks partially falling inside the new array
@@ -2778,7 +2839,6 @@ def resize(self, new_shape: ChunkCoords) -> Array:
         >>> import zarr
         >>> z = zarr.zeros(shape=(10000, 10000),
         >>>               chunk_shape=(1000, 1000),
-        >>>               store=StorePath(MemoryStore(mode="w")),
         >>>               dtype="i4",)
         >>> z.shape
         (10000, 10000)
@@ -2791,10 +2851,43 @@
         >>> z2.shape
         (50, 50)
         """
-        resized = sync(self._async_array.resize(new_shape))
-        # TODO: remove this cast when type inference improves
-        _resized = cast(AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata], resized)
-        return type(self)(_resized)
+        sync(self._async_array.resize(new_shape))
+
+    def append(self, data: npt.ArrayLike, axis: int = 0) -> ChunkCoords:
+        """Append `data` to `axis`.
+
+        Parameters
+        ----------
+        data : array-like
+            Data to be appended.
+        axis : int
+            Axis along which to append.
+
+        Returns
+        -------
+        new_shape : tuple
+
+        Notes
+        -----
+        The size of all dimensions other than `axis` must match between this
+        array and `data`.
+
+        Examples
+        --------
+        >>> import numpy as np
+        >>> import zarr
+        >>> a = np.arange(10000000, dtype='i4').reshape(10000, 1000)
+        >>> z = zarr.array(a, chunks=(1000, 100))
+        >>> z.shape
+        (10000, 1000)
+        >>> z.append(a)
+        (20000, 1000)
+        >>> z.append(np.vstack([a, a]), axis=1)
+        (20000, 2000)
+        >>> z.shape
+        (20000, 2000)
+        """
+        return sync(self._async_array.append(data, axis=axis))

     def update_attributes(self, new_attributes: dict[str, JSON]) -> Array:
         # TODO: remove this cast when type inference improves
