Skip to content

Commit 7b0083e

Browse files
Raise error on concatenation with partial boundary chunks (#594)
* Test concatenation with partial boundary chunk * Add check on concatenation * Add a note to custom reader design docs about boundary chunks * Test roundtripping HDF5 with partial chunk Co-authored-by: Tom Nicholas <[email protected]> * Update error message --------- Co-authored-by: Tom Nicholas <[email protected]>
1 parent 627a904 commit 7b0083e

File tree

6 files changed

+51
-1
lines changed

6 files changed

+51
-1
lines changed

docs/custom_readers.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,10 @@ Generally you want to follow steps like this:
9090
7. Group `ManifestArrays` up into one or more `ManifestGroup` objects. Ideally you would only have one group, but your format's data model may preclude that. If there is group-level metadata attach this to the `ManifestGroup` object as a `zarr.metadata.GroupMetadata` object. Remember that `ManifestGroups` can contain other groups as well as arrays.
9191
8. Instantiate the final `ManifestStore` using the top-most `ManifestGroup` and return it.
9292

93+
```{note}
94+
The [regular chunk grid](https://github.com/zarr-developers/zarr-specs/blob/main/docs/v3/chunk-grids/regular-grid/index.rst) for Zarr V3 data expects that chunks at the border of an array always have the full chunk size, even when the array only covers parts of it. For example, having an array with ``"shape": [30, 30]`` and ``"chunk_shape": [16, 16]``, the chunk ``0,1`` covers indices ``0-15`` along the first axis and ``16-31`` along the second, so it also contains unused values at indices ``30-31`` of the second axis. If the file format that you are virtualizing does not fill in partial chunks, it is recommended that you raise a `ValueError` until Zarr supports [variable chunk sizes](https://github.com/orgs/zarr-developers/discussions/52).
95+
```
96+
9397
### Parsing a pre-existing index file
9498

9599
A custom reader can parse multiple files, perhaps by passing a glob string and looking for expected file naming conventions, or by passing additional reader-specific keyword arguments.

virtualizarr/manifests/array_api.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from .manifest import ChunkManifest
88
from .utils import (
99
check_combinable_zarr_arrays,
10+
check_no_partial_chunks_on_concat_axis,
1011
check_same_ndims,
1112
check_same_shapes,
1213
check_same_shapes_except_on_concat_axis,
@@ -75,7 +76,9 @@ def concatenate(
7576
axis = axis % first_arr.ndim
7677

7778
arr_shapes = [arr.shape for arr in arrays]
79+
arr_chunks = [arr.chunks for arr in arrays]
7880
check_same_shapes_except_on_concat_axis(arr_shapes, axis)
81+
check_no_partial_chunks_on_concat_axis(arr_shapes, arr_chunks, axis)
7982

8083
# find what new array shape must be
8184
new_length_along_concat_axis = sum([shape[axis] for shape in arr_shapes])

virtualizarr/manifests/utils.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,20 @@ def _remove_element_at_position(t: tuple[int, ...], pos: int) -> tuple[int, ...]
157157
return tuple(new_l)
158158

159159

160+
def check_no_partial_chunks_on_concat_axis(
    shapes: list[tuple[int, ...]], chunks: list[tuple[int, ...]], axis: int
):
    """Check that there are no partial chunks along the concatenation axis.

    Raises a ``ValueError`` if any input array's length along ``axis`` is not
    an exact multiple of its chunk length there, since a regular chunk grid
    cannot represent a partial boundary chunk in the interior of the result.
    """
    for idx, (arr_shape, arr_chunks) in enumerate(zip(shapes, chunks)):
        length = arr_shape[axis]
        chunk_len = arr_chunks[axis]
        # A non-zero remainder means the trailing chunk only partially
        # covers the array along the concat axis.
        if length % chunk_len:
            raise ValueError(
                "Cannot concatenate arrays with partial chunks because only regular chunk grids are currently supported. "
                f"Concat input {idx} has array length {length} along the concatenation axis which is not "
                f"evenly divisible by chunk length {chunk_len}."
            )
160174
def check_same_shapes_except_on_concat_axis(shapes: list[tuple[int, ...]], axis: int):
161175
"""Check that shapes are compatible for concatenation"""
162176

virtualizarr/tests/test_manifests/test_array.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -385,6 +385,17 @@ def test_refuse_combine(array_v3_metadata):
385385
with pytest.raises(ValueError, match="inconsistent dtypes"):
386386
func([marr1, marr2], axis=0)
387387

388+
metadata_variable_chunk = array_v3_metadata(
389+
shape=shape,
390+
chunks=(4, 1, 19),
391+
)
392+
marr = ManifestArray(metadata=metadata_variable_chunk, chunkmanifest=chunkmanifest2)
393+
with pytest.raises(
394+
ValueError,
395+
match="Cannot concatenate arrays with partial chunks because only regular chunk grids are currently supported. Concat input 0 has array length 5 along the concatenation axis which is not evenly divisible by chunk length 4.",
396+
):
397+
np.concatenate([marr, marr], axis=0)
398+
388399

389400
class TestIndexing:
390401
@pytest.mark.parametrize("dodgy_indexer", ["string", [], (5, "string")])

virtualizarr/tests/test_readers/test_hdf/test_hdf_integration.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,9 @@
1010
requires_imagecodecs,
1111
requires_kerchunk,
1212
)
13-
from virtualizarr.tests.test_integration import roundtrip_as_in_memory_icechunk
13+
from virtualizarr.tests.test_integration import (
14+
roundtrip_as_in_memory_icechunk,
15+
)
1416

1517

1618
@requires_kerchunk

virtualizarr/tests/test_readers/test_hdf/test_hdf_manifest_store.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,22 @@ def test_rountrip_simple_virtualdataset(self, tmpdir, basic_ds):
4040
)
4141
xr.testing.assert_allclose(basic_ds, rountripped_ds)
4242

43+
def test_rountrip_partial_chunk_virtualdataset(self, tmpdir, basic_ds):
    """Roundtrip a dataset whose final chunk only partially covers the array.

    Writes ``basic_ds`` to NetCDF with a (90, 90) chunk shape over a
    (100, 100) array, opens it back through the HDF reader's
    ``ManifestStore`` via the zarr engine, and checks the values survive
    the trip.
    """
    filepath = f"{tmpdir}/basic_ds_roundtrip.nc"
    # chunksizes do not evenly divide original_shape -> boundary chunk is partial
    basic_ds.to_netcdf(
        filepath,
        engine="h5netcdf",
        encoding={
            "temperature": {"chunksizes": (90, 90), "original_shape": (100, 100)}
        },
    )
    manifest_store = HDFVirtualBackend._create_manifest_store(
        filepath=filepath,
    )
    actual = xr.open_dataset(
        manifest_store, engine="zarr", consolidated=False, zarr_format=3
    )
    xr.testing.assert_allclose(basic_ds, actual)
4359
def test_rountrip_simple_virtualdataset_default_store(self, tmpdir, basic_ds):
4460
"Roundtrip a dataset to/from NetCDF with the HDF reader and ManifestStore"
4561

0 commit comments

Comments
 (0)