Skip to content

Commit 7e95206

Browse files
refactor: use zarrs V2 to V3 metadata conversion (#109)
* refactor: use `zarrs` V2 to V3 metadata conversion

  Do the Zarr V2 to V3 metadata conversion entirely on the Rust side. Fixes order/dtype handling for some V2 data.

* simplify

---------

Co-authored-by: Philipp A. <[email protected]>
1 parent 1eb03aa commit 7e95206

File tree

5 files changed

+46
-100
lines changed

5 files changed

+46
-100
lines changed

python/zarrs/_internal.pyi

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,3 @@ class WithSubset:
4141
subset: typing.Sequence[slice],
4242
shape: typing.Sequence[builtins.int],
4343
) -> WithSubset: ...
44-
45-
def codec_metadata_v2_to_v3(
46-
filters: typing.Sequence[builtins.str] | None, compressor: builtins.str | None
47-
) -> builtins.list[builtins.str]: ...

python/zarrs/pipeline.py

Lines changed: 3 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,8 @@
1414
from zarr.core.metadata import ArrayMetadata, ArrayV2Metadata, ArrayV3Metadata
1515

1616
if TYPE_CHECKING:
17-
from collections.abc import Generator, Iterable, Iterator
18-
from typing import Any, Self
17+
from collections.abc import Iterable, Iterator
18+
from typing import Self
1919

2020
from zarr.abc.store import ByteGetter, ByteSetter, Store
2121
from zarr.core.array_spec import ArraySpec
@@ -24,7 +24,7 @@
2424
from zarr.core.indexing import SelectorTuple
2525
from zarr.dtype import ZDType
2626

27-
from ._internal import CodecPipelineImpl, codec_metadata_v2_to_v3
27+
from ._internal import CodecPipelineImpl
2828
from .utils import (
2929
CollapsedDimensionError,
3030
DiscontiguousArrayError,
@@ -66,29 +66,6 @@ def get_codec_pipeline_impl(
6666
return None
6767

6868

69-
def codecs_to_dict(codecs: Iterable[Codec]) -> Generator[dict[str, Any], None, None]:
70-
for codec in codecs:
71-
if codec.__class__.__name__ == "V2Codec":
72-
codec_dict = codec.to_dict()
73-
if codec_dict.get("filters", None) is not None:
74-
filters = [
75-
json.dumps(filter.get_config())
76-
for filter in codec_dict.get("filters")
77-
]
78-
else:
79-
filters = None
80-
if codec_dict.get("compressor", None) is not None:
81-
compressor_json = codec_dict.get("compressor").get_config()
82-
compressor = json.dumps(compressor_json)
83-
else:
84-
compressor = None
85-
codecs_v3 = codec_metadata_v2_to_v3(filters, compressor)
86-
for codec in codecs_v3:
87-
yield json.loads(codec)
88-
else:
89-
yield codec.to_dict()
90-
91-
9269
class ZarrsCodecPipelineState(TypedDict):
9370
codec_metadata_json: str
9471
codecs: tuple[Codec, ...]

src/concurrency.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
1-
use pyo3::{exceptions::PyRuntimeError, PyErr, PyResult};
1+
use pyo3::{exceptions::PyRuntimeError, PyResult};
22
use zarrs::array::{
33
codec::CodecOptions, concurrency::calc_concurrency_outer_inner, ArrayCodecTraits,
44
RecommendedConcurrency,
55
};
66

7-
use crate::{chunk_item::ChunksItem, CodecPipelineImpl};
7+
use crate::{chunk_item::ChunksItem, utils::PyErrExt as _, CodecPipelineImpl};
88

99
pub trait ChunkConcurrentLimitAndCodecOptions {
1010
fn get_chunk_concurrent_limit_and_codec_options(
@@ -30,7 +30,7 @@ where
3030
let codec_concurrency = codec_pipeline_impl
3131
.codec_chain
3232
.recommended_concurrency(chunk_representation)
33-
.map_err(|err| PyErr::new::<PyRuntimeError, _>(err.to_string()))?;
33+
.map_py_err::<PyRuntimeError>()?;
3434

3535
let min_concurrent_chunks =
3636
std::cmp::min(codec_pipeline_impl.chunk_concurrent_minimum, num_chunks);

src/lib.rs

Lines changed: 40 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -23,16 +23,20 @@ use zarrs::array::codec::{
2323
StoragePartialDecoder,
2424
};
2525
use zarrs::array::{
26-
copy_fill_value_into, update_array_bytes, Array, ArrayBytes, ArrayBytesFixedDisjointView,
26+
copy_fill_value_into, update_array_bytes, ArrayBytes, ArrayBytesFixedDisjointView,
2727
ArrayMetadata, ArraySize, CodecChain, FillValue,
2828
};
2929
use zarrs::array_subset::ArraySubset;
30-
use zarrs::storage::store::MemoryStore;
30+
use zarrs::config::global_config;
31+
use zarrs::metadata::v2::data_type_metadata_v2_to_endianness;
32+
use zarrs::metadata::v3::MetadataV3;
33+
use zarrs::metadata_ext::v2_to_v3::{
34+
codec_metadata_v2_to_v3, data_type_metadata_v2_to_v3, ArrayMetadataV2ToV3Error,
35+
};
3136
use zarrs::storage::{ReadableWritableListableStorage, StorageHandle, StoreKey};
3237

3338
mod chunk_item;
3439
mod concurrency;
35-
mod metadata_v2;
3640
mod runtime;
3741
mod store;
3842
#[cfg(test)]
@@ -41,7 +45,6 @@ mod utils;
4145

4246
use crate::chunk_item::ChunksItem;
4347
use crate::concurrency::ChunkConcurrentLimitAndCodecOptions;
44-
use crate::metadata_v2::codec_metadata_v2_to_v3;
4548
use crate::store::StoreConfig;
4649
use crate::utils::{PyErrExt as _, PyUntypedArrayExt as _};
4750

@@ -203,6 +206,35 @@ impl CodecPipelineImpl {
203206
}
204207
}
205208

209+
fn array_metadata_to_codec_metadata_v3(
210+
metadata: ArrayMetadata,
211+
) -> Result<Vec<MetadataV3>, ArrayMetadataV2ToV3Error> {
212+
match metadata {
213+
ArrayMetadata::V3(metadata) => Ok(metadata.codecs),
214+
ArrayMetadata::V2(metadata) => {
215+
let config = global_config();
216+
let endianness = data_type_metadata_v2_to_endianness(&metadata.dtype)
217+
.map_err(ArrayMetadataV2ToV3Error::InvalidEndianness)?;
218+
let data_type = data_type_metadata_v2_to_v3(
219+
&metadata.dtype,
220+
config.data_type_aliases_v2(),
221+
config.data_type_aliases_v3(),
222+
)?;
223+
224+
codec_metadata_v2_to_v3(
225+
metadata.order,
226+
metadata.shape.len(),
227+
&data_type,
228+
endianness,
229+
&metadata.filters,
230+
&metadata.compressor,
231+
config.codec_aliases_v2(),
232+
config.codec_aliases_v3(),
233+
)
234+
}
235+
}
236+
}
237+
206238
#[gen_stub_pymethods]
207239
#[pymethods]
208240
impl CodecPipelineImpl {
@@ -226,11 +258,10 @@ impl CodecPipelineImpl {
226258
) -> PyResult<Self> {
227259
let metadata: ArrayMetadata =
228260
serde_json::from_str(array_metadata).map_py_err::<PyTypeError>()?;
229-
230-
// TODO: Add a direct metadata -> codec chain method to zarrs
231-
let store = Arc::new(MemoryStore::new());
232-
let array = Array::new_with_metadata(store, "/", metadata).map_py_err::<PyTypeError>()?;
233-
let codec_chain = Arc::new(array.codecs().clone());
261+
let codec_metadata =
262+
array_metadata_to_codec_metadata_v3(metadata).map_py_err::<PyTypeError>()?;
263+
let codec_chain =
264+
Arc::new(CodecChain::from_metadata(&codec_metadata).map_py_err::<PyTypeError>()?);
234265

235266
let mut codec_options = CodecOptionsBuilder::new();
236267
if let Some(validate_checksums) = validate_checksums {
@@ -470,7 +501,6 @@ fn _internal(m: &Bound<'_, PyModule>) -> PyResult<()> {
470501
m.add_class::<CodecPipelineImpl>()?;
471502
m.add_class::<chunk_item::Basic>()?;
472503
m.add_class::<chunk_item::WithSubset>()?;
473-
m.add_function(wrap_pyfunction!(codec_metadata_v2_to_v3, m)?)?;
474504
Ok(())
475505
}
476506

src/metadata_v2.rs

Lines changed: 0 additions & 57 deletions
This file was deleted.

0 commit comments

Comments
 (0)