Skip to content

Commit 76bc343

Browse files
committed
Checkpoint
1 parent 24ee6cf commit 76bc343

File tree

19 files changed

+899
-1
lines changed

19 files changed

+899
-1
lines changed

clean.sh

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
#!/bin/bash
#
# Remove Python build, test, and tooling artifacts found below the current
# working directory. Every match under "." is deleted.

# Delete every directory named "$1" below the current directory.
# -prune stops find from descending into a directory that is being removed.
remove_dirs() {
  find . -type d -name "$1" -prune -exec rm -rf {} +
}

# Remove all __pycache__ directories
remove_dirs "__pycache__"

# Remove all coverage data files (.coverage and any .coverage* variants)
find . -type f -name ".coverage*" -exec rm -f {} +

# Remove all .pytest_cache directories (previously run twice; once is enough)
remove_dirs ".pytest_cache"

# Remove this project's egg-info directories only; the pattern is the exact
# name "multidimio.egg-info", so other *.egg-info directories are left alone.
remove_dirs "multidimio.egg-info"

# Remove all build directories
remove_dirs "build"

# Remove all dist directories
remove_dirs "dist"

# Remove all .nox directories
remove_dirs ".nox"

pyproject.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@ dependencies = [
2929
"dask (>=2024.12.0)",
3030
"tqdm (>=4.67.0,<5.0.0)",
3131
"psutil (>=6.1.0,<7.0.0)",
32+
"pydantic (>=2.8.2,<3.0.0)",
33+
"pydantic-settings (>=2.4.0,<3.0.0)",
3234
"fsspec (>=2024.10.0)",
3335
"segy (>=0.4.0,<0.5.0)",
3436
"rich (>=13.9.4,<14.0.0)",

src/mdio/schema/__init__.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
"""MDIO schemas for different data types."""
2+
3+
from mdio.schema.compressors import ZFP
4+
from mdio.schema.compressors import Blosc
5+
from mdio.schema.dimension import NamedDimension
6+
from mdio.schema.dtype import ScalarType
7+
from mdio.schema.dtype import StructuredField
8+
from mdio.schema.dtype import StructuredType
9+
10+
11+
__all__ = [
12+
"Blosc",
13+
"ZFP",
14+
"NamedDimension",
15+
"ScalarType",
16+
"StructuredField",
17+
"StructuredType",
18+
]

src/mdio/schema/base.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
"""Base models to subclass from."""
2+
3+
from pydantic import ConfigDict
4+
from pydantic import Field
5+
from pydantic.json_schema import GenerateJsonSchema
6+
7+
from mdio.schema.compressors import ZFP
8+
from mdio.schema.compressors import Blosc
9+
from mdio.schema.core import CamelCaseStrictModel
10+
from mdio.schema.dimension import NamedDimension
11+
from mdio.schema.dtype import DataTypeModel
12+
13+
14+
# JSON Schema dialect taken from Pydantic's default schema generator.
JSON_SCHEMA_DIALECT = GenerateJsonSchema.schema_dialect


class BaseDataset(CamelCaseStrictModel):
    """A base class for MDIO datasets.

    Extends the config of `CamelCaseStrictModel` by adding a ``$schema``
    entry to the generated JSON schema. The dialect value comes from the
    default Pydantic schema generator, `GenerateJsonSchema`, so that the
    advertised JSON schema dialect is accurate.
    """

    model_config = ConfigDict(json_schema_extra={"$schema": JSON_SCHEMA_DIALECT})
26+
27+
28+
class BaseArray(DataTypeModel, CamelCaseStrictModel):
    """Array schema with dimensions and an optional compressor.

    Also inherits from `DataTypeModel` (imported from `mdio.schema.dtype`).
    """

    # Accepts either full dimension definitions or plain dimension names.
    dimensions: list[NamedDimension] | list[str] = Field(
        ...,
        description="List of Dimension collection or reference to dimension names.",
    )

    # None when no compressor is configured.
    compressor: Blosc | ZFP | None = Field(
        default=None,
        description="Compression settings.",
    )
37+
38+
39+
class NamedArray(BaseArray):
    """A `BaseArray` that additionally carries a name."""

    # Required.
    name: str = Field(
        ...,
        description="Name of the array.",
    )

    # Optional; None when omitted.
    long_name: str | None = Field(
        default=None,
        description="Fully descriptive name.",
    )

src/mdio/schema/chunk_grid.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
"""This module contains data models for Zarr's chunk grid."""
2+
3+
from __future__ import annotations
4+
5+
from pydantic import Field
6+
7+
from mdio.schema.core import CamelCaseStrictModel
8+
9+
10+
class RegularChunkShape(CamelCaseStrictModel):
    """Chunk sizes for a regular grid: one integer per dimension."""

    chunk_shape: list[int] = Field(
        ...,
        description="Lengths of the chunk along each dimension of the array.",
    )
16+
17+
18+
class RectilinearChunkShape(CamelCaseStrictModel):
    """Chunk sizes for an irregular grid: a list of integers per dimension."""

    chunk_shape: list[list[int]] = Field(
        ..., description="Lengths of the chunk along each dimension of the array."
    )
25+
26+
27+
class RegularChunkGrid(CamelCaseStrictModel):
    """A rectangular chunk grid with regular spacing."""

    # Grid name; defaults to "regular".
    name: str = Field(
        default="regular",
        description="The name of the chunk grid.",
    )

    # Required chunk shape settings for the grid.
    configuration: RegularChunkShape = Field(
        ...,
        description="Configuration of the regular chunk grid.",
    )
35+
36+
37+
class RectilinearChunkGrid(CamelCaseStrictModel):
    """A rectangular chunk grid with irregular spacing."""

    # Grid name; defaults to "rectilinear".
    name: str = Field(
        default="rectilinear",
        description="The name of the chunk grid.",
    )

    # Required chunk shape settings for the grid.
    configuration: RectilinearChunkShape = Field(
        ...,
        description="Configuration of the irregular chunk grid.",
    )

src/mdio/schema/compressors.py

Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
"""This module contains a Pydantic model to parameterize compressors.
2+
3+
Important Objects:
4+
- Blosc: A Pydantic model that represents a Blosc compression setup.
5+
- ZFP: Class that represents the ZFP compression model.
6+
"""
7+
8+
from __future__ import annotations
9+
10+
from enum import IntEnum
11+
from enum import StrEnum
12+
13+
from pydantic import Field
14+
from pydantic import model_validator
15+
16+
from mdio.schema.core import CamelCaseStrictModel
17+
18+
19+
class BloscAlgorithm(StrEnum):
20+
"""Enum for Blosc algorithm options."""
21+
22+
BLOSCLZ = "blosclz"
23+
LZ4 = "lz4"
24+
LZ4HC = "lz4hc"
25+
ZLIB = "zlib"
26+
ZSTD = "zstd"
27+
28+
29+
class BloscShuffle(IntEnum):
    """Shuffle strategies that can be selected for Blosc.

    Members are integers; AUTOSHUFFLE is the only negative value.
    """

    NOSHUFFLE = 0
    SHUFFLE = 1
    BITSHUFFLE = 2
    AUTOSHUFFLE = -1
36+
37+
38+
class Blosc(CamelCaseStrictModel):
    """Data model for Blosc compression options."""

    name: str = Field(default="blosc", description="Name of the compressor.")
    algorithm: BloscAlgorithm = Field(
        default=BloscAlgorithm.LZ4,
        description="The Blosc compression algorithm to be used.",
    )
    # Constrained to the inclusive range 0-9.
    level: int = Field(default=5, ge=0, le=9, description="The compression level.")
    shuffle: BloscShuffle = Field(
        default=BloscShuffle.SHUFFLE,
        description="The shuffle strategy to be applied before compression.",
    )
    blocksize: int = Field(
        default=0,
        description="The size of the block to be used for compression.",
    )

    def make_instance(self):  # noqa: ANN201
        """Build the Blosc codec described by this model.

        Returns:
            A `zarr.codecs.Blosc` instance created from this model's
            algorithm, level, shuffle and blocksize.
        """
        # Local import; presumably keeps zarr out of the schema module's
        # import-time dependencies — TODO confirm.
        from zarr.codecs import Blosc as _Blosc

        return _Blosc(
            cname=self.algorithm,
            clevel=self.level,
            shuffle=self.shuffle,
            blocksize=self.blocksize,
        )
66+
67+
68+
zfp_mode_map = {
69+
"fixed_rate": 2,
70+
"fixed_precision": 3,
71+
"fixed_accuracy": 4,
72+
"reversible": 5,
73+
}
74+
75+
76+
class ZFPMode(StrEnum):
77+
"""Enum for ZFP algorithm modes."""
78+
79+
FIXED_RATE = "fixed_rate"
80+
FIXED_PRECISION = "fixed_precision"
81+
FIXED_ACCURACY = "fixed_accuracy"
82+
REVERSIBLE = "reversible"
83+
84+
@property
85+
def int_code(self) -> int:
86+
"""Return the integer code of ZFP mode."""
87+
return zfp_mode_map[self.value]
88+
89+
90+
class ZFP(CamelCaseStrictModel):
    """Data model for ZFP compression options.

    Which of ``tolerance``, ``rate`` and ``precision`` must be provided
    depends on ``mode``; the rules are enforced by `check_requirements`.
    """

    name: str = Field(default="zfp", description="Name of the compressor.")
    mode: ZFPMode = Field(..., description="The ZFP compression mode to be used.")

    tolerance: float | None = Field(
        default=None,
        description="Fixed accuracy in terms of absolute error tolerance.",
    )

    rate: float | None = Field(
        default=None,
        description="Fixed rate in terms of number of compressed bits per value.",
    )

    precision: int | None = Field(
        default=None,
        description="Fixed precision in terms of number of uncompressed bits per value.",
    )

    write_header: bool = Field(
        default=True,
        description="Encode array shape, scalar type, and compression parameters.",
    )

    @model_validator(mode="after")
    def check_requirements(self) -> ZFP:
        """Check that the parameters are consistent with the selected mode.

        Returns:
            The validated model instance.

        Raises:
            ValueError: If REVERSIBLE mode is combined with ``tolerance``,
                ``rate`` or ``precision``, or if a fixed mode is missing the
                parameter it requires.
        """
        mode = self.mode

        # Reversible mode must be provided without any other parameter.
        tunables = (self.tolerance, self.rate, self.precision)
        if mode == ZFPMode.REVERSIBLE and any(value is not None for value in tunables):
            msg = "Other fields must be None in REVERSIBLE mode"
            raise ValueError(msg)

        # Each fixed mode needs exactly one parameter to be present.
        requirements = (
            (ZFPMode.FIXED_ACCURACY, self.tolerance, "Tolerance required for FIXED_ACCURACY mode"),
            (ZFPMode.FIXED_RATE, self.rate, "Rate required for FIXED_RATE mode"),
            (ZFPMode.FIXED_PRECISION, self.precision, "Precision required for FIXED_PRECISION mode"),
        )
        for required_mode, value, msg in requirements:
            if mode == required_mode and value is None:
                raise ValueError(msg)

        return self

    def make_instance(self):  # noqa: ANN201
        """Build the ZFP codec described by this model.

        Returns:
            A `zarr.codecs.ZFPY` instance created from the integer mode code,
            tolerance, rate and precision of this model.
        """
        # Local import; presumably keeps zarr out of the schema module's
        # import-time dependencies — TODO confirm.
        from zarr.codecs import ZFPY as _ZFPY

        # NOTE(review): `write_header` is not forwarded to the codec here.
        return _ZFPY(
            mode=self.mode.int_code,
            tolerance=self.tolerance,
            rate=self.rate,
            precision=self.precision,
        )
152+
153+
154+
class CompressorModel(CamelCaseStrictModel):
    """Model holding an optional compressor configuration."""

    # Either a Blosc or a ZFP configuration; None when nothing is configured.
    compressor: Blosc | ZFP | None = Field(
        default=None,
        description="Compression settings.",
    )

src/mdio/schema/core.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
"""This module implements the core components of the MDIO schemas."""
2+
3+
from __future__ import annotations
4+
5+
from typing import Any
6+
from typing import get_type_hints
7+
8+
from pydantic import BaseModel
9+
from pydantic import ConfigDict
10+
from pydantic.alias_generators import to_camel
11+
12+
13+
def model_fields(model: type[BaseModel]) -> dict[str, tuple[Any, Any]]:
    """Extract Pydantic BaseModel fields.

    Args:
        model: The model class whose fields will be extracted.

    Returns:
        A dictionary mapping each field name to a ``(type, field)`` tuple.
        ``type`` is the resolved type hint of the field and ``field`` is the
        entry stored under that name in ``model.model_fields`` (the field
        definition, which carries the default value rather than being the
        default value itself).

    Example:
        >>> class MyModel(BaseModel):
        ...     name: str
        ...     age: int = 0
        ...
        >>> fields = model_fields(MyModel)
        >>> fields["age"][0]
        <class 'int'>
        >>> fields["age"][1].default
        0
    """
    # Resolve annotations once; they may be strings under postponed evaluation.
    annotations = get_type_hints(model)

    return {
        field_name: (annotations[field_name], field)
        for field_name, field in model.model_fields.items()
    }
38+
39+
40+
class StrictModel(BaseModel):
    """Base model that forbids extra fields.

    The config also enables ``populate_by_name``.
    """

    model_config = ConfigDict(
        extra="forbid",
        populate_by_name=True,
    )
44+
45+
46+
class CamelCaseStrictModel(StrictModel):
    """A `StrictModel` with forbidden extras and camel case aliases."""

    # Only the alias generator is set here.
    # NOTE(review): relies on Pydantic merging this with StrictModel's config — confirm.
    model_config = ConfigDict(
        alias_generator=to_camel,
    )

src/mdio/schema/dimension.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
"""Dimension schema."""
2+
3+
from pydantic import Field
4+
5+
from mdio.schema.core import CamelCaseStrictModel
6+
7+
8+
class NamedDimension(CamelCaseStrictModel):
    """A single dimension described by its name and its size."""

    name: str = Field(
        ...,
        description="Unique identifier for the dimension.",
    )

    # Must be strictly positive (gt=0).
    size: int = Field(
        ...,
        gt=0,
        description="Total size of the dimension.",
    )

0 commit comments

Comments
 (0)