Skip to content

Commit 583175d

Browse files
committed
Merge remote-tracking branch 'origin/zarr-v3' into zarrv3_with_uv
2 parents e13fa6e + b8213e0 commit 583175d

File tree

15 files changed

+130
-164
lines changed

15 files changed

+130
-164
lines changed

.github/workflows/tests.yml

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,19 +14,17 @@ jobs:
1414
include:
1515
- { python: "3.13", os: "ubuntu-latest", session: "pre-commit" }
1616
- { python: "3.13", os: "ubuntu-latest", session: "safety" }
17+
# - { python: "3.13", os: "ubuntu-latest", session: "mypy" }
1718
# - { python: "3.12", os: "ubuntu-latest", session: "mypy" }
1819
# - { python: "3.11", os: "ubuntu-latest", session: "mypy" }
19-
# - { python: "3.10", os: "ubuntu-latest", session: "mypy" }
2020
- { python: "3.13", os: "ubuntu-latest", session: "tests" }
2121
- { python: "3.12", os: "ubuntu-latest", session: "tests" }
2222
- { python: "3.11", os: "ubuntu-latest", session: "tests" }
23-
- { python: "3.10", os: "ubuntu-latest", session: "tests" }
2423
- { python: "3.13", os: "windows-latest", session: "tests" }
2524
- { python: "3.13", os: "macos-latest", session: "tests" }
25+
# - { python: "3.13", os: "ubuntu-latest", session: "typeguard" }
2626
# - { python: "3.12", os: "ubuntu-latest", session: "typeguard" }
2727
# - { python: "3.11", os: "ubuntu-latest", session: "typeguard" }
28-
# - { python: "3.10", os: "ubuntu-latest", session: "typeguard" }
29-
# - { python: "3.10", os: "ubuntu-latest", session: "xdoctest" }
3028
- { python: "3.13", os: "ubuntu-latest", session: "docs-build" }
3129

3230
env:

noxfile.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414

1515

1616
package = "mdio"
17-
python_versions = ["3.13", "3.12", "3.11", "3.10"]
17+
python_versions = ["3.13", "3.12", "3.11"]
1818
nox.needs_version = ">= 2024.10.9"
1919
nox.options.sessions = (
2020
"pre-commit",

pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ name = "multidimio"
33
version = "0.8.5"
44
description = "Cloud-native, scalable, and user-friendly multi dimensional energy data!"
55
authors = [{ name = "Altay Sansal", email = "[email protected]" }]
6-
requires-python = ">=3.10,<3.14"
6+
requires-python = ">=3.11,<3.14"
77
readme = "README.md"
88
license = { "text" = "Apache-2.0" }
99
keywords = [
@@ -25,7 +25,7 @@ classifiers = [
2525
dependencies = [
2626
"click (>=8.1.7,<9.0.0)",
2727
"click-params (>=0.5.0,<0.6.0)",
28-
"zarr (>=2.18.2,<3.0.0)",
28+
"zarr (>=3.0.2,<4.0.0)",
2929
"dask (>=2024.12.0)",
3030
"tqdm (>=4.67.0,<5.0.0)",
3131
"psutil (>=6.1.0,<7.0.0)",

src/mdio/api/accessor.py

Lines changed: 18 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -186,32 +186,33 @@ def _validate_store(self, storage_options):
186186
if storage_options is None:
187187
storage_options = {}
188188

189-
self.store = process_url(
189+
self.url = process_url(
190190
url=self.url,
191-
mode=self.mode,
192-
storage_options=storage_options,
193-
memory_cache_size=self._memory_cache_size,
194191
disk_cache=self._disk_cache,
195192
)
196193

197-
def _connect(self):
198-
"""Open the zarr root."""
199194
try:
200-
if self.mode in {"r", "r+"}:
201-
self.root = zarr.open_consolidated(store=self.store, mode=self.mode)
202-
elif self.mode == "w":
203-
self.root = zarr.open(store=self.store, mode="r+")
204-
else:
205-
msg = f"Invalid mode: {self.mode}"
206-
raise ValueError(msg)
207-
except KeyError as e:
195+
self.store = zarr.open(
196+
self.url, mode=self.mode, storage_options=storage_options
197+
).store
198+
except FileNotFoundError as e:
208199
msg = (
209-
f"MDIO file not found or corrupt at {self.store.path}. "
200+
f"MDIO file not found or corrupt at {self.url}. "
210201
"Please check the URL or ensure it is not a deprecated "
211202
"version of MDIO file."
212203
)
213204
raise MDIONotFoundError(msg) from e
214205

206+
def _connect(self):
207+
"""Open the zarr root."""
208+
if self.mode in {"r", "r+"}:
209+
self.root = zarr.open_consolidated(store=self.store, mode=self.mode)
210+
elif self.mode == "w":
211+
self.root = zarr.open(store=self.store, mode="r+")
212+
else:
213+
msg = f"Invalid mode: {self.mode}"
214+
raise ValueError(msg)
215+
215216
def _deserialize_grid(self):
216217
"""Deserialize grid from Zarr metadata."""
217218
self.grid = Grid.from_zarr(self.root)
@@ -375,12 +376,12 @@ def stats(self, value: dict) -> None:
375376
@property
376377
def _metadata_group(self) -> zarr.Group:
377378
"""Get metadata zarr.group handle."""
378-
return self.root.metadata
379+
return self.root["metadata"]
379380

380381
@property
381382
def _data_group(self) -> zarr.Group:
382383
"""Get data zarr.Group handle."""
383-
return self.root.data
384+
return self.root["data"]
384385

385386
def __getitem__(self, item: int | tuple) -> npt.ArrayLike | da.Array | tuple:
386387
"""Data getter."""

src/mdio/api/convenience.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,8 @@
55
from typing import TYPE_CHECKING
66

77
import zarr
8+
from numcodecs import Blosc
89
from tqdm.auto import tqdm
9-
from zarr import Blosc
1010

1111
from mdio.api.io_utils import process_url
1212
from mdio.core.indexing import ChunkIterator
@@ -51,20 +51,22 @@ def copy_mdio( # noqa: PLR0913
5151
Default is None (will assume anonymous).
5252
overwrite: Overwrite destination or not.
5353
54+
Raises:
55+
NotImplementedError: because Zarr v3 doesn't support copy.
5456
"""
5557
if storage_options is None:
5658
storage_options = {}
5759

5860
dest_store = process_url(
5961
url=dest_path_or_buffer,
60-
mode="w",
61-
storage_options=storage_options,
62-
memory_cache_size=0,
6362
disk_cache=False,
6463
)
6564

6665
if_exists = "replace" if overwrite is True else "raise"
6766

67+
# TODO(Altay): Update this function when Zarr v3 supports copy.
68+
raise NotImplementedError("Zarr version 3.0.0+ does not support copy yet.")
69+
6870
zarr.copy_store(
6971
source=source.store,
7072
dest=dest_store,
@@ -242,6 +244,7 @@ def rechunk_batch(
242244
write_rechunked_values(source, suffix_list, *plan)
243245

244246

247+
# TODO(Altay): This needs to be validated
245248
def rechunk(
246249
source: MDIOAccessor,
247250
chunks: tuple[int, ...],

src/mdio/api/io_utils.py

Lines changed: 7 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -2,40 +2,27 @@
22

33
from __future__ import annotations
44

5-
from typing import Any
6-
75
import dask.array as da
86
import zarr
9-
from zarr.storage import FSStore
107

118

129
def process_url(
1310
url: str,
14-
mode: str,
15-
storage_options: dict[str, Any],
16-
memory_cache_size: int,
1711
disk_cache: bool,
18-
) -> FSStore:
12+
) -> str:
1913
"""Check read/write access to FSStore target and return FSStore with double caching.
2014
21-
It can use an in-memory Least Recently Used (LRU) cache implementation from
22-
Zarr, and optionally, a file cache (`simplecache` protocol from FSSpec) that
23-
is useful for remote stores.
24-
25-
File cache is only valid for remote stores. The LRU caching works
26-
on both remote and local.
15+
It can optionally use a file cache (`simplecache` protocol from fsspec) that
16+
is useful for remote stores. File cache is only useful for remote stores.
2717
2818
The `storage_options` argument represents a set of parameters to be passed
29-
to the FSSpec backend. Note that the format of `storage_options` is
19+
to the fsspec backend. Note that the format of `storage_options` is
3020
different if `disk_cache` is enabled or disabled, since `disk_cache`
3121
internally uses the simplecache protocol.
3222
3323
Args:
34-
url: FSSpec compliant url
35-
mode: Toggle for overwriting existing store
36-
storage_options: Storage options for the storage backend.
37-
memory_cache_size: Maximum in memory LRU cache size in bytes.
38-
disk_cache: This enables FSSpec's `simplecache` if True.
24+
url: fsspec compliant url
25+
disk_cache: This enables fsspec's `simplecache` if True.
3926
4027
Returns:
4128
Store with augmentations like cache, write verification etc.
@@ -52,7 +39,6 @@ def process_url(
5239
... url="s3://bucket/key",
5340
... mode="r",
5441
... storage_options={"key": "my_key", "secret": "my_secret"},
55-
... memory_cache_size=0,
5642
... disk_cache=False,
5743
... )
5844
@@ -64,7 +50,6 @@ def process_url(
6450
... url="s3://bucket/key",
6551
... mode="r",
6652
... storage_options={"s3": {"key": "my_key", "secret": "my_secret"}},
67-
... memory_cache_size=0,
6853
... disk_cache=True,
6954
... )
7055
@@ -77,35 +62,13 @@ def process_url(
7762
... "s3": {"key": "my_key", "secret": "my_secret"},
7863
... "simplecache": {"cache_storage": "custom/local/cache/path"},
7964
... },
80-
... memory_cache_size=0,
8165
... disk_cache=True,
8266
... )
8367
"""
8468
if disk_cache is True:
8569
url = "::".join(["simplecache", url])
8670

87-
# Strip whitespaces and slashes from end of string
88-
url = url.rstrip("/ ")
89-
90-
# Flag for checking write access
91-
check = True if mode == "w" else False
92-
93-
# TODO: Turning off write checking now because zarr has a bug.
94-
# Get rid of this once bug is fixed.
95-
check = False
96-
97-
store = FSStore(
98-
url=url,
99-
check=check,
100-
create=check,
101-
mode=mode,
102-
**storage_options,
103-
)
104-
105-
if memory_cache_size != 0:
106-
store = zarr.storage.LRUStoreCache(store=store, max_size=memory_cache_size)
107-
108-
return store
71+
return url
10972

11073

11174
def open_zarr_array(group_handle: zarr.Group, name: str) -> zarr.Array:

src/mdio/commands/info.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ def parse_grid(grid: Grid) -> dict[str, dict[str, int | str]]:
9595
def parse_access_patterns(reader: MDIOReader) -> dict[str, Any]:
9696
"""Extract access patterns and their info."""
9797
access_pattern_dict = {}
98-
for name, array in reader._data_group.items():
98+
for name, array in reader._data_group.arrays():
9999
pattern = name.replace("chunked_", "")
100100
chunks = str(array.chunks)
101101
format_ = str(array.dtype)

src/mdio/converters/segy.py

Lines changed: 22 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -343,6 +343,8 @@ def segy_to_mdio( # noqa: C901
343343
... grid_overrides={"HasDuplicates": True},
344344
... )
345345
"""
346+
zarr.config.set({"default_zarr_format": 2, "write_empty_chunks": False})
347+
346348
if index_names is None:
347349
index_names = [f"dim_{i}" for i in range(len(index_bytes))]
348350

@@ -364,13 +366,8 @@ def segy_to_mdio( # noqa: C901
364366
if storage_options_output is None:
365367
storage_options_output = {}
366368

367-
store = process_url(
368-
url=mdio_path_or_buffer,
369-
mode="w",
370-
storage_options=storage_options_output,
371-
memory_cache_size=0, # Making sure disk caching is disabled,
372-
disk_cache=False, # Making sure disk caching is disabled
373-
)
369+
url = process_url(url=mdio_path_or_buffer, disk_cache=False)
370+
root_group = zarr.open_group(url, mode="w", storage_options=storage_options_output)
374371

375372
# Open SEG-Y with MDIO's SegySpec. Endianness will be inferred.
376373
mdio_spec = mdio_segy_spec()
@@ -406,42 +403,43 @@ def segy_to_mdio( # noqa: C901
406403
logger.warning(f"Ingestion grid shape: {grid.shape}.")
407404
raise GridTraceCountError(np.sum(grid.live_mask), num_traces)
408405

409-
zarr_root = create_zarr_hierarchy(
410-
store=store,
406+
root_group = create_zarr_hierarchy(
407+
root_group=root_group,
411408
overwrite=overwrite,
412409
)
413410

414411
# Get UTC time, then add local timezone information offset.
415412
iso_datetime = datetime.now(timezone.utc).isoformat()
416413

417-
write_attribute(name="created", zarr_group=zarr_root, attribute=iso_datetime)
418-
write_attribute(name="api_version", zarr_group=zarr_root, attribute=API_VERSION)
414+
write_attribute(name="created", zarr_group=root_group, attribute=iso_datetime)
415+
write_attribute(name="api_version", zarr_group=root_group, attribute=API_VERSION)
419416

420417
dimensions_dict = [dim.to_dict() for dim in dimensions]
421-
write_attribute(name="dimension", zarr_group=zarr_root, attribute=dimensions_dict)
418+
write_attribute(name="dimension", zarr_group=root_group, attribute=dimensions_dict)
422419

423420
# Write trace count
424421
trace_count = np.count_nonzero(grid.live_mask)
425-
write_attribute(name="trace_count", zarr_group=zarr_root, attribute=trace_count)
422+
write_attribute(name="trace_count", zarr_group=root_group, attribute=trace_count)
426423

427424
# Note, live mask is not chunked since it's bool and small.
428-
zarr_root["metadata"].create_dataset(
429-
data=grid.live_mask,
425+
live_mask_arr = root_group["metadata"].create_array(
430426
name="live_mask",
431427
shape=grid.shape[:-1],
432-
chunks=-1,
433-
dimension_separator="/",
428+
chunks=grid.shape[:-1],
429+
dtype="bool",
430+
chunk_key_encoding={"name": "v2", "separator": "/"},
434431
)
432+
live_mask_arr[...] = grid.live_mask[...]
435433

436434
write_attribute(
437435
name="text_header",
438-
zarr_group=zarr_root["metadata"],
436+
zarr_group=root_group["metadata"],
439437
attribute=text_header.split("\n"),
440438
)
441439

442440
write_attribute(
443441
name="binary_header",
444-
zarr_group=zarr_root["metadata"],
442+
zarr_group=root_group["metadata"],
445443
attribute=binary_header.to_dict(),
446444
)
447445

@@ -470,8 +468,8 @@ def segy_to_mdio( # noqa: C901
470468
stats = blocked_io.to_zarr(
471469
segy_file=segy,
472470
grid=grid,
473-
data_root=zarr_root["data"],
474-
metadata_root=zarr_root["metadata"],
471+
data_root=root_group["data"],
472+
metadata_root=root_group["metadata"],
475473
name="_".join(["chunked", suffix]),
476474
dtype="float32",
477475
chunks=chunksize,
@@ -480,17 +478,7 @@ def segy_to_mdio( # noqa: C901
480478
)
481479

482480
for key, value in stats.items():
483-
write_attribute(name=key, zarr_group=zarr_root, attribute=value)
484-
485-
# Non-cached store for consolidating metadata.
486-
# If caching is enabled the metadata may fall out of cache hence
487-
# creating an incomplete `.zmetadata` file.
488-
store_nocache = process_url(
489-
url=mdio_path_or_buffer,
490-
mode="r+",
491-
storage_options=storage_options_output,
492-
memory_cache_size=0, # Making sure disk caching is disabled,
493-
disk_cache=False, # Making sure disk caching is disabled
494-
)
481+
write_attribute(name=key, zarr_group=root_group, attribute=value)
495482

496-
zarr.consolidate_metadata(store_nocache)
483+
# Finalize Zarr for fast open
484+
zarr.consolidate_metadata(root_group.store)

0 commit comments

Comments
 (0)