Skip to content

Commit 9d8e0c8

Browse files
authored
Merge branch 'main' into 624_prestack_schema
2 parents 8a6b86e + bdebc7e commit 9d8e0c8

File tree

13 files changed

+839
-727
lines changed

13 files changed

+839
-727
lines changed

.github/workflows/constraints.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
bump-my-version==1.2.3
1+
bump-my-version==1.2.4
22
nox==2025.5.1

docs/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
aiohttp==3.12.15
1+
aiohttp==3.13.0
22
autodoc-pydantic==2.2.0
33
furo==2025.9.25
44
linkify-it-py==2.0.3

pyproject.toml

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "multidimio"
3-
version = "1.0.5"
3+
version = "1.0.6"
44
description = "Cloud-native, scalable, and user-friendly multi dimensional energy data!"
55
authors = [{ name = "Altay Sansal", email = "[email protected]" }]
66
requires-python = ">=3.11,<3.14"
@@ -18,24 +18,24 @@ classifiers = [
1818
]
1919

2020
dependencies = [
21-
"click>=8.2.1",
21+
"click>=8.3.0",
2222
"click-params>=0.5.0",
23-
"dask>=2025.9.0",
23+
"dask>=2025.9.1",
2424
"fsspec>=2025.9.0",
2525
"pint>=0.25.0",
26-
"psutil>=7.0.0",
27-
"pydantic>=2.11.9",
26+
"psutil>=7.1.0",
27+
"pydantic>=2.12.0",
2828
"rich>=14.1.0",
29-
"segy>=0.5.2",
29+
"segy>=0.5.3",
3030
"tqdm>=4.67.1",
31-
"universal-pathlib>=0.2.6",
32-
"xarray>=2025.9.1",
31+
"universal-pathlib>=0.3.3",
32+
"xarray>=2025.10.1",
3333
"zarr>=3.1.3",
3434
]
3535

3636
[project.optional-dependencies]
3737
cloud = ["s3fs>=2025.9.0", "gcsfs>=2025.9.0", "adlfs>=2025.8.0"]
38-
distributed = ["distributed>=2025.9.0", "bokeh>=3.8.0"]
38+
distributed = ["distributed>=2025.9.1", "bokeh>=3.8.0"]
3939
lossy = ["zfpy>=1.0.1"]
4040

4141
[project.urls]
@@ -48,9 +48,9 @@ mdio = "mdio.__main__:main"
4848

4949
[dependency-groups]
5050
dev = [
51-
"ruff>=0.13.0",
52-
"coverage[toml]>=7.10.6",
53-
"mypy>=1.18.1",
51+
"ruff>=0.14.0",
52+
"coverage[toml]>=7.10.7",
53+
"mypy>=1.18.2",
5454
"pre-commit>=4.3.0",
5555
"pre-commit-hooks>=6.0.0",
5656
"pytest>=8.4.2",
@@ -61,9 +61,9 @@ dev = [
6161
]
6262

6363
docs = [
64-
"aiohttp>=3.12.15",
64+
"aiohttp>=3.13.0",
6565
"autodoc-pydantic>=2.2.0",
66-
"furo>=2025.7.19",
66+
"furo>=2025.9.25",
6767
"linkify-it-py>=2.0.3",
6868
"matplotlib>=3.10.6",
6969
"myst-nb>=1.3.0",
@@ -181,7 +181,7 @@ init_typed = true
181181
warn_required_dynamic_aliases = true
182182

183183
[tool.bumpversion]
184-
current_version = "1.0.5"
184+
current_version = "1.0.6"
185185
allow_dirty = true
186186
commit = false
187187
tag = false

src/mdio/converters/segy.py

Lines changed: 72 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,13 @@
44

55
import base64
66
import logging
7+
import multiprocessing as mp
78
import os
8-
from dataclasses import dataclass
9+
from concurrent.futures import ProcessPoolExecutor
910
from typing import TYPE_CHECKING
1011

1112
import numpy as np
1213
import zarr
13-
from segy import SegyFile
1414
from segy.config import SegyFileSettings
1515
from segy.config import SegyHeaderOverrides
1616
from segy.standards.codes import MeasurementSystem as SegyMeasurementSystem
@@ -37,9 +37,10 @@
3737
from mdio.core.utils_write import MAX_SIZE_LIVE_MASK
3838
from mdio.core.utils_write import get_constrained_chunksize
3939
from mdio.segy import blocked_io
40+
from mdio.segy._workers import SegyFileInfo
41+
from mdio.segy._workers import info_worker
4042
from mdio.segy.scalar import SCALE_COORDINATE_KEYS
4143
from mdio.segy.scalar import _apply_coordinate_scalar
42-
from mdio.segy.scalar import _get_coordinate_scalar
4344
from mdio.segy.utilities import get_grid_plan
4445

4546
if TYPE_CHECKING:
@@ -54,6 +55,7 @@
5455
from mdio.builder.schemas import Dataset
5556
from mdio.builder.templates.abstract_dataset_template import AbstractDatasetTemplate
5657
from mdio.core.dimension import Dimension
58+
from mdio.segy._workers import SegyFileArguments
5759

5860
logger = logging.getLogger(__name__)
5961

@@ -135,37 +137,9 @@ def grid_density_qc(grid: Grid, num_traces: int) -> None:
135137
raise GridTraceSparsityError(grid.shape, num_traces, msg)
136138

137139

138-
@dataclass
139-
class SegyFileHeaderDump:
140-
"""Segy metadata information."""
141-
142-
text_header: str
143-
binary_header_dict: dict
144-
raw_binary_headers: bytes
145-
146-
147-
def _get_segy_file_header_dump(segy_file: SegyFile) -> SegyFileHeaderDump:
148-
"""Reads information from a SEG-Y file."""
149-
text_header = segy_file.text_header
150-
151-
raw_binary_headers: bytes = segy_file.fs.read_block(
152-
fn=segy_file.url,
153-
offset=segy_file.spec.binary_header.offset,
154-
length=segy_file.spec.binary_header.itemsize,
155-
)
156-
157-
# We read here twice, but it's ok for now. Only 400-bytes.
158-
binary_header_dict = segy_file.binary_header.to_dict()
159-
160-
return SegyFileHeaderDump(
161-
text_header=text_header,
162-
binary_header_dict=binary_header_dict,
163-
raw_binary_headers=raw_binary_headers,
164-
)
165-
166-
167140
def _scan_for_headers(
168-
segy_file: SegyFile,
141+
segy_file_kwargs: SegyFileArguments,
142+
segy_file_info: SegyFileInfo,
169143
template: AbstractDatasetTemplate,
170144
grid_overrides: dict[str, Any] | None = None,
171145
) -> tuple[list[Dimension], SegyHeaderArray]:
@@ -176,7 +150,8 @@ def _scan_for_headers(
176150
"""
177151
full_chunk_size = template.full_chunk_size
178152
segy_dimensions, chunk_size, segy_headers = get_grid_plan(
179-
segy_file=segy_file,
153+
segy_file_kwargs=segy_file_kwargs,
154+
segy_file_info=segy_file_info,
180155
return_headers=True,
181156
template=template,
182157
chunksize=full_chunk_size,
@@ -192,12 +167,29 @@ def _scan_for_headers(
192167
return segy_dimensions, segy_headers
193168

194169

195-
def _build_and_check_grid(segy_dimensions: list[Dimension], num_traces: int, segy_headers: SegyHeaderArray) -> Grid:
170+
def _read_segy_file_info(segy_file_kwargs: SegyFileArguments) -> SegyFileInfo:
    """Read SEG-Y file information in a separate process.

    This is an ugly workaround for Zarr issue 3487: 'Explicitly using fsspec and zarr FsspecStore
    causes RuntimeError "Task attached to a different loop"'.

    Args:
        segy_file_kwargs: Arguments handed to ``info_worker`` in the child process
            (presumably used there to open the SEG-Y file; see ``mdio.segy._workers``).

    Returns:
        The ``SegyFileInfo`` produced by ``info_worker``.
    """
    # TODO (Dmitriy Repin): when Zarr issue 3487 is resolved, we can remove this workaround
    # https://github.com/zarr-developers/zarr-python/issues/3487
    # A single worker started with the "spawn" method runs the read; ``future.result()``
    # blocks until it finishes and re-raises any exception raised inside the worker.
    with ProcessPoolExecutor(max_workers=1, mp_context=mp.get_context("spawn")) as executor:
        future = executor.submit(info_worker, segy_file_kwargs)
        return future.result()
181+
182+
183+
def _build_and_check_grid(
184+
segy_dimensions: list[Dimension],
185+
segy_file_info: SegyFileInfo,
186+
segy_headers: SegyHeaderArray,
187+
) -> Grid:
196188
"""Build and check the grid from the SEG-Y headers and dimensions.
197189
198190
Args:
199191
segy_dimensions: List of all SEG-Y dimensions to build grid from.
200-
num_traces: Number of traces in the SEG-Y file.
192+
segy_file_info: SegyFileInfo instance containing the SEG-Y file information.
201193
segy_headers: Headers read in from SEG-Y file for building the trace map.
202194
203195
Returns:
@@ -207,6 +199,7 @@ def _build_and_check_grid(segy_dimensions: list[Dimension], num_traces: int, seg
207199
GridTraceCountError: If number of traces in SEG-Y file does not match the parsed grid
208200
"""
209201
grid = Grid(dims=segy_dimensions)
202+
num_traces = segy_file_info.num_traces
210203
grid_density_qc(grid, num_traces)
211204
grid.build_map(segy_headers)
212205
# Check grid validity by comparing trace numbers
@@ -303,9 +296,9 @@ def populate_non_dim_coordinates(
303296
return dataset, drop_vars_delayed
304297

305298

306-
def _get_horizontal_coordinate_unit(segy_info: SegyFileHeaderDump) -> LengthUnitModel | None:
299+
def _get_horizontal_coordinate_unit(segy_file_info: SegyFileInfo) -> LengthUnitModel | None:
307300
"""Get the coordinate unit from the SEG-Y headers."""
308-
measurement_system_code = int(segy_info.binary_header_dict[MEASUREMENT_SYSTEM_KEY])
301+
measurement_system_code = int(segy_file_info.binary_header_dict[MEASUREMENT_SYSTEM_KEY])
309302

310303
if measurement_system_code not in (1, 2):
311304
logger.warning(
@@ -359,19 +352,19 @@ def _populate_coordinates(
359352
return dataset, drop_vars_delayed
360353

361354

362-
def _add_segy_file_headers(xr_dataset: xr_Dataset, segy_file_header_dump: SegyFileHeaderDump) -> xr_Dataset:
355+
def _add_segy_file_headers(xr_dataset: xr_Dataset, segy_file_info: SegyFileInfo) -> xr_Dataset:
363356
save_file_header = os.getenv("MDIO__IMPORT__SAVE_SEGY_FILE_HEADER", "") in ("1", "true", "yes", "on")
364357
if not save_file_header:
365358
return xr_dataset
366359

367360
expected_rows = 40
368361
expected_cols = 80
369362

370-
text_header_rows = segy_file_header_dump.text_header.splitlines()
363+
text_header_rows = segy_file_info.text_header.splitlines()
371364
text_header_cols_bad = [len(row) != expected_cols for row in text_header_rows]
372365

373366
if len(text_header_rows) != expected_rows:
374-
err = f"Invalid text header count: expected {expected_rows}, got {len(segy_file_header_dump.text_header)}"
367+
err = f"Invalid text header count: expected {expected_rows}, got {len(segy_file_info.text_header)}"
375368
raise ValueError(err)
376369

377370
if any(text_header_cols_bad):
@@ -381,12 +374,12 @@ def _add_segy_file_headers(xr_dataset: xr_Dataset, segy_file_header_dump: SegyFi
381374
xr_dataset["segy_file_header"] = ((), "")
382375
xr_dataset["segy_file_header"].attrs.update(
383376
{
384-
"textHeader": segy_file_header_dump.text_header,
385-
"binaryHeader": segy_file_header_dump.binary_header_dict,
377+
"textHeader": segy_file_info.text_header,
378+
"binaryHeader": segy_file_info.binary_header_dict,
386379
}
387380
)
388381
if os.getenv("MDIO__IMPORT__RAW_HEADERS") in ("1", "true", "yes", "on"):
389-
raw_binary_base64 = base64.b64encode(segy_file_header_dump.raw_binary_headers).decode("ascii")
382+
raw_binary_base64 = base64.b64encode(segy_file_info.raw_binary_headers).decode("ascii")
390383
xr_dataset["segy_file_header"].attrs.update({"rawBinaryHeader": raw_binary_base64})
391384

392385
return xr_dataset
@@ -482,6 +475,21 @@ def determine_target_size(var_type: str) -> int:
482475
ds.variables[index].metadata.chunk_grid = chunk_grid
483476

484477

478+
def _validate_spec_in_template(segy_spec: SegySpec, mdio_template: AbstractDatasetTemplate) -> None:
    """Validate that the SegySpec has all header fields required by the MDIO template.

    The required set is every template dimension name except the last one, every template
    coordinate name, and ``coordinate_scalar``.

    Args:
        segy_spec: SEG-Y spec whose trace header field names are checked.
        mdio_template: Template supplying the dimension and coordinate names.

    Raises:
        ValueError: If any required field is missing from the spec's trace header fields.
    """
    header_fields = {field.name for field in segy_spec.trace.header.fields}

    # NOTE(review): the last dimension name is skipped, presumably because it is the
    # sample (vertical) axis and has no trace header field; confirm against the templates.
    required_fields = set(mdio_template._dim_names[:-1]) | set(mdio_template._coord_names) | {"coordinate_scalar"}
    missing_fields = required_fields - header_fields

    if missing_fields:
        # Sorted so the error message is deterministic regardless of set ordering.
        err = (
            f"Required fields {sorted(missing_fields)} for template {mdio_template.name} "
            f"not found in the provided segy_spec"
        )
        raise ValueError(err)
491+
492+
485493
def segy_to_mdio( # noqa PLR0913
486494
segy_spec: SegySpec,
487495
mdio_template: AbstractDatasetTemplate,
@@ -507,6 +515,8 @@ def segy_to_mdio( # noqa PLR0913
507515
Raises:
508516
FileExistsError: If the output location already exists and overwrite is False.
509517
"""
518+
_validate_spec_in_template(segy_spec, mdio_template)
519+
510520
input_path = _normalize_path(input_path)
511521
output_path = _normalize_path(output_path)
512522

@@ -515,17 +525,21 @@ def segy_to_mdio( # noqa PLR0913
515525
raise FileExistsError(err)
516526

517527
segy_settings = SegyFileSettings(storage_options=input_path.storage_options)
518-
segy_file = SegyFile(
519-
url=input_path.as_posix(),
520-
spec=segy_spec,
521-
settings=segy_settings,
522-
header_overrides=segy_header_overrides,
528+
segy_file_kwargs: SegyFileArguments = {
529+
"url": input_path.as_posix(),
530+
"spec": segy_spec,
531+
"settings": segy_settings,
532+
"header_overrides": segy_header_overrides,
533+
}
534+
segy_file_info = _read_segy_file_info(segy_file_kwargs)
535+
536+
segy_dimensions, segy_headers = _scan_for_headers(
537+
segy_file_kwargs,
538+
segy_file_info,
539+
template=mdio_template,
540+
grid_overrides=grid_overrides,
523541
)
524-
segy_info: SegyFileHeaderDump = _get_segy_file_header_dump(segy_file)
525-
526-
segy_dimensions, segy_headers = _scan_for_headers(segy_file, mdio_template, grid_overrides)
527-
528-
grid = _build_and_check_grid(segy_dimensions, segy_file.num_traces, segy_headers)
542+
grid = _build_and_check_grid(segy_dimensions, segy_file_info, segy_headers)
529543

530544
_, non_dim_coords = _get_coordinates(grid, segy_headers, mdio_template)
531545
header_dtype = to_structured_type(segy_spec.trace.header.dtype)
@@ -537,7 +551,7 @@ def segy_to_mdio( # noqa PLR0913
537551
logger.warning("MDIO__IMPORT__RAW_HEADERS is experimental and expected to change or be removed.")
538552
mdio_template = _add_raw_headers_to_template(mdio_template)
539553

540-
horizontal_unit = _get_horizontal_coordinate_unit(segy_info)
554+
horizontal_unit = _get_horizontal_coordinate_unit(segy_file_info)
541555
mdio_ds: Dataset = mdio_template.build_dataset(
542556
name=mdio_template.name,
543557
sizes=grid.shape,
@@ -554,15 +568,14 @@ def segy_to_mdio( # noqa PLR0913
554568

555569
xr_dataset: xr_Dataset = to_xarray_dataset(mdio_ds=mdio_ds)
556570

557-
coordinate_scalar = _get_coordinate_scalar(segy_file)
558571
xr_dataset, drop_vars_delayed = _populate_coordinates(
559572
dataset=xr_dataset,
560573
grid=grid,
561574
coords=non_dim_coords,
562-
horizontal_coordinate_scalar=coordinate_scalar,
575+
horizontal_coordinate_scalar=segy_file_info.coordinate_scalar,
563576
)
564577

565-
xr_dataset = _add_segy_file_headers(xr_dataset, segy_info)
578+
xr_dataset = _add_segy_file_headers(xr_dataset, segy_file_info)
566579

567580
xr_dataset.trace_mask.data[:] = grid.live_mask
568581
# IMPORTANT: Do not drop the "trace_mask" here, as it will be used later in
@@ -583,7 +596,7 @@ def segy_to_mdio( # noqa PLR0913
583596
# This is a memory-expensive and time-consuming read-write operation
584597
# performed in chunks to save memory
585598
blocked_io.to_zarr(
586-
segy_file=segy_file,
599+
segy_file_kwargs=segy_file_kwargs,
587600
output_path=output_path,
588601
grid_map=grid.map,
589602
dataset=xr_dataset,

src/mdio/core/indexing.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -74,11 +74,13 @@ def __next__(self) -> dict[str, slice]:
7474
# We build slices here. It is dimension agnostic
7575
current_start = next(self._ranges)
7676

77-
# TODO (Dmitriy Repin): Enhance ChunkIterator to make the last slice, if needed, smaller
78-
# https://github.com/TGSAI/mdio-python/issues/586
7977
start_indices = tuple(dim * chunk for dim, chunk in zip(current_start, self.len_chunks, strict=True))
8078

81-
stop_indices = tuple((dim + 1) * chunk for dim, chunk in zip(current_start, self.len_chunks, strict=True))
79+
# Calculate stop indices, making the last slice fit the data exactly
80+
stop_indices = tuple(
81+
min((dim + 1) * chunk, self.arr_shape[i])
82+
for i, (dim, chunk) in enumerate(zip(current_start, self.len_chunks, strict=True))
83+
)
8284

8385
slices = tuple(slice(start, stop) for start, stop in zip(start_indices, stop_indices, strict=True))
8486

0 commit comments

Comments
 (0)