Commit 817e0f9

Removed raw byte inserts (#10)
* Update Xarray API access (TGSAI#688)
* Reimplement disaster recovery logic
* Ensure getting true raw bytes for DR array
* Linting
* Add v2 issue check
* Fix pre-commit
* Profiled disaster recovery array (#8)
  - Avoids duplicate read regression issue
  - Implements isolated and testable logic
* Fix unclosed parenthesis
* Linting
* Test DR compatibility with all tested schemas
* Fix missing test fixture error
* Suppress unused linting error
* Attempt to use view
* Add hex-dump and MDIO output reproducer
* Fixes
* Cleanup
* Provide clean disaster recovery interface
* Begin work on tests
* Fix flattening issue
* Push for debugging
* Numpy updates
* Testing
* Working end-to-end examples
* Cleanup
* Bandaid fix
* Linting pass 1
* Fix logic issue
* Use wrapper class
* Pre-commit
* Remove external debugging code
* Remove debug code
* Remove errant numpy addition to pyproject toml
* Fix uv lock to mainline
* Pre-commit
* Removed raw byte inserts: removed the insertions of raw bytes into the raw bytes Variable. This issue will be addressed in tgsai/segy release >0.5.1
* Use new segy API calls
* Updates to get working
* Use released version
* Linting
1 parent e5305b8 commit 817e0f9

File tree

6 files changed: +63 -63 lines changed


pyproject.toml

Lines changed: 2 additions & 2 deletions
@@ -26,10 +26,10 @@ dependencies = [
     "psutil>=7.0.0",
     "pydantic>=2.11.9",
     "rich>=14.1.0",
-    "segy>=0.5.0",
+    "segy>=0.5.1.post1",
     "tqdm>=4.67.1",
     "universal-pathlib>=0.2.6",
-    "xarray>=2025.9.0",
+    "xarray>=2025.9.1",
     "zarr>=3.1.3",
 ]

src/mdio/api/io.py

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@
 from upath import UPath
 from xarray import Dataset as xr_Dataset
 from xarray import open_zarr as xr_open_zarr
-from xarray.backends.api import to_zarr as xr_to_zarr
+from xarray.backends.writers import to_zarr as xr_to_zarr
 
 from mdio.constants import ZarrFormat
 from mdio.core.zarr_io import zarr_warnings_suppress_unstable_structs_v3
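The relocated import reflects xarray moving to_zarr out of xarray.backends.api. If code had to run against both older and newer xarray releases, a guarded import along these lines could bridge the move (a sketch, not part of this commit; the exact version boundary is assumed from the pyproject bump above):

try:
    # New location, available in the xarray release pinned above (>=2025.9.1).
    from xarray.backends.writers import to_zarr as xr_to_zarr
except ImportError:
    # Fall back to the pre-move location on older xarray releases.
    from xarray.backends.api import to_zarr as xr_to_zarr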

src/mdio/segy/_disaster_recovery_wrapper.py

Lines changed: 11 additions & 11 deletions
@@ -2,10 +2,8 @@
 
 from __future__ import annotations
 
-from copy import deepcopy
 from typing import TYPE_CHECKING
 
-import numpy as np
 
 if TYPE_CHECKING:
     from numpy.typing import NDArray
@@ -16,19 +14,21 @@ class SegyFileTraceDataWrapper:
     def __init__(self, segy_file: SegyFile, indices: int | list[int] | NDArray | slice):
         self.segy_file = segy_file
         self.indices = indices
-        self._header_pipeline = deepcopy(segy_file.accessors.header_decode_pipeline)
-        segy_file.accessors.header_decode_pipeline.transforms = []
-        self.traces = segy_file.trace[indices]
 
-    @property
-    def header(self) -> NDArray:
-        # The copy is necessary to avoid applying the pipeline to the original header.
-        return self._header_pipeline.apply(self.traces.header.copy())
+        self.idx = self.segy_file.trace.normalize_and_validate_query(self.indices)
+        self.traces = self.segy_file.trace.fetch(self.idx, raw=True)
+
+        self.raw_view = self.traces.view(self.segy_file.spec.trace.dtype)
+        self.decoded_traces = self.segy_file.accessors.trace_decode_pipeline.apply(self.raw_view.copy())
 
     @property
     def raw_header(self) -> NDArray:
-        return np.ascontiguousarray(self.traces.header.copy()).view("|V240")
+        return self.raw_view.header.view("|V240")
+
+    @property
+    def header(self) -> NDArray:
+        return self.decoded_traces.header
 
     @property
     def sample(self) -> NDArray:
-        return self.traces.sample
+        return self.decoded_traces.sample
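The wrapper now fetches raw trace bytes once and runs the trace decode pipeline on a copy, so the raw and decoded views come from a single read. A minimal usage sketch, assuming an existing SEG-Y file on disk (the file name below is hypothetical):

from segy import SegyFile

from mdio.segy._disaster_recovery_wrapper import SegyFileTraceDataWrapper

segy_file = SegyFile("shots.segy")  # hypothetical input file
wrapper = SegyFileTraceDataWrapper(segy_file, indices=0)

raw = wrapper.raw_header   # undecoded 240-byte header records, dtype "|V240"
decoded = wrapper.header   # headers after the trace decode pipeline
samples = wrapper.sample   # decoded trace samples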

src/mdio/segy/creation.py

Lines changed: 38 additions & 2 deletions
@@ -28,6 +28,38 @@
 logger = logging.getLogger(__name__)
 
 
+def _filter_raw_unspecified_fields(headers: NDArray) -> NDArray:
+    """Filter out __MDIO_RAW_UNSPECIFIED_Field_* fields from headers array.
+
+    These fields are added during SEGY import to preserve raw header bytes,
+    but they cause dtype mismatches during export. This function removes them.
+
+    Args:
+        headers: Header array that may contain raw unspecified fields.
+
+    Returns:
+        Header array with raw unspecified fields removed.
+    """
+    if headers.dtype.names is None:
+        return headers
+
+    # Find field names that don't start with __MDIO_RAW_UNSPECIFIED_
+    field_names = [name for name in headers.dtype.names if not name.startswith("__MDIO_RAW_UNSPECIFIED_")]
+
+    if len(field_names) == len(headers.dtype.names):
+        # No raw unspecified fields found, return as-is
+        return headers
+
+    # Create new structured array with only the non-raw fields
+    new_dtype = [(name, headers.dtype.fields[name][0]) for name in field_names]
+    filtered_headers = np.empty(headers.shape, dtype=new_dtype)
+
+    for name in field_names:
+        filtered_headers[name] = headers[name]
+
+    return filtered_headers
+
+
 def make_segy_factory(spec: SegySpec, binary_header: dict[str, int]) -> SegyFactory:
     """Generate SEG-Y factory from MDIO metadata."""
     sample_interval = binary_header["sample_interval"]
@@ -167,7 +199,9 @@ def serialize_to_segy_stack(  # noqa: PLR0913
         samples = samples[live_mask]
         headers = headers[live_mask]
 
-        buffer = segy_factory.create_traces(headers, samples)
+        # Filter out raw unspecified fields that cause dtype mismatches
+        filtered_headers = _filter_raw_unspecified_fields(headers)
+        buffer = segy_factory.create_traces(filtered_headers, samples)
 
         global_index = block_start[0]
         record_id_str = str(global_index)
@@ -199,7 +233,9 @@ def serialize_to_segy_stack(  # noqa: PLR0913
             rec_samples = samples[rec_index][rec_live_mask]
             rec_headers = headers[rec_index][rec_live_mask]
 
-            buffer = segy_factory.create_traces(rec_headers, rec_samples)
+            # Filter out raw unspecified fields that cause dtype mismatches
+            filtered_headers = _filter_raw_unspecified_fields(rec_headers)
+            buffer = segy_factory.create_traces(filtered_headers, rec_samples)
 
             global_index = tuple(block_start[i] + rec_index[i] for i in range(record_ndim))
             record_id_str = "/".join(map(str, global_index))
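To illustrate what _filter_raw_unspecified_fields does, here is a self-contained sketch of the same filtering logic on a made-up structured header array (the field names and values are illustrative only):

import numpy as np

# Illustrative header dtype: two regular fields plus one raw-bytes placeholder
# of the kind added during SEG-Y import.
headers = np.zeros(
    3,
    dtype=[
        ("inline", "<i4"),
        ("crossline", "<i4"),
        ("__MDIO_RAW_UNSPECIFIED_Field_1", "V8"),
    ],
)
headers["inline"] = [10, 11, 12]
headers["crossline"] = [100, 101, 102]

# Keep only fields whose names do not start with the raw-bytes prefix,
# mirroring _filter_raw_unspecified_fields above.
keep = [n for n in headers.dtype.names if not n.startswith("__MDIO_RAW_UNSPECIFIED_")]
filtered = np.empty(headers.shape, dtype=[(n, headers.dtype.fields[n][0]) for n in keep])
for n in keep:
    filtered[n] = headers[n]

print(filtered.dtype.names)  # ('inline', 'crossline')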

tests/unit/test_disaster_recovery_wrapper.py

Lines changed: 0 additions & 36 deletions
@@ -287,39 +287,3 @@ def test_different_index_types(
         expected_count = 1
 
         assert wrapper.header.size == expected_count
-
-    def test_header_pipeline_preservation(self, temp_dir: Path, basic_segy_spec: SegySpec, segy_config: dict) -> None:
-        """Test that the wrapper preserves the original header pipeline."""
-        config_name = segy_config["name"]
-        endianness = segy_config["endianness"]
-        data_format = segy_config["data_format"]
-
-        segy_path = temp_dir / f"test_pipeline_{config_name}.segy"
-
-        # Create test SEGY file
-        num_traces = 5
-        samples_per_trace = SAMPLES_PER_TRACE
-
-        spec = self.create_test_segy_file(
-            spec=basic_segy_spec,
-            num_traces=num_traces,
-            samples_per_trace=samples_per_trace,
-            output_path=segy_path,
-            endianness=endianness,
-            data_format=data_format,
-        )
-
-        # Load the SEGY file
-        segy_file = SegyFile(segy_path, spec=spec)
-
-        # Store original pipeline transforms count
-        original_transforms_count = len(segy_file.accessors.header_decode_pipeline.transforms)
-
-        # Create wrapper
-        wrapper = SegyFileTraceDataWrapper(segy_file, 0)
-
-        # Verify that the original SEGY file's pipeline was modified (transforms cleared)
-        assert len(segy_file.accessors.header_decode_pipeline.transforms) == 0
-
-        # Verify that the wrapper has its own pipeline with the original transforms
-        assert len(wrapper._header_pipeline.transforms) == original_transforms_count

uv.lock

Lines changed: 11 additions & 11 deletions
Some generated files are not rendered by default.
