Commit 817e0f9

Removed raw byte inserts (#10)
* Update Xarray API access (TGSAI#688)
* Reimplement disaster recovery logic
* Ensure getting true raw bytes for DR array
* Linting
* Add v2 issue check
* Fix pre-commit
* Profiled disaster recovery array (#8)
  - Avoids duplicate read regression issue
  - Implements isolated and testable logic
* Fix unclosed parenthesis
* Linting
* Test DR compatibility with all tested schemas
* Fix missing test fixture error
* Suppress unused linting error
* Attempt to use view
* Add hex-dump and MDIO output reproducer
* Fixes
* Cleanup
* Provide clean disaster recovery interface
* Begin work on tests
* Fix flattening issue
* Push for debugging
* Numpy updates
* Testing
* Working end-to-end examples
* Cleanup
* Bandaid fix
* Linting pass 1
* Fix logic issue
* Use wrapper class
* Pre-commit
* Remove external debugging code
* Remove debug code
* Remove errant numpy addition to pyproject toml
* Fix uv lock to mainline
* Pre-commit
* Removed raw byte inserts: removed the insertions of raw bytes into the raw bytes Variable. This issue will be addressed in tgsai/segy release >0.5.1
* Use new segy API calls
* Updates to get working
* Use released version
* Linting
1 parent e5305b8 commit 817e0f9

File tree

6 files changed: +63 -63 lines changed


pyproject.toml

Lines changed: 2 additions & 2 deletions
@@ -26,10 +26,10 @@ dependencies = [
     "psutil>=7.0.0",
     "pydantic>=2.11.9",
     "rich>=14.1.0",
-    "segy>=0.5.0",
+    "segy>=0.5.1.post1",
     "tqdm>=4.67.1",
     "universal-pathlib>=0.2.6",
-    "xarray>=2025.9.0",
+    "xarray>=2025.9.1",
     "zarr>=3.1.3",
 ]

src/mdio/api/io.py

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@
 from upath import UPath
 from xarray import Dataset as xr_Dataset
 from xarray import open_zarr as xr_open_zarr
-from xarray.backends.api import to_zarr as xr_to_zarr
+from xarray.backends.writers import to_zarr as xr_to_zarr
 
 from mdio.constants import ZarrFormat
 from mdio.core.zarr_io import zarr_warnings_suppress_unstable_structs_v3
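The relocated import reflects xarray moving to_zarr out of xarray.backends.api. If code had to run against both older and newer xarray releases, a guarded import along these lines could bridge the move (a sketch, not part of this commit; the exact version boundary is assumed from the pyproject bump above):

try:
    # New location, available in the xarray release pinned above (>=2025.9.1).
    from xarray.backends.writers import to_zarr as xr_to_zarr
except ImportError:
    # Fall back to the pre-move location on older xarray releases.
    from xarray.backends.api import to_zarr as xr_to_zarr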

src/mdio/segy/_disaster_recovery_wrapper.py

Lines changed: 11 additions & 11 deletions
@@ -2,10 +2,8 @@
 
 from __future__ import annotations
 
-from copy import deepcopy
 from typing import TYPE_CHECKING
 
-import numpy as np
 
 if TYPE_CHECKING:
     from numpy.typing import NDArray
@@ -16,19 +14,21 @@ class SegyFileTraceDataWrapper:
     def __init__(self, segy_file: SegyFile, indices: int | list[int] | NDArray | slice):
         self.segy_file = segy_file
         self.indices = indices
-        self._header_pipeline = deepcopy(segy_file.accessors.header_decode_pipeline)
-        segy_file.accessors.header_decode_pipeline.transforms = []
-        self.traces = segy_file.trace[indices]
 
-    @property
-    def header(self) -> NDArray:
-        # The copy is necessary to avoid applying the pipeline to the original header.
-        return self._header_pipeline.apply(self.traces.header.copy())
+        self.idx = self.segy_file.trace.normalize_and_validate_query(self.indices)
+        self.traces = self.segy_file.trace.fetch(self.idx, raw=True)
+
+        self.raw_view = self.traces.view(self.segy_file.spec.trace.dtype)
+        self.decoded_traces = self.segy_file.accessors.trace_decode_pipeline.apply(self.raw_view.copy())
 
     @property
     def raw_header(self) -> NDArray:
-        return np.ascontiguousarray(self.traces.header.copy()).view("|V240")
+        return self.raw_view.header.view("|V240")
+
+    @property
+    def header(self) -> NDArray:
+        return self.decoded_traces.header
 
     @property
     def sample(self) -> NDArray:
-        return self.traces.sample
+        return self.decoded_traces.sample
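The wrapper now fetches raw trace bytes once and runs the trace decode pipeline on a copy, so the raw and decoded views come from a single read. A minimal usage sketch, assuming an existing SEG-Y file on disk (the file name below is hypothetical):

from segy import SegyFile

from mdio.segy._disaster_recovery_wrapper import SegyFileTraceDataWrapper

segy_file = SegyFile("shots.segy")  # hypothetical input file
wrapper = SegyFileTraceDataWrapper(segy_file, indices=0)

raw = wrapper.raw_header   # undecoded 240-byte header records, dtype "|V240"
decoded = wrapper.header   # headers after the trace decode pipeline
samples = wrapper.sample   # decoded trace samples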

src/mdio/segy/creation.py

Lines changed: 38 additions & 2 deletions
@@ -28,6 +28,38 @@
 logger = logging.getLogger(__name__)
 
 
+def _filter_raw_unspecified_fields(headers: NDArray) -> NDArray:
+    """Filter out __MDIO_RAW_UNSPECIFIED_Field_* fields from headers array.
+
+    These fields are added during SEGY import to preserve raw header bytes,
+    but they cause dtype mismatches during export. This function removes them.
+
+    Args:
+        headers: Header array that may contain raw unspecified fields.
+
+    Returns:
+        Header array with raw unspecified fields removed.
+    """
+    if headers.dtype.names is None:
+        return headers
+
+    # Find field names that don't start with __MDIO_RAW_UNSPECIFIED_
+    field_names = [name for name in headers.dtype.names if not name.startswith("__MDIO_RAW_UNSPECIFIED_")]
+
+    if len(field_names) == len(headers.dtype.names):
+        # No raw unspecified fields found, return as-is
+        return headers
+
+    # Create new structured array with only the non-raw fields
+    new_dtype = [(name, headers.dtype.fields[name][0]) for name in field_names]
+    filtered_headers = np.empty(headers.shape, dtype=new_dtype)
+
+    for name in field_names:
+        filtered_headers[name] = headers[name]
+
+    return filtered_headers
+
+
 def make_segy_factory(spec: SegySpec, binary_header: dict[str, int]) -> SegyFactory:
     """Generate SEG-Y factory from MDIO metadata."""
     sample_interval = binary_header["sample_interval"]
@@ -167,7 +199,9 @@ def serialize_to_segy_stack(  # noqa: PLR0913
         samples = samples[live_mask]
         headers = headers[live_mask]
 
-        buffer = segy_factory.create_traces(headers, samples)
+        # Filter out raw unspecified fields that cause dtype mismatches
+        filtered_headers = _filter_raw_unspecified_fields(headers)
+        buffer = segy_factory.create_traces(filtered_headers, samples)
 
         global_index = block_start[0]
         record_id_str = str(global_index)
@@ -199,7 +233,9 @@ def serialize_to_segy_stack(  # noqa: PLR0913
             rec_samples = samples[rec_index][rec_live_mask]
             rec_headers = headers[rec_index][rec_live_mask]
 
-            buffer = segy_factory.create_traces(rec_headers, rec_samples)
+            # Filter out raw unspecified fields that cause dtype mismatches
+            filtered_headers = _filter_raw_unspecified_fields(rec_headers)
+            buffer = segy_factory.create_traces(filtered_headers, rec_samples)
 
             global_index = tuple(block_start[i] + rec_index[i] for i in range(record_ndim))
             record_id_str = "/".join(map(str, global_index))
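To illustrate what _filter_raw_unspecified_fields does, here is a self-contained sketch of the same filtering logic on a made-up structured header array (the field names and values are illustrative only):

import numpy as np

# Illustrative header dtype: two regular fields plus one raw-bytes placeholder
# of the kind added during SEG-Y import.
headers = np.zeros(
    3,
    dtype=[
        ("inline", "<i4"),
        ("crossline", "<i4"),
        ("__MDIO_RAW_UNSPECIFIED_Field_1", "V8"),
    ],
)
headers["inline"] = [10, 11, 12]
headers["crossline"] = [100, 101, 102]

# Keep only fields whose names do not start with the raw-bytes prefix,
# mirroring _filter_raw_unspecified_fields above.
keep = [n for n in headers.dtype.names if not n.startswith("__MDIO_RAW_UNSPECIFIED_")]
filtered = np.empty(headers.shape, dtype=[(n, headers.dtype.fields[n][0]) for n in keep])
for n in keep:
    filtered[n] = headers[n]

print(filtered.dtype.names)  # ('inline', 'crossline')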

tests/unit/test_disaster_recovery_wrapper.py

Lines changed: 0 additions & 36 deletions
@@ -287,39 +287,3 @@ def test_different_index_types(
         expected_count = 1
 
         assert wrapper.header.size == expected_count
-
-    def test_header_pipeline_preservation(self, temp_dir: Path, basic_segy_spec: SegySpec, segy_config: dict) -> None:
-        """Test that the wrapper preserves the original header pipeline."""
-        config_name = segy_config["name"]
-        endianness = segy_config["endianness"]
-        data_format = segy_config["data_format"]
-
-        segy_path = temp_dir / f"test_pipeline_{config_name}.segy"
-
-        # Create test SEGY file
-        num_traces = 5
-        samples_per_trace = SAMPLES_PER_TRACE
-
-        spec = self.create_test_segy_file(
-            spec=basic_segy_spec,
-            num_traces=num_traces,
-            samples_per_trace=samples_per_trace,
-            output_path=segy_path,
-            endianness=endianness,
-            data_format=data_format,
-        )
-
-        # Load the SEGY file
-        segy_file = SegyFile(segy_path, spec=spec)
-
-        # Store original pipeline transforms count
-        original_transforms_count = len(segy_file.accessors.header_decode_pipeline.transforms)
-
-        # Create wrapper
-        wrapper = SegyFileTraceDataWrapper(segy_file, 0)
-
-        # Verify that the original SEGY file's pipeline was modified (transforms cleared)
-        assert len(segy_file.accessors.header_decode_pipeline.transforms) == 0
-
-        # Verify that the wrapper has its own pipeline with the original transforms
-        assert len(wrapper._header_pipeline.transforms) == original_transforms_count

uv.lock

Lines changed: 11 additions & 11 deletions
Some generated files are not rendered by default.
