Environment variable flag to preserve unmodified, raw trace headers from SEG-Y as variable. (TGSAI#659)

BrianMichell · tasansal · web-flow · commit 1b982559bcf6 · 2025-09-30T17:19:21.000-05:00
* Reimplement disaster recovery logic * Ensure getting true raw bytes for DR array * Linting * Add v2 issue check * Fix pre-commit * Profiled disaster recovery array (#8) - Avoids duplicate read regression issue - Implements isolated and testable logic * Fix unclosed parenthesis * Linting * Test DR compatibility with all tested schemas * Fix missing test fixture error * Suppress unused linting error * Dr with modifications (#9) * Attempt to use view * Add hex-dump and MDIO output reproducer * Fixes * Cleanup * Provide clean disaster recovery interface * Begin work on tests * Fix flattening issue * Push for debugging * Numpy updates * Testing * Working end-to-end examples * Cleanup * Bandaid fix * linting pass 1 * Fix logic issue * Use wrapper class * Precommit * Remove external debugging code * Remove debug code * Remove errant numpy additon to pyproject toml * Fix uv lock to mainline * Pre-commit * Remove raw field additions. Depends on segy >= 0.5.1 * Removed raw byte inserts (#10) * Update Xarray api access (TGSAI#688) * Reimplement disaster recovery logic * Ensure getting true raw bytes for DR array * Linting * Add v2 issue check * Fix pre-commit * Profiled disaster recovery array (#8) - Avoids duplicate read regression issue - Implements isolated and testable logic * Fix unclosed parenthesis * Linting * Test DR compatibility with all tested schemas * Fix missing test fixture error * Suppress unused linting error * Attempt to use view * Add hex-dump and MDIO output reproducer * Fixes * Cleanup * Provide clean disaster recovery interface * Begin work on tests * Fix flattening issue * Push for debugging * Numpy updates * Testing * Working end-to-end examples * Cleanup * Bandaid fix * linting pass 1 * Fix logic issue * Use wrapper class * Precommit * Remove external debugging code * Remove debug code * Remove errant numpy additon to pyproject toml * Fix uv lock to mainline * Pre-commit * Removed raw byte inserts Removed the insertions of raw bytes into the raw 
bytes Variable. This issue will be addressed in tgsai/segy release >0.5.1 * Use new segy API calls * Updates to get working * Use released version * Linting * Linting * revert filter/raw stuff * rename env var for raw headers * simplify 240-byte scalar type * rename trace wrapper and do lazy decoding --------- Co-authored-by: Altay Sansal <tasansal@users.noreply.github.com>
diff --git a/src/mdio/builder/schemas/dtype.py b/src/mdio/builder/schemas/dtype.py
@@ -32,6 +32,7 @@ class ScalarType(StrEnum):
     COMPLEX64 = "complex64"
     COMPLEX128 = "complex128"
     COMPLEX256 = "complex256"
+    BYTES240 = "V240"  # fixed-width 240-byte string, used for raw v0/1/2 trace headers
 
 
 class StructuredField(CamelCaseStrictModel):
diff --git a/src/mdio/constants.py b/src/mdio/constants.py
@@ -64,4 +64,5 @@ class ZarrFormat(IntEnum):
     ScalarType.COMPLEX64: complex(np.nan, np.nan),
     ScalarType.COMPLEX128: complex(np.nan, np.nan),
     ScalarType.COMPLEX256: complex(np.nan, np.nan),
+    ScalarType.BYTES240: b"\x00" * 240,
 }
diff --git a/src/mdio/converters/segy.py b/src/mdio/converters/segy.py
@@ -7,16 +7,24 @@
 from typing import TYPE_CHECKING
 
 import numpy as np
+import zarr
 from segy import SegyFile
 from segy.config import SegySettings
 from segy.standards.codes import MeasurementSystem as segy_MeasurementSystem
 from segy.standards.fields.trace import Rev0 as TraceHeaderFieldsRev0
 
 from mdio.api.io import _normalize_path
 from mdio.api.io import to_mdio
+from mdio.builder.schemas.chunk_grid import RegularChunkGrid
+from mdio.builder.schemas.chunk_grid import RegularChunkShape
+from mdio.builder.schemas.compressors import Blosc
+from mdio.builder.schemas.compressors import BloscCname
+from mdio.builder.schemas.dtype import ScalarType
 from mdio.builder.schemas.v1.units import LengthUnitEnum
 from mdio.builder.schemas.v1.units import LengthUnitModel
+from mdio.builder.schemas.v1.variable import VariableMetadata
 from mdio.builder.xarray_builder import to_xarray_dataset
+from mdio.constants import ZarrFormat
 from mdio.converters.exceptions import EnvironmentFormatError
 from mdio.converters.exceptions import GridTraceCountError
 from mdio.converters.exceptions import GridTraceSparsityError
@@ -333,6 +341,61 @@ def _add_grid_override_to_metadata(dataset: Dataset, grid_overrides: dict[str, A
         dataset.metadata.attributes["gridOverrides"] = grid_overrides
 
 
+def _add_raw_headers_to_template(mdio_template: AbstractDatasetTemplate) -> AbstractDatasetTemplate:
+    """Add raw headers capability to the MDIO template by monkey-patching its _add_variables method.
+
+    This function wraps the template's _add_variables method so that it also adds a raw headers
+    variable with the following characteristics:
+    - Dimensions: the template's dimension names without the last one (all except vertical)
+    - Name: "raw_headers" (long name "Raw Headers")
+    - Type: ScalarType.BYTES240
+    - No coordinates
+    - Blosc compressor using the zstd codec
+    - No metadata other than the chunk grid
+    - Chunk shape: the template's variable chunk shape without its last axis
+
+    Args:
+        mdio_template: The MDIO template to mutate in place; patched at most once per object
+    Returns:
+        The same MDIO template object, mutated
+    """
+    # Return early if the marker attribute set at the end of this function is already present
+    if hasattr(mdio_template, "_mdio_raw_headers_enhanced"):
+        return mdio_template
+
+    # Keep a reference to the current _add_variables so the wrapper can delegate to it
+    original_add_variables = mdio_template._add_variables
+
+    def enhanced_add_variables() -> None:
+        # Run the template's own variable setup before adding the extra variable
+        original_add_variables()
+
+        # Raw headers drop the last axis, so the chunk shape drops its last entry as well
+        chunk_shape = mdio_template._var_chunk_shape[:-1]
+
+        # Wrap the chunk shape in the regular chunk grid schema expected by VariableMetadata
+        chunk_metadata = RegularChunkGrid(configuration=RegularChunkShape(chunk_shape=chunk_shape))
+
+        # Register the raw headers variable through the template's builder
+        mdio_template._builder.add_variable(
+            name="raw_headers",
+            long_name="Raw Headers",
+            dimensions=mdio_template._dim_names[:-1],  # All dimensions except the last (vertical)
+            data_type=ScalarType.BYTES240,
+            compressor=Blosc(cname=BloscCname.zstd),
+            coordinates=None,  # Raw headers carry no coordinates
+            metadata=VariableMetadata(chunk_grid=chunk_metadata),
+        )
+
+    # Rebind _add_variables on this template object to the wrapper defined above
+    mdio_template._add_variables = enhanced_add_variables
+
+    # Set the marker checked at the top of this function to prevent wrapping twice
+    mdio_template._mdio_raw_headers_enhanced = True
+
+    return mdio_template
+
+
 def segy_to_mdio(  # noqa PLR0913
     segy_spec: SegySpec,
     mdio_template: AbstractDatasetTemplate,
@@ -372,6 +435,14 @@ def segy_to_mdio(  # noqa PLR0913
 
     _, non_dim_coords = _get_coordinates(grid, segy_headers, mdio_template)
     header_dtype = to_structured_type(segy_spec.trace.header.dtype)
+
+    if os.getenv("MDIO__IMPORT__RAW_HEADERS") in ("1", "true", "yes", "on"):
+        if zarr.config.get("default_zarr_format") == ZarrFormat.V2:
+            logger.warning("Raw headers are only supported for Zarr v3. Skipping raw headers.")
+        else:
+            logger.warning("MDIO__IMPORT__RAW_HEADERS is experimental and expected to change or be removed.")
+            mdio_template = _add_raw_headers_to_template(mdio_template)
+
     horizontal_unit = _get_horizontal_coordinate_unit(segy_dimensions)
     mdio_ds: Dataset = mdio_template.build_dataset(
         name=mdio_template.name,
diff --git a/src/mdio/segy/_raw_trace_wrapper.py b/src/mdio/segy/_raw_trace_wrapper.py
@@ -0,0 +1,49 @@
+"""Consumer-side utility to get both raw and transformed header data with single filesystem read."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import numpy as np
+
+if TYPE_CHECKING:
+    from numpy.typing import NDArray
+    from segy import SegyFile
+
+
+class SegyFileRawTraceWrapper:
+    def __init__(self, segy_file: SegyFile, indices: int | list[int] | NDArray | slice):
+        self.segy_file = segy_file
+        self.indices = indices
+
+        self.idx = self.segy_file.trace.normalize_and_validate_query(self.indices)
+        self.trace_buffer_array = self.segy_file.trace.fetch(self.idx, raw=True)
+
+        self.trace_view = self.trace_buffer_array.view(self.segy_file.spec.trace.dtype)
+
+        self.trace_decode_pipeline = self.segy_file.accessors.trace_decode_pipeline
+        self.decoded_traces = None  # filled by _ensure_decoded on first access to header/sample
+
+    def _ensure_decoded(self) -> None:
+        """Apply the trace decode pipeline to a copy of the raw view, once, and cache the result."""
+        if self.decoded_traces is not None:  # already decoded and cached
+            return
+        self.decoded_traces = self.trace_decode_pipeline.apply(self.trace_view.copy())
+
+    @property
+    def raw_header(self) -> NDArray:
+        """Get the header field of the undecoded trace view, reinterpreted as fixed-size void items."""
+        header_itemsize = self.segy_file.spec.trace.header.itemsize  # should be 240
+        return self.trace_view.header.view(np.dtype((np.void, header_itemsize)))
+
+    @property
+    def header(self) -> NDArray:
+        """Get decoded header."""
+        self._ensure_decoded()  # decode lazily on first access; the pipeline runs on a copy of the view.
+        return self.decoded_traces.header
+
+    @property
+    def sample(self) -> NDArray:
+        """Get decoded trace samples."""
+        self._ensure_decoded()  # decode lazily on first access; the pipeline runs on a copy of the view.
+        return self.decoded_traces.sample
diff --git a/src/mdio/segy/_workers.py b/src/mdio/segy/_workers.py
@@ -12,6 +12,7 @@
 
 from mdio.api.io import to_mdio
 from mdio.builder.schemas.dtype import ScalarType
+from mdio.segy._raw_trace_wrapper import SegyFileRawTraceWrapper
 
 if TYPE_CHECKING:
     from segy.arrays import HeaderArray
@@ -121,18 +122,38 @@ def trace_worker(  # noqa: PLR0913
     zarr_config.set({"threading.max_workers": 1})
 
     live_trace_indexes = local_grid_map[not_null].tolist()
-    traces = segy_file.trace[live_trace_indexes]
 
     header_key = "headers"
+    raw_header_key = "raw_headers"
 
     # Get subset of the dataset that has not yet been saved
     # The headers might not be present in the dataset
     worker_variables = [data_variable_name]
     if header_key in dataset.data_vars:  # Keeping the `if` here to allow for more worker configurations
         worker_variables.append(header_key)
+    if raw_header_key in dataset.data_vars:
+        worker_variables.append(raw_header_key)
+
+    # traces = segy_file.trace[live_trace_indexes]
+    # Raw headers are not intended to remain as a feature of the SEGY ingestion.
+    # For that reason, we have wrapped the accessors to provide an interface that can be removed
+    # and not require additional changes to the below code.
+    # NOTE: The `raw_header_key` code block should be removed in full as it will become dead code.
+    traces = SegyFileRawTraceWrapper(segy_file, live_trace_indexes)
 
     ds_to_write = dataset[worker_variables]
 
+    if raw_header_key in worker_variables:
+        tmp_raw_headers = np.zeros_like(dataset[raw_header_key])
+        tmp_raw_headers[not_null] = traces.raw_header
+
+        ds_to_write[raw_header_key] = Variable(
+            ds_to_write[raw_header_key].dims,
+            tmp_raw_headers,
+            attrs=ds_to_write[raw_header_key].attrs,
+            encoding=ds_to_write[raw_header_key].encoding,  # Not strictly necessary, but safer than not doing it.
+        )
+
     if header_key in worker_variables:
         # TODO(BrianMichell): Implement this better so that we can enable fill values without changing the code
         # https://github.com/TGSAI/mdio-python/issues/584
diff --git a/tests/integration/test_segy_import_export_masked.py b/tests/integration/test_segy_import_export_masked.py
diff --git a/tests/unit/test_disaster_recovery_wrapper.py b/tests/unit/test_disaster_recovery_wrapper.py

Original file line number	Diff line number	Diff line change
`@@ -64,4 +64,5 @@ class ZarrFormat(IntEnum):`
`64`	`64`	`ScalarType.COMPLEX64: complex(np.nan, np.nan),`
`65`	`65`	`ScalarType.COMPLEX128: complex(np.nan, np.nan),`
`66`	`66`	`ScalarType.COMPLEX256: complex(np.nan, np.nan),`
	`67`	`+ ScalarType.BYTES240: b"\x00" * 240,`
`67`	`68`	`}`