Skip to content

Commit db0d564

Browse files
Serialize text and binary headers (TGSAI#600)
* Fix integration import tests * mask_and_scale=False * pre-commit * PR Review issues * serialize-text-and-binary-headers * remove dev test data * add back whitespace * revert import changes * fix attribute initialization in `_add_text_binary_headers` * Add tests * refactor: improve type annotations and docstrings in test utilities * fix formatting * remove redundant `str()` casting in `xr.open_dataset` calls --------- Co-authored-by: Altay Sansal <[email protected]>
1 parent 4762b7d commit db0d564

File tree

3 files changed

+144
-20
lines changed

3 files changed

+144
-20
lines changed

src/mdio/converters/segy.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -280,6 +280,40 @@ def _populate_coordinates(
280280
return dataset, drop_vars_delayed
281281

282282

283+
def _add_text_binary_headers(dataset: Dataset, segy_file: SegyFile) -> None:
284+
text_header = segy_file.text_header.splitlines()
285+
# Validate:
286+
# text_header this should be a 40-items array of strings with width of 80 characters.
287+
item_count = 40
288+
if len(text_header) != item_count:
289+
err = f"Invalid text header count: expected {item_count}, got {len(text_header)}"
290+
raise ValueError(err)
291+
char_count = 80
292+
for i, line in enumerate(text_header):
293+
if len(line) != char_count:
294+
err = f"Invalid text header {i} line length: expected {char_count}, got {len(line)}"
295+
raise ValueError(err)
296+
ext_text_header = segy_file.ext_text_header
297+
298+
# If using SegyFile.ext_text_header this should be a minimum of 40 elements and must
299+
# capture all textual information (ensure text_header is a subset of ext_text_header).
300+
if ext_text_header is not None:
301+
for ext_hdr in ext_text_header:
302+
text_header.append(ext_hdr.splitlines())
303+
304+
# Handle case where it may not have any metadata yet
305+
if dataset.metadata.attributes is None:
306+
dataset.attrs["attributes"] = {}
307+
308+
# Update the attributes with the text and binary headers.
309+
dataset.metadata.attributes.update(
310+
{
311+
"textHeader": text_header,
312+
"binaryHeader": segy_file.binary_header.to_dict(),
313+
}
314+
)
315+
316+
283317
def segy_to_mdio(
284318
segy_spec: SegySpec,
285319
mdio_template: AbstractDatasetTemplate,
@@ -324,6 +358,8 @@ def segy_to_mdio(
324358
name=mdio_template.name, sizes=shape, horizontal_coord_unit=horizontal_unit, headers=headers
325359
)
326360

361+
_add_text_binary_headers(dataset=mdio_ds, segy_file=segy_file)
362+
327363
xr_dataset: xr_Dataset = to_xarray_dataset(mdio_ds=mdio_ds)
328364

329365
xr_dataset, drop_vars_delayed = _populate_coordinates(

tests/integration/test_segy_import_export.py

Lines changed: 24 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
import xarray as xr
1414
from segy import SegyFile
1515
from segy.standards import get_segy_standard
16+
from tests.integration.testing_data import binary_header_teapot_dome
17+
from tests.integration.testing_data import text_header_teapot_dome
1618
from tests.integration.testing_helpers import customize_segy_specs
1719
from tests.integration.testing_helpers import get_inline_header_values
1820
from tests.integration.testing_helpers import get_values
@@ -266,8 +268,8 @@ def test_3d_import(
266268
segy_to_mdio(
267269
segy_spec=segy_spec,
268270
mdio_template=TemplateRegistry().get("PostStack3DTime"),
269-
input_location=StorageLocation(segy_input.__str__()),
270-
output_location=StorageLocation(zarr_tmp.__str__()),
271+
input_location=StorageLocation(str(segy_input)),
272+
output_location=StorageLocation(str(zarr_tmp)),
271273
overwrite=True,
272274
)
273275

@@ -278,11 +280,9 @@ class TestReader:
278280

279281
def test_meta_dataset_read(self, zarr_tmp: Path) -> None:
280282
"""Metadata reading tests."""
281-
path = zarr_tmp.__str__()
282-
# path = "/tmp/pytest-of-vscode/my-mdio/mdio0"
283283
# NOTE: If mask_and_scale is not set,
284284
# Xarray will convert int to float and replace _FillValue with NaN
285-
ds = xr.open_dataset(path, engine="zarr", mask_and_scale=False)
285+
ds = xr.open_dataset(zarr_tmp, engine="zarr", mask_and_scale=False)
286286
expected_attrs = {
287287
"apiVersion": "1.0.0a1",
288288
"createdOn": "2025-08-06 16:21:54.747880+00:00",
@@ -297,13 +297,25 @@ def test_meta_dataset_read(self, zarr_tmp: Path) -> None:
297297
else:
298298
assert actual_attrs_json[key] == value
299299

300+
attributes = ds.attrs["attributes"]
301+
assert attributes is not None
302+
303+
# Validate attributes provided by the template
304+
assert attributes["surveyDimensionality"] == "3D"
305+
assert attributes["ensembleType"] == "line"
306+
assert attributes["processingStage"] == "post-stack"
307+
308+
# Validate text header
309+
assert attributes["textHeader"] == text_header_teapot_dome()
310+
311+
# Validate binary header
312+
assert attributes["binaryHeader"] == binary_header_teapot_dome()
313+
300314
def test_meta_variable_read(self, zarr_tmp: Path) -> None:
301315
"""Metadata reading tests."""
302-
path = zarr_tmp.__str__()
303316
# NOTE: If mask_and_scale is not set,
304317
# Xarray will convert int to float and replace _FillValue with NaN
305-
# path = "/tmp/pytest-of-vscode/my-mdio/mdio0"
306-
ds = xr.open_dataset(path, engine="zarr", mask_and_scale=False)
318+
ds = xr.open_dataset(zarr_tmp, engine="zarr", mask_and_scale=False)
307319
expected_attrs = {
308320
"count": 97354860,
309321
"sum": -8594.551666259766,
@@ -318,11 +330,9 @@ def test_meta_variable_read(self, zarr_tmp: Path) -> None:
318330
def test_grid(self, zarr_tmp: Path) -> None:
319331
"""Test validating MDIO variables."""
320332
# Load Xarray dataset from the MDIO file
321-
path = zarr_tmp.__str__()
322-
# path = "/tmp/pytest-of-vscode/my-mdio/mdio0"
323333
# NOTE: If mask_and_scale is not set,
324334
# Xarray will convert int to float and replace _FillValue with NaN
325-
ds = xr.open_dataset(path, engine="zarr", mask_and_scale=False)
335+
ds = xr.open_dataset(zarr_tmp, engine="zarr", mask_and_scale=False)
326336

327337
# Note: in order to create the dataset we used the Time template, so the
328338
# sample dimension is called "time"
@@ -366,34 +376,28 @@ def test_grid(self, zarr_tmp: Path) -> None:
366376

367377
def test_inline(self, zarr_tmp: Path) -> None:
    """Read and compare every 75 inlines' mean and std. dev."""
    # NOTE: If mask_and_scale is not set,
    # Xarray will convert int to float and replace _FillValue with NaN
    dataset = xr.open_dataset(zarr_tmp, engine="zarr", mask_and_scale=False)
    # Decimate the inline axis and check the aggregate statistics.
    subset = dataset["amplitude"][::75, :, :]
    npt.assert_allclose([subset.mean(), subset.std()], [1.0555277e-04, 6.0027051e-01])
377385

378386
def test_crossline(self, zarr_tmp: Path) -> None:
    """Read and compare every 75 crosslines' mean and std. dev."""
    # NOTE: If mask_and_scale is not set,
    # Xarray will convert int to float and replace _FillValue with NaN
    dataset = xr.open_dataset(zarr_tmp, engine="zarr", mask_and_scale=False)
    # Decimate the crossline axis and check the aggregate statistics.
    subset = dataset["amplitude"][:, ::75, :]
    npt.assert_allclose([subset.mean(), subset.std()], [-5.0329847e-05, 5.9406823e-01])
389395

390396
def test_zslice(self, zarr_tmp: Path) -> None:
    """Read and compare every 225 z-slices' mean and std. dev."""
    # NOTE: If mask_and_scale is not set,
    # Xarray will convert int to float and replace _FillValue with NaN
    dataset = xr.open_dataset(zarr_tmp, engine="zarr", mask_and_scale=False)
    # Decimate the sample (z) axis and check the aggregate statistics.
    subset = dataset["amplitude"][:, :, ::225]
    npt.assert_allclose([subset.mean(), subset.std()], [0.005236923, 0.61279935])

tests/integration/testing_data.py

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
"""Integration tests data for teapot dome SEG-Y."""
2+
3+
4+
def text_header_teapot_dome() -> list[str]:
    """Expected 40-line SEG-Y text header for the teapot dome test file.

    Each entry is one 80-character card image, reproduced verbatim from the
    file (including the original's spelling and padding).
    """
    header_lines = [
        "C 1 CLIENT: ROCKY MOUNTAIN OILFIELD TESTING CENTER ",
        "C 2 PROJECT: NAVAL PETROLEUM RESERVE #3 (TEAPOT DOME); NATRONA COUNTY, WYOMING ",
        "C 3 LINE: 3D ",
        "C 4 ",
        "C 5 THIS IS THE FILTERED POST STACK MIGRATION ",
        "C 6 ",
        "C 7 INLINE 1, XLINE 1: X COORDINATE: 788937 Y COORDINATE: 938845 ",
        "C 8 INLINE 1, XLINE 188: X COORDINATE: 809501 Y COORDINATE: 939333 ",
        "C 9 INLINE 188, XLINE 1: X COORDINATE: 788039 Y COORDINATE: 976674 ",
        "C10 INLINE NUMBER: MIN: 1 MAX: 345 TOTAL: 345 ",
        "C11 CROSSLINE NUMBER: MIN: 1 MAX: 188 TOTAL: 188 ",
        "C12 TOTAL NUMBER OF CDPS: 64860 BIN DIMENSION: 110' X 110' ",
        "C13 ",
        "C14 ",
        "C15 ",
        "C16 ",
        "C17 ",
        "C18 ",
        "C19 GENERAL SEGY INFORMATION ",
        "C20 RECORD LENGHT (MS): 3000 ",
        "C21 SAMPLE RATE (MS): 2.0 ",
        "C22 DATA FORMAT: 4 BYTE IBM FLOATING POINT ",
        "C23 BYTES 13- 16: CROSSLINE NUMBER (TRACE) ",
        "C24 BYTES 17- 20: INLINE NUMBER (LINE) ",
        "C25 BYTES 81- 84: CDP_X COORD ",
        "C26 BYTES 85- 88: CDP_Y COORD ",
        "C27 BYTES 181-184: INLINE NUMBER (LINE) ",
        "C28 BYTES 185-188: CROSSLINE NUMBER (TRACE) ",
        "C29 BYTES 189-192: CDP_X COORD ",
        "C30 BYTES 193-196: CDP_Y COORD ",
        "C31 ",
        "C32 ",
        "C33 ",
        "C34 ",
        "C35 ",
        "C36 Processed by: Excel Geophysical Services, Inc. ",
        "C37 8301 East Prentice Ave. Ste. 402 ",
        "C38 Englewood, Colorado 80111 ",
        "C39 (voice) 303.694.9629 (fax) 303.771.1646 ",
        "C40 END EBCDIC ",
    ]
    return header_lines
48+
49+
50+
def binary_header_teapot_dome() -> dict[str, int]:
    """Expected binary header field values for the teapot dome SEG-Y file.

    Keys follow the segy library's binary-header field names; values are the
    integers stored in the file's 400-byte binary header.
    """
    return dict(
        job_id=9999,
        line_num=9999,
        reel_num=1,
        data_traces_per_ensemble=188,
        aux_traces_per_ensemble=0,
        sample_interval=2000,
        orig_sample_interval=0,
        samples_per_trace=1501,
        orig_samples_per_trace=1501,
        data_sample_format=1,
        ensemble_fold=57,
        trace_sorting_code=4,
        vertical_sum_code=1,
        sweep_freq_start=0,
        sweep_freq_end=0,
        sweep_length=0,
        sweep_type_code=0,
        sweep_trace_num=0,
        sweep_taper_start=0,
        sweep_taper_end=0,
        taper_type_code=0,
        correlated_data_code=2,
        binary_gain_code=1,
        amp_recovery_code=4,
        measurement_system_code=2,
        impulse_polarity_code=1,
        vibratory_polarity_code=0,
        fixed_length_trace_flag=0,
        num_extended_text_headers=0,
        segy_revision_major=0,
        segy_revision_minor=0,
    )

0 commit comments

Comments
 (0)