Skip to content

Commit db0d564

Browse files
Serialize text and binary headers (TGSAI#600)
* Fix integration import tests * mask_and_scale=False * pre-commit * PR Review issues * serialize-text-and-binary-headers * remove dev test data * add back whitespace * revert import changes * fix attribute initialization in `_add_text_binary_headers` * Add tests * refactor: improve type annotations and docstrings in test utilities * fix formatting * remove redundant `str()` casting in `xr.open_dataset` calls --------- Co-authored-by: Altay Sansal <[email protected]>
1 parent 4762b7d commit db0d564

File tree

3 files changed

+144
-20
lines changed

3 files changed

+144
-20
lines changed

src/mdio/converters/segy.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -280,6 +280,40 @@ def _populate_coordinates(
280280
return dataset, drop_vars_delayed
281281

282282

283+
def _add_text_binary_headers(dataset: Dataset, segy_file: SegyFile) -> None:
284+
text_header = segy_file.text_header.splitlines()
285+
# Validate:
286+
# text_header this should be a 40-items array of strings with width of 80 characters.
287+
item_count = 40
288+
if len(text_header) != item_count:
289+
err = f"Invalid text header count: expected {item_count}, got {len(text_header)}"
290+
raise ValueError(err)
291+
char_count = 80
292+
for i, line in enumerate(text_header):
293+
if len(line) != char_count:
294+
err = f"Invalid text header {i} line length: expected {char_count}, got {len(line)}"
295+
raise ValueError(err)
296+
ext_text_header = segy_file.ext_text_header
297+
298+
# If using SegyFile.ext_text_header this should be a minimum of 40 elements and must
299+
# capture all textual information (ensure text_header is a subset of ext_text_header).
300+
if ext_text_header is not None:
301+
for ext_hdr in ext_text_header:
302+
text_header.append(ext_hdr.splitlines())
303+
304+
# Handle case where it may not have any metadata yet
305+
if dataset.metadata.attributes is None:
306+
dataset.attrs["attributes"] = {}
307+
308+
# Update the attributes with the text and binary headers.
309+
dataset.metadata.attributes.update(
310+
{
311+
"textHeader": text_header,
312+
"binaryHeader": segy_file.binary_header.to_dict(),
313+
}
314+
)
315+
316+
283317
def segy_to_mdio(
284318
segy_spec: SegySpec,
285319
mdio_template: AbstractDatasetTemplate,
@@ -324,6 +358,8 @@ def segy_to_mdio(
324358
name=mdio_template.name, sizes=shape, horizontal_coord_unit=horizontal_unit, headers=headers
325359
)
326360

361+
_add_text_binary_headers(dataset=mdio_ds, segy_file=segy_file)
362+
327363
xr_dataset: xr_Dataset = to_xarray_dataset(mdio_ds=mdio_ds)
328364

329365
xr_dataset, drop_vars_delayed = _populate_coordinates(

tests/integration/test_segy_import_export.py

Lines changed: 24 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
import xarray as xr
1414
from segy import SegyFile
1515
from segy.standards import get_segy_standard
16+
from tests.integration.testing_data import binary_header_teapot_dome
17+
from tests.integration.testing_data import text_header_teapot_dome
1618
from tests.integration.testing_helpers import customize_segy_specs
1719
from tests.integration.testing_helpers import get_inline_header_values
1820
from tests.integration.testing_helpers import get_values
@@ -266,8 +268,8 @@ def test_3d_import(
266268
segy_to_mdio(
267269
segy_spec=segy_spec,
268270
mdio_template=TemplateRegistry().get("PostStack3DTime"),
269-
input_location=StorageLocation(segy_input.__str__()),
270-
output_location=StorageLocation(zarr_tmp.__str__()),
271+
input_location=StorageLocation(str(segy_input)),
272+
output_location=StorageLocation(str(zarr_tmp)),
271273
overwrite=True,
272274
)
273275

@@ -278,11 +280,9 @@ class TestReader:
278280

279281
def test_meta_dataset_read(self, zarr_tmp: Path) -> None:
280282
"""Metadata reading tests."""
281-
path = zarr_tmp.__str__()
282-
# path = "/tmp/pytest-of-vscode/my-mdio/mdio0"
283283
# NOTE: If mask_and_scale is not set,
284284
# Xarray will convert int to float and replace _FillValue with NaN
285-
ds = xr.open_dataset(path, engine="zarr", mask_and_scale=False)
285+
ds = xr.open_dataset(zarr_tmp, engine="zarr", mask_and_scale=False)
286286
expected_attrs = {
287287
"apiVersion": "1.0.0a1",
288288
"createdOn": "2025-08-06 16:21:54.747880+00:00",
@@ -297,13 +297,25 @@ def test_meta_dataset_read(self, zarr_tmp: Path) -> None:
297297
else:
298298
assert actual_attrs_json[key] == value
299299

300+
attributes = ds.attrs["attributes"]
301+
assert attributes is not None
302+
303+
# Validate attributes provided by the template
304+
assert attributes["surveyDimensionality"] == "3D"
305+
assert attributes["ensembleType"] == "line"
306+
assert attributes["processingStage"] == "post-stack"
307+
308+
# Validate text header
309+
assert attributes["textHeader"] == text_header_teapot_dome()
310+
311+
# Validate binary header
312+
assert attributes["binaryHeader"] == binary_header_teapot_dome()
313+
300314
def test_meta_variable_read(self, zarr_tmp: Path) -> None:
301315
"""Metadata reading tests."""
302-
path = zarr_tmp.__str__()
303316
# NOTE: If mask_and_scale is not set,
304317
# Xarray will convert int to float and replace _FillValue with NaN
305-
# path = "/tmp/pytest-of-vscode/my-mdio/mdio0"
306-
ds = xr.open_dataset(path, engine="zarr", mask_and_scale=False)
318+
ds = xr.open_dataset(zarr_tmp, engine="zarr", mask_and_scale=False)
307319
expected_attrs = {
308320
"count": 97354860,
309321
"sum": -8594.551666259766,
@@ -318,11 +330,9 @@ def test_meta_variable_read(self, zarr_tmp: Path) -> None:
318330
def test_grid(self, zarr_tmp: Path) -> None:
319331
"""Test validating MDIO variables."""
320332
# Load Xarray dataset from the MDIO file
321-
path = zarr_tmp.__str__()
322-
# path = "/tmp/pytest-of-vscode/my-mdio/mdio0"
323333
# NOTE: If mask_and_scale is not set,
324334
# Xarray will convert int to float and replace _FillValue with NaN
325-
ds = xr.open_dataset(path, engine="zarr", mask_and_scale=False)
335+
ds = xr.open_dataset(zarr_tmp, engine="zarr", mask_and_scale=False)
326336

327337
# Note: in order to create the dataset we used the Time template, so the
328338
# sample dimension is called "time"
@@ -366,34 +376,28 @@ def test_grid(self, zarr_tmp: Path) -> None:
366376

367377
def test_inline(self, zarr_tmp: Path) -> None:
    """Read and compare every 75 inlines' mean and std. dev."""
    # NOTE: If mask_and_scale is not set,
    # Xarray will convert int to float and replace _FillValue with NaN
    dataset = xr.open_dataset(zarr_tmp, engine="zarr", mask_and_scale=False)
    # Decimate the inline axis and check the aggregate statistics.
    subset = dataset["amplitude"][::75, :, :]
    npt.assert_allclose([subset.mean(), subset.std()], [1.0555277e-04, 6.0027051e-01])
377385

378386
def test_crossline(self, zarr_tmp: Path) -> None:
    """Read and compare every 75 crosslines' mean and std. dev."""
    # NOTE: If mask_and_scale is not set,
    # Xarray will convert int to float and replace _FillValue with NaN
    dataset = xr.open_dataset(zarr_tmp, engine="zarr", mask_and_scale=False)
    # Decimate the crossline axis and check the aggregate statistics.
    subset = dataset["amplitude"][:, ::75, :]
    npt.assert_allclose([subset.mean(), subset.std()], [-5.0329847e-05, 5.9406823e-01])
389395

390396
def test_zslice(self, zarr_tmp: Path) -> None:
    """Read and compare every 225 z-slices' mean and std. dev."""
    # NOTE: If mask_and_scale is not set,
    # Xarray will convert int to float and replace _FillValue with NaN
    dataset = xr.open_dataset(zarr_tmp, engine="zarr", mask_and_scale=False)
    # Decimate the sample (z) axis and check the aggregate statistics.
    subset = dataset["amplitude"][:, :, ::225]
    npt.assert_allclose([subset.mean(), subset.std()], [0.005236923, 0.61279935])

tests/integration/testing_data.py

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
"""Integration tests data for teapot dome SEG-Y."""
2+
3+
4+
def text_header_teapot_dome() -> list[str]:
    """Expected 40-line SEG-Y text header for the teapot dome test file.

    Each entry is one 80-character card image, reproduced verbatim from the
    file (including the original's spelling and padding).
    """
    header_lines = [
        "C 1 CLIENT: ROCKY MOUNTAIN OILFIELD TESTING CENTER ",
        "C 2 PROJECT: NAVAL PETROLEUM RESERVE #3 (TEAPOT DOME); NATRONA COUNTY, WYOMING ",
        "C 3 LINE: 3D ",
        "C 4 ",
        "C 5 THIS IS THE FILTERED POST STACK MIGRATION ",
        "C 6 ",
        "C 7 INLINE 1, XLINE 1: X COORDINATE: 788937 Y COORDINATE: 938845 ",
        "C 8 INLINE 1, XLINE 188: X COORDINATE: 809501 Y COORDINATE: 939333 ",
        "C 9 INLINE 188, XLINE 1: X COORDINATE: 788039 Y COORDINATE: 976674 ",
        "C10 INLINE NUMBER: MIN: 1 MAX: 345 TOTAL: 345 ",
        "C11 CROSSLINE NUMBER: MIN: 1 MAX: 188 TOTAL: 188 ",
        "C12 TOTAL NUMBER OF CDPS: 64860 BIN DIMENSION: 110' X 110' ",
        "C13 ",
        "C14 ",
        "C15 ",
        "C16 ",
        "C17 ",
        "C18 ",
        "C19 GENERAL SEGY INFORMATION ",
        "C20 RECORD LENGHT (MS): 3000 ",
        "C21 SAMPLE RATE (MS): 2.0 ",
        "C22 DATA FORMAT: 4 BYTE IBM FLOATING POINT ",
        "C23 BYTES 13- 16: CROSSLINE NUMBER (TRACE) ",
        "C24 BYTES 17- 20: INLINE NUMBER (LINE) ",
        "C25 BYTES 81- 84: CDP_X COORD ",
        "C26 BYTES 85- 88: CDP_Y COORD ",
        "C27 BYTES 181-184: INLINE NUMBER (LINE) ",
        "C28 BYTES 185-188: CROSSLINE NUMBER (TRACE) ",
        "C29 BYTES 189-192: CDP_X COORD ",
        "C30 BYTES 193-196: CDP_Y COORD ",
        "C31 ",
        "C32 ",
        "C33 ",
        "C34 ",
        "C35 ",
        "C36 Processed by: Excel Geophysical Services, Inc. ",
        "C37 8301 East Prentice Ave. Ste. 402 ",
        "C38 Englewood, Colorado 80111 ",
        "C39 (voice) 303.694.9629 (fax) 303.771.1646 ",
        "C40 END EBCDIC ",
    ]
    return header_lines
48+
49+
50+
def binary_header_teapot_dome() -> dict[str, int]:
    """Expected binary header field values for the teapot dome SEG-Y file.

    Keys follow the segy library's binary-header field names; values are the
    integers stored in the file's 400-byte binary header.
    """
    return dict(
        job_id=9999,
        line_num=9999,
        reel_num=1,
        data_traces_per_ensemble=188,
        aux_traces_per_ensemble=0,
        sample_interval=2000,
        orig_sample_interval=0,
        samples_per_trace=1501,
        orig_samples_per_trace=1501,
        data_sample_format=1,
        ensemble_fold=57,
        trace_sorting_code=4,
        vertical_sum_code=1,
        sweep_freq_start=0,
        sweep_freq_end=0,
        sweep_length=0,
        sweep_type_code=0,
        sweep_trace_num=0,
        sweep_taper_start=0,
        sweep_taper_end=0,
        taper_type_code=0,
        correlated_data_code=2,
        binary_gain_code=1,
        amp_recovery_code=4,
        measurement_system_code=2,
        impulse_polarity_code=1,
        vibratory_polarity_code=0,
        fixed_length_trace_flag=0,
        num_extended_text_headers=0,
        segy_revision_major=0,
        segy_revision_minor=0,
    )

0 commit comments

Comments
 (0)