Commit 2df22be

tasansal and Altay Sansal authored
Move SEG-Y metadata away from Dataset root (TGSAI#667)
* remove old segy revision encoder and rename function
* use new rev encoder and refactor segy creation during export
* add segy related metadata to a separate variable for cleaner dataset metadata
* update tests

Co-authored-by: Altay Sansal <[email protected]>
1 parent 3e04f43 commit 2df22be
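
For downstream readers, the practical effect of this change is that the SEG-Y text and binary headers move off the Dataset root attributes onto a dedicated scalar variable named "segy_file_header", while "gridOverrides" stays in the root attributes. A minimal sketch of reading the relocated metadata; the file path is illustrative, and the function and key names are taken from the diffs below:

from mdio.api.io import open_mdio

ds = open_mdio("teapot_dome.mdio")  # illustrative path

# SEG-Y file headers now live on the attrs of a dedicated scalar variable.
segy_hdr = ds["segy_file_header"]
text_header = segy_hdr.attrs["textHeader"]      # newline-joined 40 x 80 string
binary_header = segy_hdr.attrs["binaryHeader"]  # dict of binary header fields

# Ingestion/template attributes remain on the Dataset root, e.g. grid overrides.
grid_overrides = ds.attrs["attributes"].get("gridOverrides")

print(binary_header["samples_per_trace"], len(text_header.splitlines()))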

File tree

5 files changed: +54, −61 lines


src/mdio/converters/segy.py

Lines changed: 30 additions & 30 deletions
@@ -293,41 +293,40 @@ def _populate_coordinates(
     return dataset, drop_vars_delayed
 
 
-def _add_segy_ingest_attributes(dataset: Dataset, segy_file: SegyFile, grid_overrides: dict[str, Any] | None) -> None:
-    text_header = segy_file.text_header.splitlines()
-    # Validate:
-    # text_header this should be a 40-items array of strings with width of 80 characters.
-    item_count = 40
-    if len(text_header) != item_count:
-        err = f"Invalid text header count: expected {item_count}, got {len(text_header)}"
+def _add_segy_file_headers(xr_dataset: xr_Dataset, segy_file: SegyFile) -> xr_Dataset:
+    expected_rows = 40
+    expected_cols = 80
+
+    text_header = segy_file.text_header
+    text_header_rows = text_header.splitlines()
+    text_header_cols_bad = [len(row) != expected_cols for row in text_header_rows]
+
+    if len(text_header_rows) != expected_rows:
+        err = f"Invalid text header count: expected {expected_rows}, got {len(text_header)}"
         raise ValueError(err)
-    char_count = 80
-    for i, line in enumerate(text_header):
-        if len(line) != char_count:
-            err = f"Invalid text header {i} line length: expected {char_count}, got {len(line)}"
-            raise ValueError(err)
-    ext_text_header = segy_file.ext_text_header
 
-    # If using SegyFile.ext_text_header this should be a minimum of 40 elements and must
-    # capture all textual information (ensure text_header is a subset of ext_text_header).
-    if ext_text_header is not None:
-        for ext_hdr in ext_text_header:
-            text_header.append(ext_hdr.splitlines())
+    if any(text_header_cols_bad):
+        err = f"Invalid text header columns: expected {expected_cols} per line."
+        raise ValueError(err)
+
+    xr_dataset["segy_file_header"] = ((), "")
+    xr_dataset["segy_file_header"].attrs.update(
+        {
+            "textHeader": text_header,
+            "binaryHeader": segy_file.binary_header.to_dict(),
+        }
+    )
+
+    return xr_dataset
+
 
-    # Handle case where it may not have any metadata yet
+def _add_grid_override_to_metadata(dataset: Dataset, grid_overrides: dict[str, Any] | None) -> None:
+    """Add grid override to Dataset metadata if needed."""
     if dataset.metadata.attributes is None:
         dataset.metadata.attributes = {}
 
-    segy_attributes = {
-        "textHeader": text_header,
-        "binaryHeader": segy_file.binary_header.to_dict(),
-    }
-
     if grid_overrides is not None:
-        segy_attributes["gridOverrides"] = grid_overrides
-
-    # Update the attributes with the text and binary headers.
-    dataset.metadata.attributes.update(segy_attributes)
+        dataset.metadata.attributes["gridOverrides"] = grid_overrides
 
 
 def segy_to_mdio(  # noqa PLR0913
@@ -377,7 +376,7 @@ def segy_to_mdio(  # noqa PLR0913
         header_dtype=header_dtype,
     )
 
-    _add_segy_ingest_attributes(dataset=mdio_ds, segy_file=segy_file, grid_overrides=grid_overrides)
+    _add_grid_override_to_metadata(dataset=mdio_ds, grid_overrides=grid_overrides)
 
     xr_dataset: xr_Dataset = to_xarray_dataset(mdio_ds=mdio_ds)
 
@@ -387,8 +386,9 @@ def segy_to_mdio(  # noqa PLR0913
         coords=non_dim_coords,
     )
 
-    xr_dataset.trace_mask.data[:] = grid.live_mask
+    xr_dataset = _add_segy_file_headers(xr_dataset, segy_file)
 
+    xr_dataset.trace_mask.data[:] = grid.live_mask
     # IMPORTANT: Do not drop the "trace_mask" here, as it will be used later in
     # blocked_io.to_zarr() -> _workers.trace_worker()
 
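The new _add_segy_file_headers uses a plain xarray pattern: a dimensionless (scalar) variable whose attrs carry the file headers. A standalone sketch of that pattern with toy values (the header contents here are illustrative, not real SEG-Y metadata):

import xarray as xr

ds = xr.Dataset({"amplitude": (("inline", "crossline"), [[0.0, 1.0], [2.0, 3.0]])})

# A scalar (dimensionless) variable; its attrs hold the SEG-Y file headers.
ds["segy_file_header"] = ((), "")
ds["segy_file_header"].attrs.update(
    {
        "textHeader": "C 1 TOY HEADER",  # toy value; real value is 40 rows x 80 cols
        "binaryHeader": {"samples_per_trace": 1501, "sample_interval": 2000},  # toy values
    }
)

print(ds["segy_file_header"].attrs["binaryHeader"]["samples_per_trace"])

Because attrs live on the underlying Variable, updates made through ds["segy_file_header"].attrs persist on the Dataset, which is exactly what the ingestion code above relies on.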

src/mdio/segy/compat.py

Lines changed: 3 additions & 10 deletions
@@ -95,29 +95,22 @@ def mdio_segy_spec(version_str: str | None = None) -> SegySpec:
     )
 
 
-def revision_encode(binary_header: dict, version_str: str) -> dict:
+def encode_segy_revision(binary_header: dict) -> dict:
     """Encode revision code to binary header.
 
-    We have two cases where legacy MDIO uses keys "SEGYRevision" and "SEGYRevisionMinor" whereas
-    the new one uses "segy_revision_major" and "segy_revision_minor". Given either case we return
-    the correctly Rev1 like encoded revision code, ready to write to SEG-Y.
+    Return the correctly Rev1-like encoded revision code, ready to write to SEG-Y.
 
     Args:
         binary_header: Dictionary representing the SEG-Y binary header. Contains keys for major
             and minor revision numbers.
-        version_str: MDIO version string to determine the encoding format.
 
     Returns:
         The updated binary header with the encoded revision.
 
     Raises:
         InvalidMDIOError: Raised when binary header in MDIO is broken.
     """
-    version_obj = version.parse(version_str)
-    if version_obj > version.parse("0.7.4"):
-        major_key, minor_key = "segy_revision_major", "segy_revision_minor"
-    else:  # MDIO <0.8
-        major_key, minor_key = "SEGYRevision", "SEGYRevisionMinor"
+    major_key, minor_key = "segy_revision_major", "segy_revision_minor"
 
     try:
         major = binary_header.pop(major_key)
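The body of encode_segy_revision beyond the key lookup is not part of this diff. As a rough, hypothetical illustration of what a Rev1-style encoding typically does (major revision in the high byte, minor in the low byte of one 16-bit field), not the library's actual implementation:

# Hypothetical standalone helper; names and the "segy_revision" field are
# illustrative, and the real function also raises InvalidMDIOError for broken headers.
def pack_segy_revision(major: int, minor: int) -> int:
    # SEG-Y Rev1+ packs the revision into a single 16-bit field:
    # major revision in the high byte, minor revision in the low byte.
    return (major << 8) | minor

binary_header = {"segy_revision_major": 1, "segy_revision_minor": 0}
major = binary_header.pop("segy_revision_major")
minor = binary_header.pop("segy_revision_minor")
binary_header["segy_revision"] = pack_segy_revision(major, minor)
assert binary_header["segy_revision"] == 256  # 0x0100 for SEG-Y revision 1.0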

src/mdio/segy/creation.py

Lines changed: 11 additions & 14 deletions
@@ -13,7 +13,7 @@
 from tqdm.auto import tqdm
 
 from mdio.api.io import open_mdio
-from mdio.segy.compat import revision_encode
+from mdio.segy.compat import encode_segy_revision
 
 if TYPE_CHECKING:
     import xarray as xr
@@ -25,9 +25,8 @@
 logger = logging.getLogger(__name__)
 
 
-def make_segy_factory(dataset: xr.Dataset, spec: SegySpec) -> SegyFactory:
+def make_segy_factory(spec: SegySpec, binary_header: dict[str, int]) -> SegyFactory:
     """Generate SEG-Y factory from MDIO metadata."""
-    binary_header = dataset.attrs["attributes"]["binaryHeader"]
     sample_interval = binary_header["sample_interval"]
     samples_per_trace = binary_header["samples_per_trace"]
     return SegyFactory(
@@ -63,22 +62,20 @@ def mdio_spec_to_segy(
         Opened Xarray Dataset for MDIO file and SegyFactory
     """
     dataset = open_mdio(input_path, chunks=new_chunks)
-    factory = make_segy_factory(dataset, spec=segy_spec)
 
-    attr = dataset.attrs["attributes"]
+    file_header = dataset["segy_file_header"]
+    text_header = file_header.attrs["textHeader"]
+    binary_header = file_header.attrs["binaryHeader"]
+    binary_header = encode_segy_revision(binary_header)
 
-    txt_header = attr["textHeader"]
-    text_str = "\n".join(txt_header)
-    text_bytes = factory.create_textual_header(text_str)
+    factory = make_segy_factory(spec=segy_spec, binary_header=binary_header)
 
-    bin_header = attr["binaryHeader"]
-    mdio_file_version = dataset.attrs["apiVersion"]
-    binary_header = revision_encode(bin_header, mdio_file_version)
-    bin_hdr_bytes = factory.create_binary_header(binary_header)
+    text_header_bytes = factory.create_textual_header(text_header)
+    binary_header_bytes = factory.create_binary_header(binary_header)
 
     with output_path.open(mode="wb") as fp:
-        fp.write(text_bytes)
-        fp.write(bin_hdr_bytes)
+        fp.write(text_header_bytes)
+        fp.write(binary_header_bytes)
 
     return dataset, factory
 
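With this refactor, make_segy_factory only needs the spec and the binary header dict, so export code no longer reaches into Dataset root attributes. A rough usage sketch under stated assumptions: the binary-header values below are illustrative, and the default spec from mdio_segy_spec() is assumed to be acceptable to the factory:

from mdio.segy.compat import encode_segy_revision, mdio_segy_spec
from mdio.segy.creation import make_segy_factory

# Illustrative values; in practice these come from
# ds["segy_file_header"].attrs["binaryHeader"].
binary_header = {
    "segy_revision_major": 1,
    "segy_revision_minor": 0,
    "sample_interval": 2000,     # assumed to be in microseconds
    "samples_per_trace": 1501,
}

binary_header = encode_segy_revision(binary_header)
factory = make_segy_factory(spec=mdio_segy_spec(), binary_header=binary_header)

# 40 rows of 80 characters, matching the shape the ingestion-side validation expects.
text_header = "\n".join(f"C{row:>2}".ljust(80) for row in range(1, 41))
text_bytes = factory.create_textual_header(text_header)
bin_bytes = factory.create_binary_header(binary_header)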

tests/integration/test_segy_import_export.py

Lines changed: 7 additions & 5 deletions
@@ -73,7 +73,7 @@ def test_import_4d_segy(  # noqa: PLR0913
 
     ds = open_mdio(zarr_tmp)
 
-    assert ds.attrs["attributes"]["binaryHeader"]["samples_per_trace"] == num_samples
+    assert ds["segy_file_header"].attrs["binaryHeader"]["samples_per_trace"] == num_samples
     assert ds.attrs["attributes"]["gridOverrides"] == grid_override
 
     assert npt.assert_array_equal(ds["shot_point"], shots)
@@ -120,7 +120,7 @@ def test_import_4d_segy(  # noqa: PLR0913
 
     ds = open_mdio(zarr_tmp)
 
-    assert ds.attrs["attributes"]["binaryHeader"]["samples_per_trace"] == num_samples
+    assert ds["segy_file_header"].attrs["binaryHeader"]["samples_per_trace"] == num_samples
     assert ds.attrs["attributes"].get("gridOverrides", None) == grid_override  # may not exist, so default=None
 
     xrt.assert_duckarray_equal(ds["shot_point"], shots)
@@ -261,13 +261,15 @@ def test_dataset_metadata(self, zarr_tmp: Path) -> None:
 
         attributes = ds.attrs["attributes"]
         assert attributes is not None
-        assert len(attributes) == 5
+        assert len(attributes) == 3
         # Validate all attributes provided by the abstract template
         assert attributes["defaultVariableName"] == "amplitude"
         assert attributes["surveyType"] == "3D"
         assert attributes["gatherType"] == "stacked"
-        assert attributes["textHeader"] == text_header_teapot_dome()
-        assert attributes["binaryHeader"] == binary_header_teapot_dome()
+
+        segy_file_header = ds["segy_file_header"]
+        assert segy_file_header.attrs["textHeader"] == text_header_teapot_dome()
+        assert segy_file_header.attrs["binaryHeader"] == binary_header_teapot_dome()
 
     def test_variable_metadata(self, zarr_tmp: Path) -> None:
         """Metadata reading tests."""

tests/integration/testing_data.py

Lines changed: 3 additions & 2 deletions
@@ -34,9 +34,9 @@ def custom_teapot_dome_segy_spec(keep_unaltered: bool) -> SegySpec:
     )
 
 
-def text_header_teapot_dome() -> list[str]:
+def text_header_teapot_dome() -> str:
     """Return the teapot dome expected text header."""
-    return [
+    header_rows = [
         "C 1 CLIENT: ROCKY MOUNTAIN OILFIELD TESTING CENTER ",
         "C 2 PROJECT: NAVAL PETROLEUM RESERVE #3 (TEAPOT DOME); NATRONA COUNTY, WYOMING ",
         "C 3 LINE: 3D ",
@@ -78,6 +78,7 @@ def text_header_teapot_dome() -> list[str]:
         "C39 (voice) 303.694.9629 (fax) 303.771.1646 ",
         "C40 END EBCDIC ",
     ]
+    return "\n".join(header_rows)
 
 
 def binary_header_teapot_dome() -> dict[str, int]:
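Because text_header_teapot_dome() now returns a single newline-joined string, a quick sanity check against the 40-row by 80-column shape that ingestion validates could look like this (assuming the fixture rows keep their original 80-character padding):

rows = text_header_teapot_dome().splitlines()
assert len(rows) == 40
assert all(len(row) == 80 for row in rows)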
