
Commit 5262eed

Retain coordinates from segy header scan
1 parent a1e27ec commit 5262eed

File tree

1 file changed: +27 -17 lines changed

src/mdio/converters/segy.py

Lines changed: 27 additions & 17 deletions
@@ -659,6 +659,8 @@ def segy_to_mdio_schematized(
 
 ds = MDIO.open(mdio_path_or_buffer) # Reopen because we needed to do some weird stuff (hacky)
 
+seismic_coords = [name for name in ds.seismic.coords if name not in ds.seismic.dims]
+
 try:
     dims = get_dims(ds, segy_schema)
 except ValueError as e:
@@ -686,11 +688,8 @@ def segy_to_mdio_schematized(
 storage_options_input = storage_options_input or {}
 storage_options_output = storage_options_output or {}
 
-mdio_spec = (
-    mdio_segy_spec()
-) # TODO: I think this may need to be updated to work with our new input schemas
+mdio_spec = mdio_segy_spec()
 segy_settings = SegySettings(storage_options=storage_options_input)
-# segy = SegyFile(url=segy_path, spec=mdio_spec, settings=segy_settings)
 segy = SegyFile(url=segy_schema["path"], spec=mdio_spec, settings=segy_settings)
 
 text_header = segy.text_header
@@ -701,30 +700,49 @@ def segy_to_mdio_schematized(
 index_fields = []
 for name, byte, format_ in zip(index_names, index_bytes, index_types, strict=True):
     index_fields.append(HeaderField(name=name, byte=byte, format=format_))
+
+# Add the coordinates as pseudo-index_fields.
+# This allows us to sneakily grab them from the grid builder (I hope)
+# First, we need to extract the index bytes and index types from the segy_schema.
+coord_index_names = []
+coord_index_bytes = []
+coord_index_types = []
+for coord in segy_schema["trace"]["header_entries"]:
+    if coord["name"] in seismic_coords:
+        coord_index_names.append(coord["name"])
+        coord_index_bytes.append(coord["byte_start"])
+        coord_index_types.append(coord["format"])
+
+for newName, newByte, newFormat in zip(coord_index_names, coord_index_bytes, coord_index_types, strict=True):
+    index_fields.append(HeaderField(name=newName, byte=newByte, format=newFormat))
+
 mdio_spec_grid = mdio_spec.customize(trace_header_fields=index_fields)
 segy_grid = SegyFile(url=segy_schema["path"], spec=mdio_spec_grid, settings=segy_settings)
-
 dimensions, chunksize, index_headers = get_grid_plan(
     segy_file=segy_grid,
     return_headers=True,
     chunksize=chunksize,
     grid_overrides=grid_overrides,
 )
+# Override the "sample" dimension name
 dimensions[-1].name = get_sample_name(ds, dimensions)
-# print(dimensions)
+
+ds_coords = [dim for dim in dimensions if dim.name in seismic_coords]
+dimensions = [dim for dim in dimensions if dim.name not in seismic_coords]
+
 grid = Grid(dims=dimensions)
 grid_density_qc(grid, num_traces)
 grid.build_map(index_headers)
 
-# Override the "sample" dimension name
+# print(f"dimensions: {dimensions}")
+# print(f"chunksize: {chunksize}")
+# print(f"index_headers: {index_headers}")
 
 # Set dimension coordinates
 new_coords = {dim.name: dim.coords for dim in dimensions}
 ds = ds.assign_coords(new_coords)
 ds.to_mdio(store=mdio_path_or_buffer, mode="r+")
 
-# Set all coordinates which are not dimensions, root Variables, or live_mask
-
 # Check grid validity by ensuring every trace's header-index is within dimension bounds
 valid_mask = np.ones(grid.num_traces, dtype=bool)
 for d_idx in range(len(grid.header_index_arrays)):
@@ -744,8 +762,6 @@ def segy_to_mdio_schematized(
 del valid_mask
 gc.collect()
 
-coords = ds.seismic.coords # TODO: We also need to iterate over the coords and assign their values in parallel with the live_mask
-
 # live_mask_array = ds.live_mask # TODO: Make this more robust
 live_mask_array = ds.trace_mask
 from mdio.core.v1._overloads import MDIODataArray
@@ -756,8 +772,6 @@ def segy_to_mdio_schematized(
 
 chunker = ChunkIterator(live_mask_array, chunk_samples=True)
 for chunk_indices in chunker:
-    # print(f"chunk_indices: {chunk_indices}")
-    # chunk_indices is a tuple of N-1 slice objects
     trace_ids = grid.get_traces_for_chunk(chunk_indices)
     if trace_ids.size == 0:
         # Free memory immediately for empty chunks
@@ -782,7 +796,6 @@ def segy_to_mdio_schematized(
 if local_idx.dtype != np.intp:
     local_idx = local_idx.astype(np.intp)
 local_coords.append(local_idx)
-# local_idx is now owned by local_coords list, safe to continue
 
 # Free trace_ids as soon as we're done with it
 del trace_ids
@@ -794,13 +807,10 @@ def segy_to_mdio_schematized(
 del local_coords
 
 # Write the entire block to Zarr at once
-# live_mask_array.isel(chunk_indices).values[:] = block
 live_mask_array[chunk_indices] = block
 
 # Free block immediately after writing
 del block
-
-# Force garbage collection periodically to free memory aggressively
 gc.collect()
 
 # Save the entire dataset to persist the live_mask changes
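For reference, here is a minimal standalone sketch of the coordinate-retention pattern this commit adds: non-dimension coordinate names are matched against the SEG-Y schema's trace header entries and appended as extra ("pseudo-index") header fields so their values are captured during the same grid-building header scan. The names `segy_schema`, `seismic_coords`, and `HeaderField` come from the diff above, but the dataclass, the helper function, and the byte offsets/formats below are simplified, hypothetical stand-ins rather than the project's real classes or values.

# Sketch only: simplified stand-ins for illustration, not mdio's actual API.
from dataclasses import dataclass


@dataclass
class HeaderField:  # stand-in for the real trace-header field spec
    name: str
    byte: int
    format: str


def build_coord_header_fields(segy_schema: dict, seismic_coords: list[str]) -> list[HeaderField]:
    """Collect non-dimension coordinates as extra trace-header fields.

    Appending these fields to the grid-scan spec means their values are read
    in the same header pass that builds the grid, so they can be retained
    instead of being discarded after the scan.
    """
    fields = []
    for entry in segy_schema["trace"]["header_entries"]:
        if entry["name"] in seismic_coords:
            fields.append(
                HeaderField(name=entry["name"], byte=entry["byte_start"], format=entry["format"])
            )
    return fields


# Example with hypothetical header entries (byte offsets and formats are illustrative only):
schema = {
    "trace": {
        "header_entries": [
            {"name": "cdp_x", "byte_start": 181, "format": "int32"},
            {"name": "cdp_y", "byte_start": 185, "format": "int32"},
            {"name": "inline", "byte_start": 189, "format": "int32"},
        ]
    }
}
print(build_coord_header_fields(schema, seismic_coords=["cdp_x", "cdp_y"]))

The diff then splits the matching entries out of `dimensions` into `ds_coords` before the `Grid` is built, so these coordinates no longer shape the grid while their scanned values remain available.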
