Skip to content

Commit 3e51745

Browse files
authored
Merge pull request #212 from TGSAI/enh/float_headers
Support more data types for parsing headers during ingestion.
2 parents 1078d82 + 0af9509 commit 3e51745

File tree

14 files changed

+266
-259
lines changed

14 files changed

+266
-259
lines changed

poetry.lock

Lines changed: 107 additions & 124 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,9 +35,9 @@ numba = ">=0.55.2,<1.0.0"
3535
psutil = "^5.9.1"
3636
distributed = {version = ">=2022.11.0", optional = true}
3737
bokeh = {version = "^2.4.3", optional = true}
38-
s3fs = {version = ">=2022.7.0", optional = true}
39-
gcsfs = {version = ">=2022.7.0", optional = true}
40-
adlfs = {version = ">=2022.7.0", optional = true}
38+
s3fs = {version = ">=2023.5.0", optional = true}
39+
gcsfs = {version = ">=2023.5.0", optional = true}
40+
adlfs = {version = ">=2023.4.0", optional = true}
4141
zfpy = {version = "^0.5.5", optional = true}
4242

4343
[tool.poetry.extras]

src/mdio/commands/segy.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -83,11 +83,11 @@
8383
type=click_params.IntListParamType(),
8484
)
8585
@click.option(
86-
"-len",
87-
"--header-lengths",
86+
"-types",
87+
"--header-types",
8888
required=False,
89-
help="Byte lengths of the index attributes in SEG-Y trace header.",
90-
type=click_params.IntListParamType(),
89+
help="Data types of the index attributes in SEG-Y trace header.",
90+
type=click_params.StringListParamType(),
9191
)
9292
@click.option(
9393
"-names",
@@ -151,7 +151,7 @@ def segy_import(
151151
input_segy_path,
152152
output_mdio_file,
153153
header_locations,
154-
header_lengths,
154+
header_types,
155155
header_names,
156156
chunk_size,
157157
endian,
@@ -266,7 +266,7 @@ def segy_import(
266266
segy_path=input_segy_path,
267267
mdio_path_or_buffer=output_mdio_file,
268268
index_bytes=header_locations,
269-
index_lengths=header_lengths,
269+
index_types=header_types,
270270
index_names=header_names,
271271
chunksize=chunk_size,
272272
endian=endian,

src/mdio/converters/segy.py

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from mdio.core import Grid
1919
from mdio.core.utils_write import write_attribute
2020
from mdio.segy import blocked_io
21+
from mdio.segy.byte_utils import Dtype
2122
from mdio.segy.helpers_segy import create_zarr_hierarchy
2223
from mdio.segy.parsers import parse_binary_header
2324
from mdio.segy.parsers import parse_text_header
@@ -32,12 +33,31 @@
3233
BACKENDS = ["s3", "gcs", "gs", "az", "abfs"]
3334

3435

36+
def parse_index_types(
37+
str_types: Sequence[str] | None, num_index: int
38+
) -> Sequence[Dtype]:
39+
"""Convert string type keys to Dtype enums."""
40+
if str_types is None:
41+
parsed_types = [Dtype.INT32] * num_index
42+
else:
43+
try:
44+
parsed_types = [Dtype[_type.upper()] for _type in str_types]
45+
except KeyError as exc:
46+
msg = (
47+
"Unsupported header data-type. 'index_types' must be in "
48+
f"{list(Dtype.__members__.keys())}"
49+
)
50+
raise KeyError(msg) from exc
51+
52+
return parsed_types
53+
54+
3555
def segy_to_mdio(
3656
segy_path: str,
3757
mdio_path_or_buffer: str,
3858
index_bytes: Sequence[int],
3959
index_names: Sequence[str] | None = None,
40-
index_lengths: Sequence[int] | None = None,
60+
index_types: Sequence[str] | None = None,
4161
chunksize: Sequence[int] | None = None,
4262
endian: str = "big",
4363
lossless: bool = True,
@@ -83,8 +103,9 @@ def segy_to_mdio(
83103
mdio_path_or_buffer: Output path for MDIO file
84104
index_bytes: Tuple of the byte location for the index attributes
85105
index_names: Tuple of the index names for the index attributes
86-
index_lengths: Tuple of the byte lengths for the index attributes
87-
Default is 4-byte for each index key.
106+
index_types: Tuple of the data-types for the index attributes.
107+
Must be in {"int16", "int32", "float16", "float32", "ibm32"}
108+
Default is 4-byte integers for each index key.
88109
chunksize : Override default chunk size, which is (64, 64, 64) if
89110
3D, and (512, 512) for 2D.
90111
endian: Endianness of the input SEG-Y. Rev.2 allows little endian.
@@ -221,12 +242,14 @@ def segy_to_mdio(
221242
binary_header = parse_binary_header(segy_handle)
222243
num_traces = segy_handle.tracecount
223244

245+
index_types = parse_index_types(index_types, num_index)
246+
224247
dimensions, index_headers = get_grid_plan(
225248
segy_path=segy_path,
226249
segy_endian=endian,
227250
index_bytes=index_bytes,
228251
index_names=index_names,
229-
index_lengths=index_lengths,
252+
index_types=index_types,
230253
binary_header=binary_header,
231254
return_headers=True,
232255
grid_overrides=grid_overrides,

src/mdio/core/grid.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -85,9 +85,8 @@ def build_map(self, index_headers):
8585
index_headers: Headers to be normalized (indexed)
8686
"""
8787
live_dim_indices = tuple()
88-
89-
# TODO: Add strict=True and remove noqa when minimum Python is 3.10
90-
for dim, dim_hdr in zip(self.dims, index_headers.T): # noqa: B905
88+
for dim in self.dims[:-1]:
89+
dim_hdr = index_headers[dim.name]
9190
live_dim_indices += (np.searchsorted(dim, dim_hdr),)
9291

9392
# We set dead traces to uint32 max. Should be far away from actual trace counts.

src/mdio/segy/_workers.py

Lines changed: 39 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -14,15 +14,18 @@
1414
from mdio.constants import UINT32_MAX
1515
from mdio.core import Grid
1616
from mdio.segy.byte_utils import ByteOrder
17+
from mdio.segy.byte_utils import Dtype
18+
from mdio.segy.ibm_float import ibm2ieee
1719

1820

1921
def header_scan_worker(
2022
segy_path_or_handle: str | segyio.SegyFile,
2123
trace_range: Sequence[int],
2224
byte_locs: Sequence[int],
23-
byte_lengths: Sequence[int],
25+
byte_types: Sequence[Dtype],
26+
index_names: Sequence[str],
2427
segy_endian: str,
25-
) -> ArrayLike:
28+
) -> dict[str, ArrayLike]:
2629
"""Header scan worker.
2730
2831
Can accept file path or segyio.SegyFile.
@@ -36,9 +39,9 @@ def header_scan_worker(
3639
Args:
3740
segy_path_or_handle: Path or handle to the input SEG-Y file
3841
byte_locs: Byte locations to return. It will be a subset of the headers.
39-
byte_lengths: Tuple consisting of the byte lengths for the index
40-
attributes. None sets it to 4 per index
42+
byte_types: Tuple consisting of the data types for the index attributes.
4143
trace_range: Tuple consisting of the trace ranges to read
44+
index_names: Tuple of the names for the index attributes
4245
segy_endian: Endianness of the input SEG-Y. Rev.2 allows little endian
4346
4447
Returns:
@@ -77,14 +80,14 @@ def header_scan_worker(
7780
# Pads the rest of the data with voids.
7881
endian = ByteOrder[segy_endian.upper()]
7982

80-
# Handle byte locations and word lengths that are not specified for numpy struct
81-
lengths = [4 if length is None else length for length in byte_lengths]
83+
# Handle byte offsets
8284
offsets = [0 if byte_loc is None else byte_loc - 1 for byte_loc in byte_locs]
85+
formats = [type_.numpy_dtype.newbyteorder(endian) for type_ in byte_types]
8386

8487
struct_dtype = np.dtype(
8588
{
86-
"names": [f"dim_{idx}" for idx in range(len(byte_locs))],
87-
"formats": [endian + "i" + str(length) for length in lengths],
89+
"names": index_names,
90+
"formats": formats,
8891
"offsets": offsets,
8992
"itemsize": 240,
9093
}
@@ -95,17 +98,37 @@ def header_scan_worker(
9598
block_headers = b"".join([trace_headers.buf for trace_headers in block_headers])
9699
n_traces = stop - start
97100
block_headers = np.frombuffer(block_headers, struct_dtype, count=n_traces)
98-
block_headers = [block_headers[dim] for dim in block_headers.dtype.names]
101+
block_headers = {name: block_headers[name] for name in index_names}
99102

100-
block_headers = np.column_stack(block_headers)
103+
out_dtype = []
104+
for name, type_ in zip(index_names, byte_types): # noqa: B905
105+
if type_ == Dtype.IBM32:
106+
native_dtype = Dtype.FLOAT32.numpy_dtype
107+
else:
108+
native_dtype = type_.numpy_dtype
101109

102-
if None in byte_locs:
103-
# Zero out the junk we read for `None` byte locations.
104-
# We could have multiple None values.
105-
none_idx = tuple(i for i, val in enumerate(byte_locs) if val is None)
106-
block_headers[:, none_idx] = 0
110+
out_dtype.append((name, native_dtype))
107111

108-
return block_headers
112+
out_array = np.empty(n_traces, out_dtype)
113+
114+
# TODO: Add strict=True and remove noqa when minimum Python is 3.10
115+
for name, loc, type_ in zip(index_names, byte_locs, byte_types): # noqa: B905
116+
# Handle exception when a byte_loc is None
117+
if loc is None:
118+
out_array[name] = 0
119+
del block_headers[name]
120+
continue
121+
122+
header = block_headers[name]
123+
124+
if type_ == Dtype.IBM32:
125+
header = ibm2ieee(header)
126+
127+
out_array[name] = header
128+
129+
del block_headers[name]
130+
131+
return out_array
109132

110133

111134
def trace_worker(

src/mdio/segy/blocked_io.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -297,7 +297,6 @@ def to_segy(
297297
headers: Header array.
298298
live_mask: Live mask array.
299299
out_dtype: Desired type of output samples.
300-
out_dtype: Desired output data type.
301300
out_byteorder: Desired output data byte order.
302301
file_root: Root directory to write partial SEG-Y files.
303302
axis: Which axes to merge on. Excluding sample axis.

src/mdio/segy/byte_utils.py

Lines changed: 20 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -9,22 +9,27 @@
99
from numpy.typing import NDArray
1010

1111

12-
class Dtype(str, Enum):
12+
class Dtype(Enum):
1313
"""Dtype string to Numpy format enum."""
1414

15-
STRING = "S"
16-
UINT8 = "u1"
17-
UINT16 = "u2"
18-
UINT32 = "u4"
19-
UINT64 = "u8"
20-
INT8 = "i1"
21-
INT16 = "i2"
22-
INT32 = "i4"
23-
INT64 = "i8"
24-
FLOAT16 = "f2"
25-
FLOAT32 = "f4"
26-
FLOAT64 = "f8"
27-
IBM32 = "u4"
15+
STRING = ("STRING", "S")
16+
UINT8 = ("UINT8", "u1")
17+
UINT16 = ("UINT16", "u2")
18+
UINT32 = ("UINT32", "u4")
19+
UINT64 = ("UINT64", "u8")
20+
INT8 = ("INT8", "i1")
21+
INT16 = ("INT16", "i2")
22+
INT32 = ("INT32", "i4")
23+
INT64 = ("INT64", "i8")
24+
FLOAT16 = ("FLOAT16", "f2")
25+
FLOAT32 = ("FLOAT32", "f4")
26+
FLOAT64 = ("FLOAT64", "f8")
27+
IBM32 = ("IBM32", "u4")
28+
29+
@property
30+
def numpy_dtype(self):
31+
"""Return a numpy dtype of the Enum."""
32+
return np.dtype(self.value[1])
2833

2934

3035
class ByteOrder(str, Enum):
@@ -59,7 +64,7 @@ def __len__(self) -> int:
5964
@property
6065
def dtype(self):
6166
"""Return Numpy dtype of the struct."""
62-
return np.dtype(self.endian + self.type)
67+
return np.dtype(self.endian + self.type.numpy_dtype)
6368

6469
def byteswap(self):
6570
"""Swap endianness in place."""

src/mdio/segy/creation.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -240,7 +240,7 @@ def cast_sample_format(
240240
samples = samples.astype("float32", copy=False)
241241
samples = ieee2ibm(samples)
242242
else:
243-
samples = samples.astype(out_dtype, copy=False)
243+
samples = samples.astype(out_dtype.numpy_dtype, copy=False)
244244

245245
return samples
246246

src/mdio/segy/helpers_segy.py

Lines changed: 0 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
"""Helper functions for tinkering with SEG-Y related Zarr."""
22

33

4-
from math import prod
5-
64
from zarr import Group
75
from zarr import open_group
86
from zarr.errors import ContainsGroupError
@@ -37,57 +35,3 @@ def create_zarr_hierarchy(store: FSStore, overwrite: bool) -> Group:
3735
raise MDIOAlreadyExistsError(msg) from e
3836

3937
return root_group
40-
41-
42-
# TODO: This is not used right now, but it is a template for what we can do for
43-
# automatic chunk size determination based on shape of the arrays etc.
44-
def infer_header_chunksize(orig_chunks, orig_shape, target_size=2**26, length=240):
45-
"""Infer larger chunks based on target chunk filesize.
46-
47-
This tool takes an original chunking scheme, the full shape of the
48-
original array, a target size (in bytes) and length of the `array` or
49-
`struct` to calculate a multidimensional scalar for smaller arrays.
50-
51-
Use case is: Seismic data has 1 extra time/depth dimension, which doesn't
52-
exist in headers or spatial live mask. So we can make chunk size bigger
53-
for these flatter arrays.
54-
55-
This module infers a scalar based on the parameters and returns a new
56-
chunking scheme.
57-
58-
Args:
59-
orig_chunks: Original array chunks.
60-
orig_shape: Original array shape.
61-
target_size: Uncompressed, expected size of each chunk. This is much
62-
larger than the ideal 1MB because on metadata, after compression,
63-
the size goes down by 10x. Default: 32 MB.
64-
length: Length of the multidimensional array's dtype.
65-
Default is 240-bytes.
66-
67-
Returns:
68-
Tuple of adjusted chunk sizes.
69-
"""
70-
orig_bytes = prod(orig_chunks) * length
71-
72-
# Size scalar in bytes
73-
scalar = target_size / orig_bytes
74-
75-
# Divide than into chunks (root of the scalar based on length of dims)
76-
# Then round it to the nearest integer.
77-
scalar = round(scalar ** (1 / len(orig_chunks)))
78-
79-
# Scale chunks by inferred isotropic scalar.
80-
new_chunks = [dim_chunk * scalar for dim_chunk in orig_chunks]
81-
82-
# Set it to max if after scaling, it is larger than the max values.
83-
new_chunks = [
84-
min(dim_new, dim_orig)
85-
for dim_new, dim_orig in zip(new_chunks, orig_shape) # noqa: B905
86-
]
87-
88-
# Special case if the new_chunks are larger than 80% the original shape.
89-
# In this case we want one chunk.
90-
if prod(new_chunks) > 0.8 * prod(orig_shape):
91-
new_chunks = orig_shape
92-
93-
return new_chunks

0 commit comments

Comments
 (0)