Skip to content

Commit 3e51745

Browse files
authored
Merge pull request #212 from TGSAI/enh/float_headers
Support more data types for parsing headers during ingestion.
2 parents 1078d82 + 0af9509 commit 3e51745

File tree

14 files changed

+266
-259
lines changed

14 files changed

+266
-259
lines changed

poetry.lock

Lines changed: 107 additions & 124 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,9 +35,9 @@ numba = ">=0.55.2,<1.0.0"
3535
psutil = "^5.9.1"
3636
distributed = {version = ">=2022.11.0", optional = true}
3737
bokeh = {version = "^2.4.3", optional = true}
38-
s3fs = {version = ">=2022.7.0", optional = true}
39-
gcsfs = {version = ">=2022.7.0", optional = true}
40-
adlfs = {version = ">=2022.7.0", optional = true}
38+
s3fs = {version = ">=2023.5.0", optional = true}
39+
gcsfs = {version = ">=2023.5.0", optional = true}
40+
adlfs = {version = ">=2023.4.0", optional = true}
4141
zfpy = {version = "^0.5.5", optional = true}
4242

4343
[tool.poetry.extras]

src/mdio/commands/segy.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -83,11 +83,11 @@
8383
type=click_params.IntListParamType(),
8484
)
8585
@click.option(
86-
"-len",
87-
"--header-lengths",
86+
"-types",
87+
"--header-types",
8888
required=False,
89-
help="Byte lengths of the index attributes in SEG-Y trace header.",
90-
type=click_params.IntListParamType(),
89+
help="Data types of the index attributes in SEG-Y trace header.",
90+
type=click_params.StringListParamType(),
9191
)
9292
@click.option(
9393
"-names",
@@ -151,7 +151,7 @@ def segy_import(
151151
input_segy_path,
152152
output_mdio_file,
153153
header_locations,
154-
header_lengths,
154+
header_types,
155155
header_names,
156156
chunk_size,
157157
endian,
@@ -266,7 +266,7 @@ def segy_import(
266266
segy_path=input_segy_path,
267267
mdio_path_or_buffer=output_mdio_file,
268268
index_bytes=header_locations,
269-
index_lengths=header_lengths,
269+
index_types=header_types,
270270
index_names=header_names,
271271
chunksize=chunk_size,
272272
endian=endian,

src/mdio/converters/segy.py

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from mdio.core import Grid
1919
from mdio.core.utils_write import write_attribute
2020
from mdio.segy import blocked_io
21+
from mdio.segy.byte_utils import Dtype
2122
from mdio.segy.helpers_segy import create_zarr_hierarchy
2223
from mdio.segy.parsers import parse_binary_header
2324
from mdio.segy.parsers import parse_text_header
@@ -32,12 +33,31 @@
3233
BACKENDS = ["s3", "gcs", "gs", "az", "abfs"]
3334

3435

36+
def parse_index_types(
37+
str_types: Sequence[str] | None, num_index: int
38+
) -> Sequence[Dtype]:
39+
"""Convert string type keys to Dtype enums."""
40+
if str_types is None:
41+
parsed_types = [Dtype.INT32] * num_index
42+
else:
43+
try:
44+
parsed_types = [Dtype[_type.upper()] for _type in str_types]
45+
except KeyError as exc:
46+
msg = (
47+
"Unsupported header data-type. 'index_types' must be in "
48+
f"{list(Dtype.__members__.keys())}"
49+
)
50+
raise KeyError(msg) from exc
51+
52+
return parsed_types
53+
54+
3555
def segy_to_mdio(
3656
segy_path: str,
3757
mdio_path_or_buffer: str,
3858
index_bytes: Sequence[int],
3959
index_names: Sequence[str] | None = None,
40-
index_lengths: Sequence[int] | None = None,
60+
index_types: Sequence[str] | None = None,
4161
chunksize: Sequence[int] | None = None,
4262
endian: str = "big",
4363
lossless: bool = True,
@@ -83,8 +103,9 @@ def segy_to_mdio(
83103
mdio_path_or_buffer: Output path for MDIO file
84104
index_bytes: Tuple of the byte location for the index attributes
85105
index_names: Tuple of the index names for the index attributes
86-
index_lengths: Tuple of the byte lengths for the index attributes
87-
Default is 4-byte for each index key.
106+
index_types: Tuple of the data-types for the index attributes.
107+
Must be in {"int16", "int32", "float16", "float32", "ibm32"}
108+
Default is 4-byte integers for each index key.
88109
chunksize : Override default chunk size, which is (64, 64, 64) if
89110
3D, and (512, 512) for 2D.
90111
endian: Endianness of the input SEG-Y. Rev.2 allows little endian.
@@ -221,12 +242,14 @@ def segy_to_mdio(
221242
binary_header = parse_binary_header(segy_handle)
222243
num_traces = segy_handle.tracecount
223244

245+
index_types = parse_index_types(index_types, num_index)
246+
224247
dimensions, index_headers = get_grid_plan(
225248
segy_path=segy_path,
226249
segy_endian=endian,
227250
index_bytes=index_bytes,
228251
index_names=index_names,
229-
index_lengths=index_lengths,
252+
index_types=index_types,
230253
binary_header=binary_header,
231254
return_headers=True,
232255
grid_overrides=grid_overrides,

src/mdio/core/grid.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -85,9 +85,8 @@ def build_map(self, index_headers):
8585
index_headers: Headers to be normalized (indexed)
8686
"""
8787
live_dim_indices = tuple()
88-
89-
# TODO: Add strict=True and remove noqa when minimum Python is 3.10
90-
for dim, dim_hdr in zip(self.dims, index_headers.T): # noqa: B905
88+
for dim in self.dims[:-1]:
89+
dim_hdr = index_headers[dim.name]
9190
live_dim_indices += (np.searchsorted(dim, dim_hdr),)
9291

9392
# We set dead traces to uint32 max. Should be far away from actual trace counts.

src/mdio/segy/_workers.py

Lines changed: 39 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -14,15 +14,18 @@
1414
from mdio.constants import UINT32_MAX
1515
from mdio.core import Grid
1616
from mdio.segy.byte_utils import ByteOrder
17+
from mdio.segy.byte_utils import Dtype
18+
from mdio.segy.ibm_float import ibm2ieee
1719

1820

1921
def header_scan_worker(
2022
segy_path_or_handle: str | segyio.SegyFile,
2123
trace_range: Sequence[int],
2224
byte_locs: Sequence[int],
23-
byte_lengths: Sequence[int],
25+
byte_types: Sequence[Dtype],
26+
index_names: Sequence[str],
2427
segy_endian: str,
25-
) -> ArrayLike:
28+
) -> dict[str, ArrayLike]:
2629
"""Header scan worker.
2730
2831
Can accept file path or segyio.SegyFile.
@@ -36,9 +39,9 @@ def header_scan_worker(
3639
Args:
3740
segy_path_or_handle: Path or handle to the input SEG-Y file
3841
byte_locs: Byte locations to return. It will be a subset of the headers.
39-
byte_lengths: Tuple consisting of the byte lengths for the index
40-
attributes. None sets it to 4 per index
42+
byte_types: Tuple consisting of the data types for the index attributes.
4143
trace_range: Tuple consisting of the trace ranges to read
44+
index_names: Tuple of the names for the index attributes
4245
segy_endian: Endianness of the input SEG-Y. Rev.2 allows little endian
4346
4447
Returns:
@@ -77,14 +80,14 @@ def header_scan_worker(
7780
# Pads the rest of the data with voids.
7881
endian = ByteOrder[segy_endian.upper()]
7982

80-
# Handle byte locations and word lengths that are not specified for numpy struct
81-
lengths = [4 if length is None else length for length in byte_lengths]
83+
# Handle byte offsets
8284
offsets = [0 if byte_loc is None else byte_loc - 1 for byte_loc in byte_locs]
85+
formats = [type_.numpy_dtype.newbyteorder(endian) for type_ in byte_types]
8386

8487
struct_dtype = np.dtype(
8588
{
86-
"names": [f"dim_{idx}" for idx in range(len(byte_locs))],
87-
"formats": [endian + "i" + str(length) for length in lengths],
89+
"names": index_names,
90+
"formats": formats,
8891
"offsets": offsets,
8992
"itemsize": 240,
9093
}
@@ -95,17 +98,37 @@ def header_scan_worker(
9598
block_headers = b"".join([trace_headers.buf for trace_headers in block_headers])
9699
n_traces = stop - start
97100
block_headers = np.frombuffer(block_headers, struct_dtype, count=n_traces)
98-
block_headers = [block_headers[dim] for dim in block_headers.dtype.names]
101+
block_headers = {name: block_headers[name] for name in index_names}
99102

100-
block_headers = np.column_stack(block_headers)
103+
out_dtype = []
104+
for name, type_ in zip(index_names, byte_types): # noqa: B905
105+
if type_ == Dtype.IBM32:
106+
native_dtype = Dtype.FLOAT32.numpy_dtype
107+
else:
108+
native_dtype = type_.numpy_dtype
101109

102-
if None in byte_locs:
103-
# Zero out the junk we read for `None` byte locations.
104-
# We could have multiple None values.
105-
none_idx = tuple(i for i, val in enumerate(byte_locs) if val is None)
106-
block_headers[:, none_idx] = 0
110+
out_dtype.append((name, native_dtype))
107111

108-
return block_headers
112+
out_array = np.empty(n_traces, out_dtype)
113+
114+
# TODO: Add strict=True and remove noqa when minimum Python is 3.10
115+
for name, loc, type_ in zip(index_names, byte_locs, byte_types): # noqa: B905
116+
# Handle exception when a byte_loc is None
117+
if loc is None:
118+
out_array[name] = 0
119+
del block_headers[name]
120+
continue
121+
122+
header = block_headers[name]
123+
124+
if type_ == Dtype.IBM32:
125+
header = ibm2ieee(header)
126+
127+
out_array[name] = header
128+
129+
del block_headers[name]
130+
131+
return out_array
109132

110133

111134
def trace_worker(

src/mdio/segy/blocked_io.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -297,7 +297,6 @@ def to_segy(
297297
headers: Header array.
298298
live_mask: Live mask array.
299299
out_dtype: Desired type of output samples.
300-
out_dtype: Desired output data type.
301300
out_byteorder: Desired output data byte order.
302301
file_root: Root directory to write partial SEG-Y files.
303302
axis: Which axes to merge on. Excluding sample axis.

src/mdio/segy/byte_utils.py

Lines changed: 20 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -9,22 +9,27 @@
99
from numpy.typing import NDArray
1010

1111

12-
class Dtype(str, Enum):
12+
class Dtype(Enum):
1313
"""Dtype string to Numpy format enum."""
1414

15-
STRING = "S"
16-
UINT8 = "u1"
17-
UINT16 = "u2"
18-
UINT32 = "u4"
19-
UINT64 = "u8"
20-
INT8 = "i1"
21-
INT16 = "i2"
22-
INT32 = "i4"
23-
INT64 = "i8"
24-
FLOAT16 = "f2"
25-
FLOAT32 = "f4"
26-
FLOAT64 = "f8"
27-
IBM32 = "u4"
15+
STRING = ("STRING", "S")
16+
UINT8 = ("UINT8", "u1")
17+
UINT16 = ("UINT16", "u2")
18+
UINT32 = ("UINT32", "u4")
19+
UINT64 = ("UINT64", "u8")
20+
INT8 = ("INT8", "i1")
21+
INT16 = ("INT16", "i2")
22+
INT32 = ("INT32", "i4")
23+
INT64 = ("INT64", "i8")
24+
FLOAT16 = ("FLOAT16", "f2")
25+
FLOAT32 = ("FLOAT32", "f4")
26+
FLOAT64 = ("FLOAT64", "f8")
27+
IBM32 = ("IBM32", "u4")
28+
29+
@property
30+
def numpy_dtype(self):
31+
"""Return a numpy dtype of the Enum."""
32+
return np.dtype(self.value[1])
2833

2934

3035
class ByteOrder(str, Enum):
@@ -59,7 +64,7 @@ def __len__(self) -> int:
5964
@property
6065
def dtype(self):
6166
"""Return Numpy dtype of the struct."""
62-
return np.dtype(self.endian + self.type)
67+
return np.dtype(self.endian + self.type.numpy_dtype)
6368

6469
def byteswap(self):
6570
"""Swap endianness in place."""

src/mdio/segy/creation.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -240,7 +240,7 @@ def cast_sample_format(
240240
samples = samples.astype("float32", copy=False)
241241
samples = ieee2ibm(samples)
242242
else:
243-
samples = samples.astype(out_dtype, copy=False)
243+
samples = samples.astype(out_dtype.numpy_dtype, copy=False)
244244

245245
return samples
246246

src/mdio/segy/helpers_segy.py

Lines changed: 0 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
"""Helper functions for tinkering with SEG-Y related Zarr."""
22

33

4-
from math import prod
5-
64
from zarr import Group
75
from zarr import open_group
86
from zarr.errors import ContainsGroupError
@@ -37,57 +35,3 @@ def create_zarr_hierarchy(store: FSStore, overwrite: bool) -> Group:
3735
raise MDIOAlreadyExistsError(msg) from e
3836

3937
return root_group
40-
41-
42-
# TODO: This is not used right now, but it is a template for what we can do for
43-
# automatic chunk size determination based on shape of the arrays etc.
44-
def infer_header_chunksize(orig_chunks, orig_shape, target_size=2**26, length=240):
45-
"""Infer larger chunks based on target chunk filesize.
46-
47-
This tool takes an original chunking scheme, the full shape of the
48-
original array, a target size (in bytes) and length of the `array` or
49-
`struct` to calculate a multidimensional scalar for smaller arrays.
50-
51-
Use case is: Seismic data has 1 extra time/depth dimension, which doesn't
52-
exist in headers or spatial live mask. So we can make chunk size bigger
53-
for these flatter arrays.
54-
55-
This module infers a scalar based on the parameters and returns a new
56-
chunking scheme.
57-
58-
Args:
59-
orig_chunks: Original array chunks.
60-
orig_shape: Original array shape.
61-
target_size: Uncompressed, expected size of each chunk. This is much
62-
larger than the ideal 1MB because on metadata, after compression,
63-
the size goes down by 10x. Default: 32 MB.
64-
length: Length of the multidimensional array's dtype.
65-
Default is 240-bytes.
66-
67-
Returns:
68-
Tuple of adjusted chunk sizes.
69-
"""
70-
orig_bytes = prod(orig_chunks) * length
71-
72-
# Size scalar in bytes
73-
scalar = target_size / orig_bytes
74-
75-
# Divide than into chunks (root of the scalar based on length of dims)
76-
# Then round it to the nearest integer.
77-
scalar = round(scalar ** (1 / len(orig_chunks)))
78-
79-
# Scale chunks by inferred isotropic scalar.
80-
new_chunks = [dim_chunk * scalar for dim_chunk in orig_chunks]
81-
82-
# Set it to max if after scaling, it is larger than the max values.
83-
new_chunks = [
84-
min(dim_new, dim_orig)
85-
for dim_new, dim_orig in zip(new_chunks, orig_shape) # noqa: B905
86-
]
87-
88-
# Special case if the new_chunks are larger than 80% the original shape.
89-
# In this case we want one chunk.
90-
if prod(new_chunks) > 0.8 * prod(orig_shape):
91-
new_chunks = orig_shape
92-
93-
return new_chunks

0 commit comments

Comments
 (0)