AutoShotWrap grid override (#338)

markspec · tasansal · web-flow · commit 40ed5e0e08e3 · 2024-03-10T13:49:22.000-05:00
* Added AutoShotWrap grid override. This allows ingestion indexed by gun by unwrapping shot_point within each shot_line.

* Add documentation to cover segy_to_mdio AutoShotWrap grid override.

* Resolve merge conflicts.

* Update Dockerfile to align with nox pipeline checks.

* Merge poetry.lock.

* Fix ingestion tests.

* Fix issues with tests.

* Switch to poetry.lock from main to resolve conflict.

* Add some more explanation on ShotGunGeometryType.

* Linting updates.

* Remove unused code.

* Update .devcontainer/Dockerfile

* Update tests/integration/conftest.py

* update long text syntax

* Fix linting.

---------

Co-authored-by: Altay Sansal <tasansal@users.noreply.github.com>
diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
@@ -1,16 +1,16 @@
-ARG PYTHON_VERSION=3.11
+ARG PYTHON_VERSION=3.12
 ARG LINUX_DISTRO=bookworm
 
 FROM mcr.microsoft.com/devcontainers/python:1-${PYTHON_VERSION}-${LINUX_DISTRO}
 
 # Install git for nox pre-commit
 RUN apt-get update \
-    && apt-get install -y --no-install-recommends  \
-    git \
-    && rm -rf /var/lib/apt/lists/*
+  && apt-get install -y --no-install-recommends  \
+  git \
+  && rm -rf /var/lib/apt/lists/*
 
 # Poetry
-ARG POETRY_VERSION="1.6.1"
+ARG POETRY_VERSION="1.8.2"
 RUN if [ "${POETRY_VERSION}" != "none" ]; then bash -c "umask 0002 && pip3 install poetry==${POETRY_VERSION}"; fi
 
 # Nox
diff --git a/src/mdio/converters/segy.py b/src/mdio/converters/segy.py
@@ -90,6 +90,8 @@ def grid_density_qc(grid: Grid, num_traces: int) -> None:
 
     logger.debug(f"Dimensions: {dims}")
     logger.debug(f"num_traces = {num_traces}")
+    logger.debug(f"grid_traces = {grid_traces}")
+    logger.debug(f"sparsity = {grid_traces / num_traces}")
 
     grid_sparsity_ratio_limit = os.getenv("MDIO__GRID__SPARSITY_RATIO_LIMIT", 10)
     try:
@@ -270,7 +272,7 @@ def segy_to_mdio(
         ...     mdio_path_or_buffer="s3://bucket/shot_file.mdio",
         ...     index_bytes=(17, 137, 13),
         ...     index_lengths=(4, 2, 4),
-        ...     index_names=("shot", "cable", "channel"),
+        ...     index_names=("shot_point", "cable", "channel"),
         ...     chunksize=(8, 2, 128, 1024),
         ...     grid_overrides={"ChannelWrap": True, "ChannelsPerCable": 800},
         ... )
@@ -283,6 +285,31 @@ def segy_to_mdio(
         >>>    grid_overrides={"AutoChannelWrap": True,
                                "AutoChannelTraceQC":  1000000}
 
+        For ingestion of pre-stack streamer data where the user needs to
+        access/index *common-channel gathers* (single gun), the following
+        strategy can be used to densely ingest while indexing on gun number:
+
+        >>> segy_to_mdio(
+        ...     segy_path="prefix/shot_file.segy",
+        ...     mdio_path_or_buffer="s3://bucket/shot_file.mdio",
+        ...     index_bytes=(133, 171, 17, 137, 13),
+        ...     index_lengths=(2, 2, 4, 2, 4),
+        ...     index_names=("shot_line", "gun", "shot_point", "cable", "channel"),
+        ...     chunksize=(1, 1, 8, 1, 128, 1024),
+        ...     grid_overrides={
+        ...         "AutoShotWrap": True,
+        ...         "AutoChannelWrap": True,
+        ...         "AutoChannelTraceQC":  1000000
+        ...     },
+        ... )
+
+        For AutoShotWrap and AutoChannelWrap to work, the user must provide
+        "shot_line", "gun", "shot_point", "cable", "channel" as index names.
+        To tune read performance, consider changing the chunksize to
+        (1, 1, 32, 1, 32, 2048) for good common-shot and common-channel
+        performance, or to (1, 1, 128, 1, 1, 2048) for common-channel
+        performance.
+
         For cases with no well-defined trace header for indexing a NonBinned
         grid override is provided.This creates the index and attributes an
         incrementing integer to the trace for the index based on first in first
diff --git a/src/mdio/segy/geometry.py b/src/mdio/segy/geometry.py
@@ -60,6 +60,28 @@ class StreamerShotGeometryType(Enum):
     C = auto()
 
 
+class ShotGunGeometryType(Enum):
+    r"""Shot geometry template types for multi-gun acquisition.
+
+    For shot lines with multiple guns, we can have two configurations for
+    numbering shot_point. The desired configuration is for the shot point
+    index of a given gun to be dense and unique (configuration A). Typically
+    the shot_point is unique for the line and therefore is not dense for
+    each gun (configuration B).
+
+    Configuration A:
+        Gun 1 ->         1------------------20
+        Gun 2 ->         1------------------20
+
+    Configuration B:
+        Gun 1 ->         1------------------39
+        Gun 2 ->         2------------------40
+
+    """
+    A = auto()
+    B = auto()
+
+
 def analyze_streamer_headers(
     index_headers: dict[str, npt.NDArray],
 ) -> tuple[npt.NDArray, npt.NDArray, npt.NDArray, StreamerShotGeometryType]:
@@ -91,6 +113,7 @@ def analyze_streamer_headers(
 
     # Check channel numbers do not overlap for case B
     geom_type = StreamerShotGeometryType.B
+
     for idx1, cable1 in enumerate(unique_cables):
         min_val1 = cable_chan_min[idx1]
         max_val1 = cable_chan_max[idx1]
@@ -124,13 +147,65 @@ def analyze_streamer_headers(
     return unique_cables, cable_chan_min, cable_chan_max, geom_type
 
 
+def analyze_shotlines_for_guns(
+    index_headers: dict[str, npt.NDArray],
+) -> tuple[npt.NDArray, npt.NDArray, ShotGunGeometryType]:
+    """Check input headers for SEG-Y input to help determine geometry of shots and guns.
+
+    This function reads the shot_line, gun and shot_point index headers and finds
+    the unique guns in each shot line to decide the shot point numbering scheme.
+
+    Args:
+        index_headers: dictionary mapping index header names to numpy arrays
+
+    Returns:
+        tuple of unique_shot_lines, unique_guns_in_shot_line, geom_type
+    """
+    # Find unique shot line and gun ids
+    unique_shot_lines = np.sort(np.unique(index_headers["shot_line"]))
+    unique_guns = np.sort(np.unique(index_headers["gun"]))
+    logger.info(f"unique_shot_lines: {unique_shot_lines}")
+    logger.info(f"unique_guns: {unique_guns}")
+
+    # Collect the unique guns found in each shot line
+    # unique_guns_in_shot_line = np.empty(unique_shot_lines.shape)
+    unique_guns_in_shot_line = dict()
+
+    geom_type = ShotGunGeometryType.B
+    # Check shot numbers are still unique if div/num_guns
+    for shot_line in unique_shot_lines:
+        shot_line_mask = index_headers["shot_line"] == shot_line
+        shot_current_sl = index_headers["shot_point"][shot_line_mask]
+        gun_current_sl = index_headers["gun"][shot_line_mask]
+
+        unique_guns_sl = np.sort(np.unique(gun_current_sl))
+        num_guns_sl = unique_guns_sl.shape[0]
+        # unique_guns_in_shot_line[idx] = list(unique_guns_sl)
+        unique_guns_in_shot_line[str(shot_line)] = list(unique_guns_sl)
+
+        for gun in unique_guns_sl:
+            gun_mask = gun_current_sl == gun
+            shots_current_sl_gun = shot_current_sl[gun_mask]
+            num_shots_sl = np.unique(shots_current_sl_gun).shape[0]
+            mod_shots = np.floor(shots_current_sl_gun / num_guns_sl)
+            if len(np.unique(mod_shots)) != num_shots_sl:
+                msg = (
+                    f"Shot line {shot_line} has {num_shots_sl} when using div by "
+                    f"{num_guns_sl} (num_guns) has  {np.unique(mod_shots)} unique mod shots."
+                )
+                logger.info(msg)
+                geom_type = ShotGunGeometryType.A
+                return unique_shot_lines, unique_guns_in_shot_line, geom_type
+    return unique_shot_lines, unique_guns_in_shot_line, geom_type
+
+
 def create_counter(
     depth: int,
     total_depth: int,
     unique_headers: dict[str, npt.NDArray],
     header_names: list[str],
 ):
-    """Helper funtion to create dictionary tree for counting trace key for auto index."""
+    """Helper function to create dictionary tree for counting trace key for auto index."""
     if depth == total_depth:
         return 0
 
@@ -490,6 +565,54 @@ def transform(
         return index_headers
 
 
+class AutoShotWrap(GridOverrideCommand):
+    """Automatically determine ShotGun acquisition type."""
+
+    required_keys = {"shot_line", "gun", "shot_point", "cable", "channel"}
+    required_parameters = None
+
+    def validate(
+        self,
+        index_headers: dict[str, npt.NDArray],
+        grid_overrides: dict[str, bool | int],
+    ) -> None:
+        """Validate if this transform should run on the type of data."""
+        self.check_required_keys(index_headers)
+        self.check_required_params(grid_overrides)
+
+    def transform(
+        self,
+        index_headers: dict[str, npt.NDArray],
+        grid_overrides: dict[str, bool | int],
+    ) -> dict[str, npt.NDArray]:
+        """Perform the grid transform."""
+        self.validate(index_headers, grid_overrides)
+
+        result = analyze_shotlines_for_guns(index_headers)
+        unique_shot_lines, unique_guns_in_shot_line, geom_type = result
+        logger.info(f"Ingesting dataset as shot type: {geom_type.name}")
+
+        # Find the maximum number of guns across all shot lines
+        max_num_guns = 1
+        for shot_line in unique_shot_lines:
+            logger.info(
+                f"shot_line: {shot_line} has guns: {unique_guns_in_shot_line[str(shot_line)]}"
+            )
+            num_guns = len(unique_guns_in_shot_line[str(shot_line)])
+            if num_guns > max_num_guns:
+                max_num_guns = num_guns
+
+        # This might be slow and potentially could be improved with a rewrite
+        # to prevent so many lookups
+        if geom_type == ShotGunGeometryType.B:
+            for shot_line in unique_shot_lines:
+                shot_line_idxs = np.where(index_headers["shot_line"][:] == shot_line)
+                index_headers["shot_point"][shot_line_idxs] = np.floor(
+                    index_headers["shot_point"][shot_line_idxs] / max_num_guns
+                )
+        return index_headers
+
+
 class GridOverrider:
     """Executor for grid overrides.
 
@@ -503,6 +626,7 @@ def __init__(self):
         """Define allowed overrides and parameters here."""
         self.commands = {
             "AutoChannelWrap": AutoChannelWrap(),
+            "AutoShotWrap": AutoShotWrap(),
             "CalculateCable": CalculateCable(),
             "ChannelWrap": ChannelWrap(),
             "NonBinned": NonBinned(),
diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py
@@ -18,6 +18,7 @@ def create_segy_mock_4d(
     shots: list,
     cables: list,
     receivers_per_cable: list,
+    guns: list | None = None,
     chan_header_type: StreamerShotGeometryType = StreamerShotGeometryType.A,
     index_receivers: bool = True,
 ) -> str:
@@ -55,20 +56,30 @@ def create_segy_mock_4d(
         index_receivers = False
 
     shot_headers = np.hstack([np.repeat(shot, total_chan) for shot in shots])
+
+    gun_per_shot = []
+    for shot in shots:
+        gun_per_shot.append(guns[(shot % len(guns))])
+    gun_headers = np.hstack([np.repeat(gun, total_chan) for gun in gun_per_shot])
+
     cable_headers = np.tile(cable_headers, shot_count)
     channel_headers = np.tile(channel_headers, shot_count)
 
     with segyio.create(segy_file, spec) as f:
         for trc_idx in range(trace_count):
             shot = shot_headers[trc_idx]
+            gun = gun_headers[trc_idx]
             cable = cable_headers[trc_idx]
             channel = channel_headers[trc_idx]
+            source_line = 1
 
             # offset is byte location 37 - offset 4 bytes
             # fldr is byte location 9 - shot 4 byte
             # ep is byte location 17 - shot 4 byte
             # stae is byte location 137 - cable 2 byte
             # tracf is byte location 13 - channel 4 byte
+            # grnors is byte location 171 - gun 2 bytes
+            # styp is byte location 133 - source_line 2 bytes
 
             if index_receivers:
                 f.header[trc_idx].update(
@@ -77,6 +88,8 @@ def create_segy_mock_4d(
                     ep=shot,
                     stae=cable,
                     tracf=channel,
+                    grnors=gun,
+                    styp=source_line,
                 )
             else:
                 f.header[trc_idx].update(
@@ -98,7 +111,8 @@ def create_segy_mock_4d(
 def segy_mock_4d_shots(fake_segy_tmp: str) -> dict[StreamerShotGeometryType, str]:
     """Generate mock 4D shot SEG-Y files."""
     num_samples = 25
-    shots = [2, 3, 5]
+    shots = [2, 3, 5, 6, 7, 8, 9]
+    guns = [1, 2]
     cables = [0, 101, 201, 301]
     receivers_per_cable = [1, 5, 7, 5]
 
@@ -112,6 +126,7 @@ def segy_mock_4d_shots(fake_segy_tmp: str) -> dict[StreamerShotGeometryType, str
             cables=cables,
             receivers_per_cable=receivers_per_cable,
             chan_header_type=chan_header_type,
+            guns=guns,
         )
 
     return segy_paths
diff --git a/tests/integration/test_segy_import_export.py b/tests/integration/test_segy_import_export.py