|
| 1 | +"""SEG-Y async file support and utilities.""" |
| 2 | + |
| 3 | +from __future__ import annotations |
| 4 | + |
| 5 | +import asyncio |
| 6 | +import atexit |
| 7 | +import logging |
| 8 | +import os |
| 9 | +import threading |
| 10 | +from dataclasses import dataclass |
| 11 | +from typing import TYPE_CHECKING |
| 12 | +from typing import TypedDict |
| 13 | + |
| 14 | +import fsspec |
| 15 | +from fsspec.asyn import AsyncFileSystem |
| 16 | +from fsspec.utils import get_protocol |
| 17 | +from segy import SegyFile |
| 18 | +from segy.config import SegyFileSettings |
| 19 | + |
| 20 | +from mdio.segy.scalar import _get_coordinate_scalar |
| 21 | + |
| 22 | +if TYPE_CHECKING: |
| 23 | + from pathlib import Path |
| 24 | + |
| 25 | + from numpy import int32 |
| 26 | + from numpy.typing import NDArray |
| 27 | + from segy.config import SegyHeaderOverrides |
| 28 | + from segy.schema.segy import SegySpec |
| 29 | + |
| 30 | + |
# Module-level logger; _stop_asyncio_loop uses it to warn when a loop thread does not stop.
logger = logging.getLogger(__name__)

# Timeout in seconds for stopping async event loop threads during cleanup.
# Passed as the ``timeout`` of ``Thread.join`` in _stop_asyncio_loop.
MDIO_ASYNCIO_THREAD_STOP_TIMEOUT = 5.0
| 35 | + |
| 36 | + |
class SegyFileArguments(TypedDict):
    """Keyword arguments used to create a SegyFile instance.

    The keys mirror the parameters of ``SegyFileWrapper.__init__``.
    """

    # Path or URL of the SEG-Y file; its protocol selects the fsspec filesystem class.
    url: Path | str
    # SEG-Y specification, or None (presumably lets SegyFile pick one; confirm in segy docs).
    spec: SegySpec | None
    # SEG-Y settings, or None; _start_asyncio_loop falls back to SegyFileSettings() when unset.
    settings: SegyFileSettings | None
    # SEG-Y header overrides, or None.
    header_overrides: SegyHeaderOverrides | None
| 44 | + |
| 45 | + |
@dataclass
class SegyFileInfo:
    """SEG-Y file header information.

    Instances are built by ``get_segy_file_info`` from an opened SEG-Y file.
    """

    # Value of the opened file's ``num_traces`` attribute.
    num_traces: int
    # Value of the opened file's ``sample_labels`` attribute.
    sample_labels: NDArray[int32]
    # Value of the opened file's ``text_header`` attribute.
    text_header: str
    # Parsed binary header, i.e. the result of ``binary_header.to_dict()``.
    binary_header_dict: dict
    # Unparsed binary header bytes, read at the offset/itemsize given by the file's spec.
    raw_binary_headers: bytes
    # Result of ``_get_coordinate_scalar`` for the opened file.
    coordinate_scalar: int
| 56 | + |
| 57 | + |
def _start_asyncio_loop(segy_file_kwargs: SegyFileArguments) -> None:
    """Give async filesystems a dedicated event loop running in a background thread.

    The filesystem class is resolved from the protocol of the ``url`` entry. Nothing
    happens for non-async filesystems. For async ones (e.g., S3, GCS, Azure) a new event
    loop is started in a daemon thread, handed to the filesystem through the ``loop``
    storage option, and scheduled to be stopped at interpreter exit.

    Args:
        segy_file_kwargs: SEG-Y file arguments. The ``settings`` entry is replaced in
            place with a new settings object that carries the event loop.
    """
    # Look up the filesystem class from the URL protocol; no filesystem is instantiated.
    url_protocol = get_protocol(str(segy_file_kwargs["url"]))
    if not issubclass(fsspec.get_filesystem_class(url_protocol), AsyncFileSystem):
        # Synchronous filesystems do not need their own event loop.
        return

    # Run a brand-new event loop forever in its own daemon thread.
    event_loop = asyncio.new_event_loop()
    loop_thread = threading.Thread(
        target=event_loop.run_forever,
        name=f"mdio-{os.getpid()}",
        daemon=True,
    )
    loop_thread.start()

    # Build a fresh settings object instead of mutating the caller's, which may be shared.
    # NOTE(review): only `endianness` and `storage_options` are carried over; confirm
    # SegyFileSettings has no other fields that should be preserved.
    base_settings = segy_file_kwargs.get("settings") or SegyFileSettings()
    options_with_loop = dict(base_settings.storage_options or {})
    options_with_loop["loop"] = event_loop
    segy_file_kwargs["settings"] = SegyFileSettings(
        endianness=base_settings.endianness,
        storage_options=options_with_loop,
    )

    # Stop the loop and join its thread when the interpreter exits.
    atexit.register(_stop_asyncio_loop, event_loop, loop_thread)
| 95 | + |
| 96 | + |
def _stop_asyncio_loop(loop_asyncio: asyncio.AbstractEventLoop, th_asyncio: threading.Thread) -> None:
    """Stop the asyncio event loop and join the thread running it.

    Waits up to ``MDIO_ASYNCIO_THREAD_STOP_TIMEOUT`` seconds for the thread to finish and
    logs a warning if it is still alive afterwards. The loop is stopped, not closed.

    Args:
        loop_asyncio: The asyncio event loop to stop.
        th_asyncio: The thread running the event loop.
    """
    # Key the decision on the thread being alive rather than on ``loop.is_running()``.
    # ``is_running()`` is read from another thread and is still False while the worker
    # thread has started but not yet entered ``run_forever``; skipping the stop in that
    # window would leave the loop running and make the join below wait the full timeout.
    # A stop scheduled before the loop runs is honoured on its first iteration.
    # ``call_soon_threadsafe`` raises on a closed loop, hence the ``is_closed`` guard.
    if th_asyncio.is_alive() and not loop_asyncio.is_closed():
        loop_asyncio.call_soon_threadsafe(loop_asyncio.stop)

    th_asyncio.join(timeout=MDIO_ASYNCIO_THREAD_STOP_TIMEOUT)

    if th_asyncio.is_alive():
        # Thread did not terminate within timeout, but daemon threads will be
        # terminated by Python interpreter on exit anyway
        logger.warning(
            "Async event loop thread '%s' did not terminate within %s seconds",
            th_asyncio.name,
            MDIO_ASYNCIO_THREAD_STOP_TIMEOUT,
        )
| 117 | + |
| 118 | + |
class SegyFileWrapper(SegyFile):
    """SegyFile variant that can be used side by side with Zarr for cloud access.

    Works around Zarr issue 3487, 'Explicitly using fsspec and zarr FsspecStore causes
    RuntimeError "Task attached to a different loop"', by running ``_start_asyncio_loop``
    on the constructor arguments before the underlying SegyFile is initialised.

    # TODO (Dmitriy Repin): when Zarr issue 3487 is resolved, we can remove this workaround
    # https://github.com/zarr-developers/zarr-python/issues/3487

    Args:
        url: Path to the SEG-Y file.
        spec: SEG-Y specification.
        settings: SEG-Y settings.
        header_overrides: SEG-Y header overrides.
    """

    def __init__(
        self,
        url: Path | str,
        spec: SegySpec | None = None,
        settings: SegyFileSettings | None = None,
        header_overrides: SegyHeaderOverrides | None = None,
    ):
        init_kwargs: SegyFileArguments = {
            "url": url,
            "spec": spec,
            "settings": settings,
            "header_overrides": header_overrides,
        }
        # May swap in a new ``settings`` entry that carries a dedicated event loop.
        _start_asyncio_loop(init_kwargs)
        super().__init__(**init_kwargs)
| 150 | + |
| 151 | + |
def get_segy_file_info(segy_file_kwargs: SegyFileArguments) -> SegyFileInfo:
    """Open a SEG-Y file and collect its header-level information.

    Args:
        segy_file_kwargs: Keyword arguments used to open the file via ``SegyFileWrapper``.

    Returns:
        SegyFileInfo holding the trace count, sample labels, text header, binary header
        (both raw bytes and parsed dictionary), and coordinate scalar.
    """
    segy = SegyFileWrapper(**segy_file_kwargs)

    # Keyword arguments are evaluated top to bottom, so the reads happen in this order.
    return SegyFileInfo(
        num_traces=segy.num_traces,
        sample_labels=segy.sample_labels,
        text_header=segy.text_header,
        # Raw binary header bytes, fetched directly from the filesystem.
        raw_binary_headers=segy.fs.read_block(
            fn=segy.url,
            offset=segy.spec.binary_header.offset,
            length=segy.spec.binary_header.itemsize,
        ),
        # The binary header is read a second time here; acceptable for now, only 400 bytes.
        binary_header_dict=segy.binary_header.to_dict(),
        coordinate_scalar=_get_coordinate_scalar(segy),
    )
0 commit comments