Skip to content

Commit f904855

Browse files
authored
New "Cloud Native" mode for ingesting remote files from a cloud environment (#467)
* Update SEG-Y import for cloud native support and add test * Add s3fs dependency to test suite for cloud read test * Enable multiprocessing with spawn context for cloud safety * Add docs about buffered reads optimization for SEG-Y ingestion
1 parent f4a648d commit f904855

File tree

7 files changed

+71
-15
lines changed

7 files changed

+71
-15
lines changed

docs/usage.md

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,32 @@ Some useful examples are:
162162
- File Buffering and random access
163163
- Mount anything with FUSE
164164

165-
````{note}
165+
#### Buffered Reads in Ingestion
166+
167+
MDIO v0.8.2 introduces the `MDIO__IMPORT__CLOUD_NATIVE` environment variable to optimize
168+
SEG-Y header scans by balancing bandwidth usage with read latency through buffered reads.
169+
170+
**When to Use:** This variable is most effective in high-throughput environments like cloud-based ingestion
171+
systems but can also improve performance for mechanical drives or slow connections.
172+
173+
**How to Enable:** Set the variable to `"true"` or `"1"` (the check is case-insensitive, so `"True"` also works). For example:
174+
175+
```console
176+
$ export MDIO__IMPORT__CLOUD_NATIVE="true"
177+
```
178+
179+
**How It Works:** Buffered reads cut down the number of remote requests, which can otherwise run into the millions, during SEG-Y header scans:
180+
181+
- **Cloud Environments:** Ideal for high-throughput connections between cloud ingestion
182+
machines and object stores.
183+
- **Slow Connections:** When bandwidth is the bottleneck, ingestion may be faster without buffered reads.
184+
- **Local Reads:** May benefit mechanical drives; SSDs typically perform fine without it.
185+
186+
While buffered reads process the file twice, the tradeoff improves ingestion performance and
187+
reduces object-store request costs.
188+
189+
#### Chaining `fsspec` Protocols
190+
166191
When combining advanced protocols like `simplecache` and using a remote store like `s3` the
167192
URL can be chained like `simplecache::s3://bucket/prefix/file.mdio`. When doing this the
168193
`--storage-options` argument must explicitly state parameters for the cloud backend and the
@@ -181,10 +206,10 @@ extra protocol. For the above example it would look like this:
181206
```
182207

183208
In one line:
209+
184210
```json
185211
{"s3": {"key": "my_super_private_key", "secret": "my_super_private_secret"}, "simplecache": {"cache_storage": "/custom/temp/storage/path"}}
186212
```
187-
````
188213

189214
## CLI Reference
190215

noxfile.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -170,7 +170,7 @@ def mypy(session: Session) -> None:
170170
def tests(session: Session) -> None:
171171
"""Run the test suite."""
172172
session.install(".")
173-
session.install("coverage[toml]", "pytest", "pygments", "pytest-dependency")
173+
session.install("coverage[toml]", "pytest", "pygments", "pytest-dependency", "s3fs")
174174
try:
175175
session.run("coverage", "run", "--parallel", "-m", "pytest", *session.posargs)
176176
finally:

src/mdio/commands/segy.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@
6666

6767

6868
@cli.command(name="import")
69-
@argument("segy-path", type=Path(exists=True))
69+
@argument("segy-path", type=STRING)
7070
@argument("mdio-path", type=STRING)
7171
@option(
7272
"-loc",

src/mdio/segy/_workers.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
from __future__ import annotations
44

5+
import os
56
from typing import TYPE_CHECKING
67
from typing import Any
78

@@ -37,7 +38,14 @@ def header_scan_worker(
3738
Returns:
3839
HeaderArray parsed from SEG-Y library.
3940
"""
40-
return segy_file.header[slice(*trace_range)]
41+
slice_ = slice(*trace_range)
42+
43+
cloud_native_mode = os.getenv("MDIO__IMPORT__CLOUD_NATIVE", default="False")
44+
45+
if cloud_native_mode.lower() in {"true", "1"}:
46+
return segy_file.trace[slice_].header
47+
48+
return segy_file.header[slice_]
4149

4250

4351
def trace_worker(

src/mdio/segy/parsers.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
from __future__ import annotations
44

5+
import multiprocessing as mp
56
import os
67
from concurrent.futures import ProcessPoolExecutor
78
from itertools import repeat
@@ -46,15 +47,18 @@ def parse_index_headers(
4647
trace_ranges = []
4748
for idx in range(n_blocks):
4849
start, stop = idx * block_size, (idx + 1) * block_size
49-
if stop > trace_count:
50-
stop = trace_count
50+
stop = min(stop, trace_count)
5151

5252
trace_ranges.append((start, stop))
5353

54+
# For Unix async reads with s3fs/fsspec & multiprocessing,
55+
# use 'spawn' instead of default 'fork' to avoid deadlocks
56+
# on cloud stores. Slower but necessary. Default on Windows.
5457
num_workers = min(n_blocks, NUM_CPUS)
58+
context = mp.get_context("spawn")
5559

5660
tqdm_kw = dict(unit="block", dynamic_ncols=True)
57-
with ProcessPoolExecutor(num_workers) as executor:
61+
with ProcessPoolExecutor(num_workers, mp_context=context) as executor:
5862
# pool.imap is lazy
5963
lazy_work = executor.map(
6064
header_scan_worker, # fn

tests/conftest.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,17 @@ def fake_segy_tmp(tmp_path_factory):
1414

1515

1616
@pytest.fixture(scope="session")
17-
def segy_input(tmp_path_factory):
17+
def segy_input_uri():
18+
"""Path to dome dataset for cloud testing."""
19+
return "http://s3.amazonaws.com/teapot/filt_mig.sgy"
20+
21+
22+
@pytest.fixture(scope="session")
23+
def segy_input(segy_input_uri, tmp_path_factory):
1824
"""Download teapot dome dataset for testing."""
19-
url = "http://s3.amazonaws.com/teapot/filt_mig.sgy"
2025
tmp_dir = tmp_path_factory.mktemp("segy")
2126
tmp_file = path.join(tmp_dir, "teapot.segy")
22-
urlretrieve(url, tmp_file) # noqa: S310
27+
urlretrieve(segy_input_uri, tmp_file) # noqa: S310
2328

2429
return tmp_file
2530

tests/test_main.py

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""Test cases for the __main__ module."""
22

3+
import os
34
from pathlib import Path
45

56
import pytest
@@ -8,18 +9,31 @@
89
from mdio import __main__
910

1011

11-
@pytest.fixture()
12+
@pytest.fixture
1213
def runner() -> CliRunner:
1314
"""Fixture for invoking command-line interfaces."""
1415
return CliRunner()
1516

1617

17-
@pytest.mark.dependency()
18+
@pytest.mark.dependency
1819
def test_main_succeeds(runner: CliRunner, segy_input: str, zarr_tmp: Path) -> None:
1920
"""It exits with a status code of zero."""
2021
cli_args = ["segy", "import", segy_input, str(zarr_tmp)]
21-
cli_args.extend(["-loc", "181,185"])
22-
cli_args.extend(["-names", "inline,crossline"])
22+
cli_args.extend(["--header-locations", "181,185"])
23+
cli_args.extend(["--header-names", "inline,crossline"])
24+
25+
result = runner.invoke(__main__.main, args=cli_args)
26+
assert result.exit_code == 0
27+
28+
29+
@pytest.mark.dependency(depends=["test_main_succeeds"])
30+
def test_main_cloud(runner: CliRunner, segy_input_uri: str, zarr_tmp: Path) -> None:
31+
"""It exits with a status code of zero."""
32+
os.environ["MDIO__IMPORT__CLOUD_NATIVE"] = "true"
33+
cli_args = ["segy", "import", str(segy_input_uri), str(zarr_tmp)]
34+
cli_args.extend(["--header-locations", "181,185"])
35+
cli_args.extend(["--header-names", "inline,crossline"])
36+
cli_args.extend(["--overwrite"])
2337

2438
result = runner.invoke(__main__.main, args=cli_args)
2539
assert result.exit_code == 0

0 commit comments

Comments
 (0)