
Commit 7f9d580

use concurrent futures which handles and raises exceptions
1 parent 79e7008 commit 7f9d580
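
The point of the change: when results from ProcessPoolExecutor.map are consumed in the parent process, any exception raised inside a worker process is re-raised there, so failures surface instead of going unnoticed. A minimal, self-contained sketch of that behavior (not part of the commit; the worker name is hypothetical):

from concurrent.futures import ProcessPoolExecutor


def worker(x):
    """Hypothetical worker that fails on one input."""
    if x == 2:
        raise ValueError(f"bad input: {x}")
    return x * x


if __name__ == "__main__":
    with ProcessPoolExecutor(max_workers=2) as executor:
        try:
            results = list(executor.map(worker, range(4)))
        except ValueError as exc:
            # The worker's ValueError is pickled and re-raised here, in the parent.
            print(f"worker failed: {exc}")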

3 files changed, +22 -24 lines changed


src/mdio/segy/_workers.py

Lines changed: 4 additions & 4 deletions
@@ -214,11 +214,11 @@ def trace_worker(
 # However, we need pool.starmap because we have more than one
 # argument to make pool.map work with multiple arguments, we
 # wrap the function and consolidate arguments to one
-def trace_worker_map(args):
-    """Wrapper for trace worker to use with tqdm."""
+def trace_worker_wrapper(args):
+    """Wrapper to make it work with map and multiple arguments."""
     return trace_worker(*args)


-def header_scan_worker_map(args):
-    """Wrapper for header scan worker to use with tqdm."""
+def header_scan_worker_wrapper(args):
+    """Wrapper to make it work with map and multiple arguments."""
     return header_scan_worker(*args)
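
The rename reflects what these helpers are for: Executor.map, like Pool.map and Pool.imap, passes exactly one item from the iterable to the target function, so multi-argument workers are called through a wrapper that unpacks a packed argument tuple. A small sketch of the same pattern with hypothetical names (not the module's real workers):

from concurrent.futures import ProcessPoolExecutor
from itertools import repeat


def scale(value, factor):
    """Hypothetical two-argument worker."""
    return value * factor


def scale_wrapper(args):
    """Unpack the argument tuple so a single-argument map can call the worker."""
    return scale(*args)


if __name__ == "__main__":
    # Arguments are packed the same way the commit does: zip plus itertools.repeat.
    parallel_inputs = zip(range(5), repeat(10))
    with ProcessPoolExecutor() as executor:
        print(list(executor.map(scale_wrapper, parallel_inputs)))  # [0, 10, 20, 30, 40]

(Executor.map can also take several iterables directly, but the commit keeps the existing packed-tuple convention, so only the wrapper names and docstrings change.)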

src/mdio/segy/blocked_io.py

Lines changed: 12 additions & 14 deletions
@@ -4,6 +4,7 @@
 from __future__ import annotations

 import multiprocessing as mp
+from concurrent.futures import ProcessPoolExecutor
 from itertools import repeat

 import numpy as np
@@ -19,7 +20,7 @@

 from mdio.core import Grid
 from mdio.core.indexing import ChunkIterator
-from mdio.segy._workers import trace_worker_map
+from mdio.segy._workers import trace_worker_wrapper
 from mdio.segy.byte_utils import ByteOrder
 from mdio.segy.byte_utils import Dtype
 from mdio.segy.creation import concat_files
@@ -142,25 +143,22 @@ def to_zarr(
         repeat(segy_endian),
     )

-    # This is for Unix async writes to s3fs/fsspec, when using
-    # multiprocessing. By default, Linux uses the 'fork' method.
-    # 'spawn' is a little slower to spool up processes, but 'fork'
-    # doesn't work. If you don't use this, processes get deadlocked
-    # on cloud stores. 'spawn' is default in Windows.
+    # For Unix async writes with s3fs/fsspec & multiprocessing,
+    # use 'spawn' instead of default 'fork' to avoid deadlocks
+    # on cloud stores. Slower but necessary. Default on Windows.
+    num_workers = min(num_chunks, NUM_CORES)
     context = mp.get_context("spawn")
+    executor = ProcessPoolExecutor(max_workers=num_workers, mp_context=context)

-    # This is the chunksize for multiprocessing. Not to be confused
-    # with Zarr chunksize.
-    num_workers = min(num_chunks, NUM_CORES)
+    # Chunksize here is for multiprocessing, not Zarr chunksize.
     pool_chunksize, extra = divmod(num_chunks, num_workers * 4)
     pool_chunksize += 1 if extra else pool_chunksize

     tqdm_kw = dict(unit="block", dynamic_ncols=True)
-    with context.Pool(num_workers) as pool:
-        # pool.imap is lazy
-        lazy_work = pool.imap(
-            func=trace_worker_map,
-            iterable=parallel_inputs,
+    with executor:
+        lazy_work = executor.map(
+            trace_worker_wrapper,  # fn
+            parallel_inputs,  # iterables
             chunksize=pool_chunksize,
         )
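
Putting the new pieces of to_zarr together: a 'spawn' multiprocessing context feeds a ProcessPoolExecutor through its mp_context argument, and the executor.map iterator is drained under tqdm. A minimal sketch with stand-in worker and inputs (assumptions, not the module's real objects):

import multiprocessing as mp
from concurrent.futures import ProcessPoolExecutor

from tqdm.auto import tqdm


def process_block(args):
    """Stand-in for trace_worker_wrapper: unpack one argument tuple and reduce it."""
    block_id, payload = args
    return block_id, sum(payload)


if __name__ == "__main__":
    inputs = [(i, list(range(100))) for i in range(32)]
    num_workers = 4

    # 'spawn' avoids the fork-related deadlocks seen with fsspec/s3fs stores.
    context = mp.get_context("spawn")
    executor = ProcessPoolExecutor(max_workers=num_workers, mp_context=context)

    with executor:
        # Results come back in input order; chunksize batches the work items
        # handed to each process (unrelated to Zarr chunking).
        lazy_work = executor.map(process_block, inputs, chunksize=2)
        results = list(tqdm(lazy_work, total=len(inputs), unit="block"))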

src/mdio/segy/parsers.py

Lines changed: 6 additions & 6 deletions
@@ -3,9 +3,9 @@

 from __future__ import annotations

+from concurrent.futures import ProcessPoolExecutor
 from itertools import repeat
 from math import ceil
-from multiprocessing import Pool
 from typing import Any
 from typing import Sequence

@@ -15,7 +15,7 @@
 from tqdm.auto import tqdm

 from mdio.core import Dimension
-from mdio.segy._workers import header_scan_worker_map
+from mdio.segy._workers import header_scan_worker_wrapper


 NUM_CORES = cpu_count(logical=False)
@@ -117,11 +117,11 @@ def parse_trace_headers(
     num_workers = min(n_blocks, NUM_CORES)

     tqdm_kw = dict(unit="block", dynamic_ncols=True)
-    with Pool(num_workers) as pool:
+    with ProcessPoolExecutor(num_workers) as executor:
         # pool.imap is lazy
-        lazy_work = pool.imap(
-            func=header_scan_worker_map,
-            iterable=parallel_inputs,
+        lazy_work = executor.map(
+            header_scan_worker_wrapper,  # fn
+            parallel_inputs,  # iterables
             chunksize=2,  # Not array chunks. This is for `multiprocessing`
         )
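
The header-scan path gets the same treatment, minus the explicit 'spawn' context. A self-contained sketch of swapping Pool.imap for Executor.map with a hypothetical scan worker (not the module's real one):

from concurrent.futures import ProcessPoolExecutor
from itertools import repeat

from tqdm.auto import tqdm


def scan_block(args):
    """Stand-in for header_scan_worker_wrapper: unpack and 'scan' one block."""
    block_range, byte_offset = args
    return [index + byte_offset for index in block_range]


if __name__ == "__main__":
    parallel_inputs = list(zip([range(10)] * 8, repeat(100)))
    num_workers = 4

    with ProcessPoolExecutor(num_workers) as executor:
        # chunksize=2 batches items per task submission; it is unrelated to
        # the array chunks mentioned in the diff comment.
        lazy_work = executor.map(scan_block, parallel_inputs, chunksize=2)
        headers = list(tqdm(lazy_work, total=len(parallel_inputs), unit="block"))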
