Skip to content

Commit ac21d1c

Browse files
committed
Automatic chunking of live_maks for grids that exceed blosc's maximum elements.
1 parent de274f2 commit ac21d1c

File tree

2 files changed

+179
-32
lines changed

2 files changed

+179
-32
lines changed

src/mdio/converters/segy.py

Lines changed: 39 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from segy.schema import HeaderField
1818

1919
from mdio.api.io_utils import process_url
20+
from mdio.constants import INT32_MAX
2021
from mdio.converters.exceptions import EnvironmentFormatError
2122
from mdio.converters.exceptions import GridTraceCountError
2223
from mdio.converters.exceptions import GridTraceSparsityError
@@ -116,7 +117,6 @@ def segy_to_mdio( # noqa: C901
116117
storage_options_output: dict[str, Any] | None = None,
117118
overwrite: bool = False,
118119
grid_overrides: dict | None = None,
119-
live_mask_chunksize: Sequence[int] | None = None,
120120
) -> None:
121121
"""Convert SEG-Y file to MDIO format.
122122
@@ -171,16 +171,10 @@ def segy_to_mdio( # noqa: C901
171171
Default is `None` (will assume anonymous)
172172
overwrite: Toggle for overwriting existing store
173173
grid_overrides: Option to add grid overrides. See examples.
174-
live_mask_chunksize: Chunk size for live mask. This has limited
175-
support across the MDIO api.
176-
Default is `None` (will do no chunking)
177-
178174
Raises:
179175
GridTraceCountError: Raised if grid won't hold all traces in the
180176
SEG-Y file.
181-
ValueError: If length of chunk sizes don't match number of dimensions
182-
or live_mask_chunksize is not None and lenght of live_mask_chunksize
183-
is not equal to number of dimensions minus one.
177+
ValueError: If length of chunk sizes don't match number of dimensions.
184178
NotImplementedError: If can't determine chunking automatically for 4D+.
185179
186180
Examples:
@@ -348,20 +342,6 @@ def segy_to_mdio( # noqa: C901
348342
... chunksize=(8, 2, 256, 512),
349343
... grid_overrides={"HasDuplicates": True},
350344
... )
351-
352-
>>> segy_to_mdio(
353-
... segy_path="prefix/shot_file.segy",
354-
... mdio_path_or_buffer="s3://bucket/shot_file.mdio",
355-
... index_bytes=(133, 171, 17, 137, 13),
356-
... index_lengths=(2, 2, 4, 2, 4),
357-
... index_names=("shot_line", "gun", "shot_point", "cable", "channel"),
358-
... chunksize=(1, 1, 8, 1, 128, 1024),
359-
... grid_overrides={
360-
... "AutoShotWrap": True,
361-
... "AutoChannelWrap": True,
362-
... "AutoChannelTraceQC": 1000000
363-
... },
364-
... live_mask_chunksize=(1, 1, 8, 1, 128),
365345
"""
366346
if index_names is None:
367347
index_names = [f"dim_{i}" for i in range(len(index_bytes))]
@@ -377,14 +357,6 @@ def segy_to_mdio( # noqa: C901
377357
)
378358
raise ValueError(message)
379359

380-
if live_mask_chunksize is not None:
381-
if len(live_mask_chunksize) != len(index_bytes):
382-
message = (
383-
f"Length of live_mask_chunksize={len(live_mask_chunksize)} must be ",
384-
f"equal to array dimensions={len(index_bytes)}",
385-
)
386-
raise ValueError(message)
387-
388360
# Handle storage options and check permissions etc
389361
if storage_options_input is None:
390362
storage_options_input = {}
@@ -452,8 +424,7 @@ def segy_to_mdio( # noqa: C901
452424
trace_count = np.count_nonzero(grid.live_mask)
453425
write_attribute(name="trace_count", zarr_group=zarr_root, attribute=trace_count)
454426

455-
if live_mask_chunksize is None:
456-
live_mask_chunksize = -1
427+
live_mask_chunksize = _calculate_live_mask_chunksize(grid)
457428

458429
# Note, live mask is not chunked since it's bool and small.
459430
zarr_root["metadata"].create_dataset(
@@ -525,3 +496,39 @@ def segy_to_mdio( # noqa: C901
525496
)
526497

527498
zarr.consolidate_metadata(store_nocache)
499+
500+
501+
def _calculate_live_mask_chunksize(grid: Grid) -> Sequence[int] | int:
502+
"""Calculate the optimal chunksize for the live mask.
503+
504+
Args:
505+
grid: The grid to calculate the chunksize for.
506+
"""
507+
if np.sum(grid.live_mask) < INT32_MAX:
508+
# Base case where we don't need to chunk the live mask
509+
return -1
510+
511+
# Calculate the optimal chunksize for the live mask
512+
total_elements = np.prod(grid.shape[:-1]) # Exclude sample dimension
513+
num_chunks = np.ceil(total_elements / INT32_MAX).astype(int)
514+
515+
# Calculate chunk size for each dimension
516+
chunks = []
517+
remaining_elements = total_elements
518+
519+
for dim_size in grid.shape[:-1]: # Exclude sample dimension
520+
# Calculate how many chunks we need in this dimension
521+
# We want to distribute chunks evenly across dimensions
522+
dim_chunks = max(
523+
1,
524+
int(
525+
np.ceil(
526+
dim_size / np.ceil(np.power(num_chunks, 1 / len(grid.shape[:-1])))
527+
)
528+
),
529+
)
530+
chunk_size = int(np.ceil(dim_size / dim_chunks))
531+
chunks.append(chunk_size)
532+
remaining_elements //= dim_chunks
533+
534+
return tuple(chunks)
Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
"""Test live mask chunk size calculation."""
2+
3+
import numpy as np
4+
import pytest
5+
6+
from mdio.converters.segy import _calculate_live_mask_chunksize
7+
from mdio.core import Grid, Dimension
8+
from mdio.constants import INT32_MAX
9+
10+
11+
def test_small_grid_no_chunking():
12+
"""Test that small grids return -1 (no chunking needed)."""
13+
# Create a small grid that fits within INT32_MAX
14+
dims = [
15+
Dimension(coords=range(0, 100, 1), name="dim1"),
16+
Dimension(coords=range(0, 100, 1), name="dim2"),
17+
Dimension(coords=range(0, 100, 1), name="sample")
18+
]
19+
grid = Grid(dims=dims)
20+
grid.live_mask = np.ones((100, 100), dtype=bool)
21+
22+
result = _calculate_live_mask_chunksize(grid)
23+
assert result == -1
24+
25+
26+
def test_large_2d_grid_chunking():
27+
"""Test exact chunk size calculation for a 2D grid that exceeds INT32_MAX."""
28+
# Create a grid that exceeds INT32_MAX (2,147,483,647)
29+
# Using 50,000 x 50,000 = 2,500,000,000 elements
30+
dims = [
31+
Dimension(coords=range(0, 50000, 1), name="dim1"),
32+
Dimension(coords=range(0, 50000, 1), name="dim2"),
33+
Dimension(coords=range(0, 100, 1), name="sample")
34+
]
35+
grid = Grid(dims=dims)
36+
grid.live_mask = np.ones((50000, 50000), dtype=bool)
37+
38+
result = _calculate_live_mask_chunksize(grid)
39+
40+
# Calculate expected values
41+
total_elements = 50000 * 50000
42+
num_chunks = np.ceil(total_elements / INT32_MAX).astype(int)
43+
dim_chunks = int(np.ceil(50000 / np.ceil(np.power(num_chunks, 1/2))))
44+
expected_chunk_size = int(np.ceil(50000 / dim_chunks))
45+
46+
assert result == (expected_chunk_size, expected_chunk_size)
47+
48+
49+
def test_large_3d_grid_chunking():
50+
"""Test exact chunk size calculation for a 3D grid that exceeds INT32_MAX."""
51+
# Create a 3D grid that exceeds INT32_MAX
52+
# Using 1500 x 1500 x 1500 = 3,375,000,000 elements
53+
dims = [
54+
Dimension(coords=range(0, 1500, 1), name="dim1"),
55+
Dimension(coords=range(0, 1500, 1), name="dim2"),
56+
Dimension(coords=range(0, 1500, 1), name="dim3"),
57+
Dimension(coords=range(0, 100, 1), name="sample")
58+
]
59+
grid = Grid(dims=dims)
60+
grid.live_mask = np.ones((1500, 1500, 1500), dtype=bool)
61+
62+
result = _calculate_live_mask_chunksize(grid)
63+
64+
# Calculate expected values
65+
total_elements = 1500 * 1500 * 1500
66+
num_chunks = np.ceil(total_elements / INT32_MAX).astype(int)
67+
dim_chunks = int(np.ceil(1500 / np.ceil(np.power(num_chunks, 1/3))))
68+
expected_chunk_size = int(np.ceil(1500 / dim_chunks))
69+
70+
assert result == (expected_chunk_size, expected_chunk_size, expected_chunk_size)
71+
72+
73+
def test_uneven_dimensions_chunking():
74+
"""Test exact chunk size calculation for uneven dimensions."""
75+
# Create a grid with uneven dimensions that exceeds INT32_MAX
76+
# Using 50,000 x 50,000 = 2,500,000,000 elements (exceeds INT32_MAX)
77+
# But with uneven chunking: 50,000 x 25,000
78+
dims = [
79+
Dimension(coords=range(0, 50000, 1), name="dim1"),
80+
Dimension(coords=range(0, 50000, 1), name="dim2"),
81+
Dimension(coords=range(0, 100, 1), name="sample")
82+
]
83+
grid = Grid(dims=dims)
84+
grid.live_mask = np.ones((50000, 50000), dtype=bool)
85+
86+
result = _calculate_live_mask_chunksize(grid)
87+
88+
# Calculate expected values
89+
total_elements = 50000 * 50000
90+
num_chunks = np.ceil(total_elements / INT32_MAX).astype(int)
91+
dim_chunks = int(np.ceil(50000 / np.ceil(np.power(num_chunks, 1/2))))
92+
expected_chunk_size = int(np.ceil(50000 / dim_chunks))
93+
94+
assert result == (expected_chunk_size, expected_chunk_size)
95+
96+
97+
def test_prestack_land_survey_chunking():
98+
"""Test exact chunk size calculation for a dense pre-stack land survey grid."""
99+
# Create a dense pre-stack land survey grid that exceeds INT32_MAX
100+
# Using realistic dimensions:
101+
# - 1000 shot points
102+
# - 1000 receiver points
103+
# - 100 offsets
104+
# - 36 azimuths
105+
# Total elements: 1000 * 1000 * 100 * 36 = 3,600,000,000 elements
106+
dims = [
107+
Dimension(coords=range(0, 1000, 1), name="shot_point"),
108+
Dimension(coords=range(0, 1000, 1), name="receiver_point"),
109+
Dimension(coords=range(0, 100, 1), name="offset"),
110+
Dimension(coords=range(0, 36, 1), name="azimuth"),
111+
Dimension(coords=range(0, 1000, 1), name="sample")
112+
]
113+
grid = Grid(dims=dims)
114+
grid.live_mask = np.ones((1000, 1000, 100, 36), dtype=bool)
115+
116+
result = _calculate_live_mask_chunksize(grid)
117+
118+
# Calculate expected values
119+
total_elements = 1000 * 1000 * 100 * 36
120+
num_chunks = np.ceil(total_elements / INT32_MAX).astype(int)
121+
dim_chunks = int(np.ceil(1000 / np.ceil(np.power(num_chunks, 1/4))))
122+
expected_chunk_size = int(np.ceil(1000 / dim_chunks))
123+
124+
# For a 4D grid, we expect chunk sizes to be distributed across all dimensions
125+
# The chunk size should be the same for all dimensions since they're all equally important
126+
assert result == (expected_chunk_size, expected_chunk_size, expected_chunk_size, expected_chunk_size)
127+
128+
129+
def test_edge_case_empty_grid():
130+
"""Test empty grid edge case."""
131+
dims = [
132+
Dimension(coords=range(0, 0, 1), name="dim1"),
133+
Dimension(coords=range(0, 0, 1), name="dim2"),
134+
Dimension(coords=range(0, 100, 1), name="sample")
135+
]
136+
grid = Grid(dims=dims)
137+
grid.live_mask = np.zeros((0, 0), dtype=bool)
138+
139+
result = _calculate_live_mask_chunksize(grid)
140+
assert result == -1 # Empty grid shouldn't need chunking

0 commit comments

Comments
 (0)