Skip to content

Commit f63188d

Browse files
committed
Update to use Dask's chunk generation and ~500MiB chunk sizes
1 parent 5d01ffe commit f63188d

File tree

3 files changed

+199
-146
lines changed

3 files changed

+199
-146
lines changed

src/mdio/converters/segy.py

Lines changed: 129 additions & 108 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@
2828
from mdio.segy.helpers_segy import create_zarr_hierarchy
2929
from mdio.segy.utilities import get_grid_plan
3030

31+
from dask.array.core import normalize_chunks
32+
from dask.array.rechunk import _balance_chunksizes
3133

3234
logger = logging.getLogger(__name__)
3335

@@ -509,122 +511,141 @@ def _calculate_live_mask_chunksize(grid: Grid) -> Sequence[int]:
509511
A sequence of integers representing the optimal chunk size for each dimension
510512
of the grid.
511513
"""
512-
return _calculate_optimal_chunksize(grid.live_mask, INT32_MAX)
514+
return _calculate_optimal_chunksize(grid.live_mask, INT32_MAX//4)
513515

514516

515517
def _calculate_optimal_chunksize( # noqa: C901
516518
volume: np.ndarray | zarr.Array, n_bytes: int
517519
) -> Sequence[int]:
518-
"""Calculate a uniform chunk shape for an N-dimensional data volume such that...
519-
520-
0. The product of the chunk dimensions multiplied by the element size does not
521-
exceed n_bytes.
522-
1. The chunk shape is "regular" – each chunk dimension is a divisor of the
523-
overall volume shape.
524-
2. If an exact match is impossible, the chunk shape chosen maximizes the number of
525-
elements (minimizing the unused bytes).
526-
3. The computation is efficient.
527-
528-
The computation efficiency is broken down as follows:
529-
530-
- Divisor Computation: For each of the N dimensions (assume size ~ n), it checks
531-
up to n numbers, so this part is roughly O(N * n).
532-
For example, if you have a 3D array where each dimension is about 100,
533-
it does around 3*100 = 300 steps.
534-
- DFS Search: In the worst-case, the DFS explores about D choices per dimension
535-
(D = average number of divisors) leading to O(D^N) combinations.
536-
In practice, D is small (often < 10), so for a 2D array this is around 10^2
537-
(about 100 combinations) and for a 3D array about 10^3 (roughly 1,000 combinations).
538-
Since N is typically small (often <6), this exponential term behaves like a
539-
constant factor.
520+
"""Calculate the optimal chunksize for an N-dimensional data volume.
540521
541522
Args:
542-
volume : np.ndarray | zarr.Array
543-
An N-dimensional array-like object (e.g. np.ndarray or zarr array).
544-
n_bytes : int
545-
Maximum allowed number of bytes per chunk (>= 1).
523+
volume: The volume to calculate the chunksize for.
524+
n_bytes: The maximum allowed number of bytes per chunk.
546525
547526
Returns:
548-
Sequence[int]
549-
A tuple representing the optimal chunk shape (number of elements along each axis).
550-
551-
Raises:
552-
ValueError if n_bytes is less than the number of bytes of one element.
527+
A sequence of integers representing the optimal chunk size for each dimension
528+
of the grid.
553529
"""
554-
# Get volume shape and element size.
555530
shape = volume.shape
556-
557-
if volume.size == 0:
558-
logging.warning("Chunking calculation received empty volume shape...")
559-
return volume.shape
560-
561-
itemsize = volume.dtype.itemsize
562-
563-
# Maximum number of elements that can fit in a chunk
564-
# (we ignore any extra bytes; must not exceed n_bytes).
565-
max_elements_allowed = n_bytes // itemsize
566-
if max_elements_allowed < 1:
567-
raise ValueError("n_bytes is too small to hold even one element of the volume.")
568-
569-
n_dims = len(shape)
570-
571-
def get_divisors(n: int) -> list[int]:
572-
"""Return a sorted list of all positive divisors of n.
573-
574-
Args:
575-
n: The number to compute the divisors of.
576-
577-
Returns:
578-
A sorted list of all positive divisors of n.
579-
"""
580-
divs = []
581-
# It is efficient enough for typical dimension sizes.
582-
for i in range(1, n + 1):
583-
if n % i == 0:
584-
divs.append(i)
585-
return sorted(divs)
586-
587-
# For each dimension, compute the list of allowed chunk sizes (divisors).
588-
divisors_list = [get_divisors(d) for d in shape]
589-
590-
# For pruning: precompute the maximum possible product achievable from axis i to N-1.
591-
# This is the product of the maximum divisors for each remaining axis.
592-
max_possible = [1] * (n_dims + 1)
593-
for i in range(n_dims - 1, -1, -1):
594-
max_possible[i] = max(divisors_list[i]) * max_possible[i + 1]
595-
596-
best_product = 0
597-
best_combination = [None] * n_dims
598-
current_chunk = [None] * n_dims
599-
600-
def dfs(dim: int, current_product: int) -> None:
601-
"""Depth-first search to find the optimal chunk shape.
602-
603-
Args:
604-
dim: The current dimension to process.
605-
current_product: The current product of the chunk dimensions.
606-
"""
607-
nonlocal best_product
608-
# If all dimensions have been processed, update best combination if needed.
609-
if dim == n_dims:
610-
if current_product > best_product:
611-
best_product = current_product
612-
best_combination[:] = current_chunk[:]
613-
return
614-
615-
# Prune branches: even if we take the maximum allowed for all remaining dimensions,
616-
# if we cannot exceed best_product, then skip.
617-
if current_product * max_possible[dim] < best_product:
618-
return
619-
620-
# Iterate over allowed divisors for the current axis,
621-
# trying larger candidates first so that high products are found early.
622-
for candidate in sorted(divisors_list[dim], reverse=True):
623-
new_product = current_product * candidate
624-
if new_product > max_elements_allowed:
625-
continue # This candidate would exceed the byte restriction.
626-
current_chunk[dim] = candidate
627-
dfs(dim + 1, new_product)
628-
629-
dfs(0, 1)
630-
return tuple(best_combination)
531+
chunks = normalize_chunks(
532+
"auto",
533+
shape,
534+
dtype=volume.dtype,
535+
limit=n_bytes,
536+
)
537+
return tuple(_balance_chunksizes(chunk)[0] for chunk in chunks)
538+
539+
540+
541+
# 0. The product of the chunk dimensions multiplied by the element size does not
542+
# exceed n_bytes.
543+
# 1. The chunk shape is "regular" – each chunk dimension is a divisor of the
544+
# overall volume shape.
545+
# 2. If an exact match is impossible, the chunk shape chosen maximizes the number of
546+
# elements (minimizing the unused bytes).
547+
# 3. The computation is efficient.
548+
549+
# The computation efficiency is broken down as follows:
550+
551+
# - Divisor Computation: For each of the N dimensions (assume size ~ n), it checks
552+
# up to n numbers, so this part is roughly O(N * n).
553+
# For example, if you have a 3D array where each dimension is about 100,
554+
# it does around 3*100 = 300 steps.
555+
# - DFS Search: In the worst-case, the DFS explores about D choices per dimension
556+
# (D = average number of divisors) leading to O(D^N) combinations.
557+
# In practice, D is small (often < 10), so for a 2D array this is around 10^2
558+
# (about 100 combinations) and for a 3D array about 10^3 (roughly 1,000 combinations).
559+
# Since N is typically small (often <6), this exponential term behaves like a
560+
# constant factor.
561+
562+
# Args:
563+
# volume : np.ndarray | zarr.Array
564+
# An N-dimensional array-like object (e.g. np.ndarray or zarr array).
565+
# n_bytes : int
566+
# Maximum allowed number of bytes per chunk (>= 1).
567+
568+
# Returns:
569+
# Sequence[int]
570+
# A tuple representing the optimal chunk shape (number of elements along each axis).
571+
572+
# Raises:
573+
# ValueError if n_bytes is less than the number of bytes of one element.
574+
# """
575+
# # Get volume shape and element size.
576+
# shape = volume.shape
577+
578+
# if volume.size == 0:
579+
# logging.warning("Chunking calculation received empty volume shape...")
580+
# return volume.shape
581+
582+
# itemsize = volume.dtype.itemsize
583+
584+
# # Maximum number of elements that can fit in a chunk
585+
# # (we ignore any extra bytes; must not exceed n_bytes).
586+
# max_elements_allowed = n_bytes // itemsize
587+
# if max_elements_allowed < 1:
588+
# raise ValueError("n_bytes is too small to hold even one element of the volume.")
589+
590+
# n_dims = len(shape)
591+
592+
# def get_divisors(n: int) -> list[int]:
593+
# """Return a sorted list of all positive divisors of n.
594+
595+
# Args:
596+
# n: The number to compute the divisors of.
597+
598+
# Returns:
599+
# A sorted list of all positive divisors of n.
600+
# """
601+
# divs = []
602+
# # It is efficient enough for typical dimension sizes.
603+
# for i in range(1, n + 1):
604+
# if n % i == 0:
605+
# divs.append(i)
606+
# return sorted(divs)
607+
608+
# # For each dimension, compute the list of allowed chunk sizes (divisors).
609+
# divisors_list = [get_divisors(d) for d in shape]
610+
611+
# # For pruning: precompute the maximum possible product achievable from axis i to N-1.
612+
# # This is the product of the maximum divisors for each remaining axis.
613+
# max_possible = [1] * (n_dims + 1)
614+
# for i in range(n_dims - 1, -1, -1):
615+
# max_possible[i] = max(divisors_list[i]) * max_possible[i + 1]
616+
617+
# best_product = 0
618+
# best_combination = [None] * n_dims
619+
# current_chunk = [None] * n_dims
620+
621+
# def dfs(dim: int, current_product: int) -> None:
622+
# """Depth-first search to find the optimal chunk shape.
623+
624+
# Args:
625+
# dim: The current dimension to process.
626+
# current_product: The current product of the chunk dimensions.
627+
# """
628+
# nonlocal best_product
629+
# # If all dimensions have been processed, update best combination if needed.
630+
# if dim == n_dims:
631+
# if current_product > best_product:
632+
# best_product = current_product
633+
# best_combination[:] = current_chunk[:]
634+
# return
635+
636+
# # Prune branches: even if we take the maximum allowed for all remaining dimensions,
637+
# # if we cannot exceed best_product, then skip.
638+
# if current_product * max_possible[dim] < best_product:
639+
# return
640+
641+
# # Iterate over allowed divisors for the current axis,
642+
# # trying larger candidates first so that high products are found early.
643+
# for candidate in sorted(divisors_list[dim], reverse=True):
644+
# new_product = current_product * candidate
645+
# if new_product > max_elements_allowed:
646+
# continue # This candidate would exceed the byte restriction.
647+
# current_chunk[dim] = candidate
648+
# dfs(dim + 1, new_product)
649+
650+
# dfs(0, 1)
651+
# return tuple(best_combination)

tests/conftest.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,18 @@
11
"""Test configuration before everything runs."""
22

3+
import warnings
34
from os import path
45
from urllib.request import urlretrieve
56

67
import pytest
78

9+
# Suppress Dask's chunk balancing warning
10+
warnings.filterwarnings(
11+
"ignore",
12+
message="Could not balance chunks to be equal",
13+
category=UserWarning,
14+
module="dask.array.rechunk",
15+
)
816

917
@pytest.fixture(scope="session")
1018
def fake_segy_tmp(tmp_path_factory):

0 commit comments

Comments
 (0)