Skip to content

Commit c0af04d

Browse files
BrianMichell authored and tasansal committed
Re-implement on-the-fly chunking for non-dimension coordinates and live-mask
1 parent bd541e1 commit c0af04d

File tree

2 files changed

+38
-0
lines changed

2 files changed

+38
-0
lines changed

src/mdio/converters/segy.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,9 @@
3232
from mdio.converters.exceptions import GridTraceSparsityError
3333
from mdio.converters.type_converter import to_structured_type
3434
from mdio.core.grid import Grid
35+
from mdio.core.utils_write import MAX_COORDINATES_BYTES
36+
from mdio.core.utils_write import MAX_SIZE_LIVE_MASK
37+
from mdio.core.utils_write import get_constrained_chunksize
3538
from mdio.segy import blocked_io
3639
from mdio.segy.utilities import get_grid_plan
3740

@@ -429,6 +432,36 @@ def enhanced_add_variables() -> None:
429432
return mdio_template
430433

431434

435+
def _chunk_variable(ds: Dataset, variable_name: str) -> None:
    """Determine and set the chunk grid for a Variable in the Dataset.

    The variable named ``variable_name`` is looked up in ``ds.variables`` and its
    metadata is updated in place with a regular chunk grid whose shape is computed
    by ``get_constrained_chunksize`` from the variable's full shape, data type and
    a target size in bytes.

    Args:
        ds: Dataset whose variable metadata is updated in place.
        variable_name: Name of the variable to chunk.

    Raises:
        ValueError: If ``ds`` has no variable named ``variable_name``.
    """
    variable = next((var for var in ds.variables if var.name == variable_name), None)
    if variable is None:
        # Without this guard a missing name would fall through to index -1 and
        # silently re-chunk the last variable in the dataset.
        msg = f"Variable '{variable_name}' not found in dataset; cannot determine its chunking."
        raise ValueError(msg)

    # Boolean variables (the trace mask) get their own target size in bytes;
    # everything else uses the coordinate target size.
    var_type = variable.data_type
    target_size = MAX_SIZE_LIVE_MASK if var_type == "bool" else MAX_COORDINATES_BYTES

    # Create the chunk grid metadata
    full_shape = tuple(dim.size for dim in variable.dimensions)
    chunk_shape = get_constrained_chunksize(full_shape, var_type, target_size)
    chunks = RegularChunkGrid(configuration=RegularChunkShape(chunk_shape=chunk_shape))

    # Update the variable's metadata with the new chunk grid
    if variable.metadata is None:
        variable.metadata = VariableMetadata(chunk_grid=chunks)
    else:
        variable.metadata.chunk_grid = chunks
463+
464+
432465
def segy_to_mdio( # noqa PLR0913
433466
segy_spec: SegySpec,
434467
mdio_template: AbstractDatasetTemplate,
@@ -487,6 +520,10 @@ def segy_to_mdio( # noqa PLR0913
487520

488521
_add_grid_override_to_metadata(dataset=mdio_ds, grid_overrides=grid_overrides)
489522

523+
_chunk_variable(ds=mdio_ds, variable_name="trace_mask") # trace_mask is a Variable and not a Coordinate
524+
for coord in mdio_template.coordinate_names:
525+
_chunk_variable(ds=mdio_ds, variable_name=coord)
526+
490527
xr_dataset: xr_Dataset = to_xarray_dataset(mdio_ds=mdio_ds)
491528

492529
xr_dataset, drop_vars_delayed = _populate_coordinates(

src/mdio/core/utils_write.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111

1212
# Byte budget (512 MiB) used as the chunk target size for boolean variables such as the trace mask.
MAX_SIZE_LIVE_MASK = 512 * 1024**2
13+
# Byte budget (128 MiB) used as the chunk target size for non-boolean variables (coordinates).
MAX_COORDINATES_BYTES = 128 * 1024**2
1314

1415
# Recursive alias for JSON-serializable values: str/int/float/bool/None, plus string-keyed dicts and lists of the same.
JsonSerializable = str | int | float | bool | None | dict[str, "JsonSerializable"] | list["JsonSerializable"]
1516

0 commit comments

Comments
 (0)