Skip to content

Commit 1eec54b

Browse files
committed
Re-implement on-the-fly chunking for non-dimension coordinates and live-mask
1 parent 954661c commit 1eec54b

File tree

3 files changed

+42
-1
lines changed

3 files changed

+42
-1
lines changed

src/mdio/converters/segy.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,14 +14,20 @@
1414

1515
from mdio.api.io import _normalize_path
1616
from mdio.api.io import to_mdio
17+
from mdio.builder.schemas.chunk_grid import RegularChunkGrid
18+
from mdio.builder.schemas.chunk_grid import RegularChunkShape
1719
from mdio.builder.schemas.v1.units import LengthUnitEnum
1820
from mdio.builder.schemas.v1.units import LengthUnitModel
21+
from mdio.builder.schemas.v1.variable import VariableMetadata
1922
from mdio.builder.xarray_builder import to_xarray_dataset
2023
from mdio.converters.exceptions import EnvironmentFormatError
2124
from mdio.converters.exceptions import GridTraceCountError
2225
from mdio.converters.exceptions import GridTraceSparsityError
2326
from mdio.converters.type_converter import to_structured_type
2427
from mdio.core.grid import Grid
28+
from mdio.core.utils_write import MAX_COORDINATES_BYTES
29+
from mdio.core.utils_write import MAX_SIZE_LIVE_MASK
30+
from mdio.core.utils_write import get_constrained_chunksize
2531
from mdio.segy import blocked_io
2632
from mdio.segy.utilities import get_grid_plan
2733

@@ -330,6 +336,36 @@ def _add_segy_ingest_attributes(dataset: Dataset, segy_file: SegyFile, grid_over
330336
dataset.metadata.attributes.update(segy_attributes)
331337

332338

339+
def _chunk_variable(ds: Dataset, variable_name: str) -> None:
    """Compute a regular chunk grid for one Variable and store it in its metadata.

    The chunk shape comes from ``get_constrained_chunksize``, called with the
    variable's full shape (the sizes of its dimensions), its data type, and a
    byte budget selected from that data type. The resulting grid is written to
    the variable's ``metadata.chunk_grid``; a new ``VariableMetadata`` is
    created when the variable has none yet.

    Args:
        ds: Dataset whose ``variables`` contain the target. Modified in place.
        variable_name: Name of the Variable to chunk.

    Raises:
        ValueError: If ``ds`` has no Variable named ``variable_name``.
    """
    # Fail loudly on an unknown name. A "-1 means not found" sentinel would
    # index the last variable and silently re-chunk the wrong one.
    variable = next((var for var in ds.variables if var.name == variable_name), None)
    if variable is None:
        msg = f"Variable '{variable_name}' not found in the dataset; cannot determine its chunking."
        raise ValueError(msg)

    # Pick the target size (in bytes): "bool" variables use the live-mask
    # budget, every other type uses the coordinate budget.
    var_type = variable.data_type
    target_size = MAX_SIZE_LIVE_MASK if var_type == "bool" else MAX_COORDINATES_BYTES

    # Create the chunk grid metadata
    full_shape = tuple(dim.size for dim in variable.dimensions)
    chunk_shape = get_constrained_chunksize(full_shape, var_type, target_size)
    chunks = RegularChunkGrid(configuration=RegularChunkShape(chunk_shape=chunk_shape))

    # Update the variable's metadata with the new chunk grid
    if variable.metadata is None:
        variable.metadata = VariableMetadata(chunk_grid=chunks)
    else:
        variable.metadata.chunk_grid = chunks
367+
368+
333369
def segy_to_mdio( # noqa PLR0913
334370
segy_spec: SegySpec,
335371
mdio_template: AbstractDatasetTemplate,
@@ -379,6 +415,10 @@ def segy_to_mdio( # noqa PLR0913
379415

380416
_add_segy_ingest_attributes(dataset=mdio_ds, segy_file=segy_file, grid_overrides=grid_overrides)
381417

418+
_chunk_variable(ds=mdio_ds, variable_name="trace_mask") # trace_mask is a Variable and not a Coordinate
419+
for coord in mdio_template.coordinate_names:
420+
_chunk_variable(ds=mdio_ds, variable_name=coord)
421+
382422
xr_dataset: xr_Dataset = to_xarray_dataset(mdio_ds=mdio_ds)
383423

384424
xr_dataset, drop_vars_delayed = _populate_coordinates(

src/mdio/core/utils_write.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111

1212
MAX_SIZE_LIVE_MASK = 512 * 1024**2
13+
MAX_COORDINATES_BYTES = 128 * 1024**2  # 128 MiB; target chunk size in bytes used for non-bool variables
1314

1415
JsonSerializable = str | int | float | bool | None | dict[str, "JsonSerializable"] | list["JsonSerializable"]
1516

uv.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)