@@ -354,6 +354,7 @@ def segy_to_mdio( # noqa: PLR0913, PLR0915
354354 ... grid_overrides={"HasDuplicates": True},
355355 ... )
356356 """
357+ print ("Entering segy_to_mdio" )
357358 index_names = index_names or [f"dim_{ i } " for i in range (len (index_bytes ))]
358359 index_types = index_types or ["int32" ] * len (index_bytes )
359360
@@ -368,6 +369,7 @@ def segy_to_mdio( # noqa: PLR0913, PLR0915
368369 storage_options_input = storage_options_input or {}
369370 storage_options_output = storage_options_output or {}
370371
372+ print ("pre-setup" )
371373 # Open SEG-Y with MDIO's SegySpec. Endianness will be inferred.
372374 mdio_spec = mdio_segy_spec ()
373375 segy_settings = SegySettings (storage_options = storage_options_input )
@@ -377,31 +379,43 @@ def segy_to_mdio( # noqa: PLR0913, PLR0915
377379 binary_header = segy .binary_header
378380 num_traces = segy .num_traces
379381
382+ print ("pre-index" )
380383 # Index the dataset using a spec that interprets the user provided index headers.
381- index_fields = []
384+ index_fields : list [ HeaderField ] = []
382385 for name , byte , format_ in zip (index_names , index_bytes , index_types , strict = True ):
383386 index_fields .append (HeaderField (name = name , byte = byte , format = format_ ))
384387 mdio_spec_grid = mdio_spec .customize (trace_header_fields = index_fields )
385388 segy_grid = SegyFile (url = segy_path , spec = mdio_spec_grid , settings = segy_settings )
386389
390+ print ("pre-get_grid_plan" )
387391 dimensions , chunksize , index_headers = get_grid_plan (
388392 segy_file = segy_grid ,
389393 return_headers = True ,
390394 chunksize = chunksize ,
391395 grid_overrides = grid_overrides ,
392396 )
393397 grid = Grid (dims = dimensions )
398+ print ("pre-grid_density_qc" )
394399 grid_density_qc (grid , num_traces )
400+ print ("pre-build_map" )
395401 grid .build_map (index_headers )
396402
397- # Check grid validity by comparing trace numbers
398- if np .sum (grid .live_mask ) != num_traces :
403+ print ("pre-valid_mask" )
404+ # Check grid validity by ensuring every trace's header-index is within dimension bounds
405+ valid_mask = np .ones (grid .num_traces , dtype = bool )
406+ for d_idx in range (len (grid .header_index_arrays )):
407+ coords = grid .header_index_arrays [d_idx ]
408+ valid_mask &= (coords < grid .shape [d_idx ])
409+ valid_count = int (np .count_nonzero (valid_mask ))
410+ if valid_count != num_traces :
399411 for dim_name in grid .dim_names :
400- dim_min , dim_max = grid .get_min (dim_name ), grid .get_max (dim_name )
412+ dim_min = grid .get_min (dim_name )
413+ dim_max = grid .get_max (dim_name )
401414 logger .warning ("%s min: %s max: %s" , dim_name , dim_min , dim_max )
402415 logger .warning ("Ingestion grid shape: %s." , grid .shape )
403- raise GridTraceCountError (np . sum ( grid . live_mask ) , num_traces )
416+ raise GridTraceCountError (valid_count , num_traces )
404417
418+ print ("pre-chunksize" )
405419 if chunksize is None :
406420 dim_count = len (index_names ) + 1
407421 if dim_count == 2 : # noqa: PLR2004
@@ -424,6 +438,7 @@ def segy_to_mdio( # noqa: PLR0913, PLR0915
424438 suffix = [str (idx ) for idx , value in enumerate (suffix ) if value is not None ]
425439 suffix = "" .join (suffix )
426440
441+ print ("pre-compressors" )
427442 compressors = get_compressor (lossless , compression_tolerance )
428443 header_dtype = segy .spec .trace .header .dtype .newbyteorder ("=" )
429444 var_conf = MDIOVariableConfig (
@@ -435,6 +450,7 @@ def segy_to_mdio( # noqa: PLR0913, PLR0915
435450 )
436451 config = MDIOCreateConfig (path = mdio_path_or_buffer , grid = grid , variables = [var_conf ])
437452
453+ print ("pre-create_empty" )
438454 root_group = create_empty (
439455 config ,
440456 overwrite = overwrite ,
@@ -446,23 +462,61 @@ def segy_to_mdio( # noqa: PLR0913, PLR0915
446462 data_array = data_group [f"chunked_{ suffix } " ]
447463 header_array = meta_group [f"chunked_{ suffix } _trace_headers" ]
448464
449- # Write actual live mask and metadata to empty MDIO
450- meta_group ["live_mask" ][:] = grid .live_mask [:]
451- nonzero_count = np .count_nonzero (grid .live_mask )
465+ print ("pre-live_mask" )
466+ live_mask_array = meta_group ["live_mask" ]
467+ # 'live_mask_array' has shape 'grid.shape[:-1]', i.e. every grid dimension except the last (sample) axis
468+ # Build a ChunkIterator over the live_mask (no sample axis)
469+ from mdio .core .indexing import ChunkIterator
470+
471+ chunker = ChunkIterator (live_mask_array , chunk_samples = False )
472+ for chunk_indices in chunker :
473+ # chunk_indices is a tuple of N–1 slice objects
474+ trace_ids = grid .get_traces_for_chunk (chunk_indices )
475+ if trace_ids .size == 0 :
476+ continue
477+
478+ # Build a temporary boolean block of shape = chunk shape
479+ block_shape = tuple (sl .stop - sl .start for sl in chunk_indices )
480+ block = np .zeros (block_shape , dtype = bool )
481+
482+ # Compute local coords within this block for each trace_id
483+ local_coords : list [np .ndarray ] = []
484+ for dim_idx , sl in enumerate (chunk_indices ):
485+ hdr_arr = grid .header_index_arrays [dim_idx ]
486+ local_idx = (hdr_arr [trace_ids ] - sl .start ).astype (int )
487+ local_coords .append (local_idx )
488+
489+ # Mark live cells in the temporary block
490+ block [tuple (local_coords )] = True
491+
492+ # Write the entire block to Zarr at once
493+ live_mask_array .set_basic_selection (selection = chunk_indices , value = block )
494+
495+ nonzero_count = grid .num_traces
496+
497+ print ("pre-write_attribute" )
452498 write_attribute (name = "trace_count" , zarr_group = root_group , attribute = nonzero_count )
453499 write_attribute (name = "text_header" , zarr_group = meta_group , attribute = text_header .split ("\n " ))
454500 write_attribute (name = "binary_header" , zarr_group = meta_group , attribute = binary_header .to_dict ())
455501
502+ print ("pre-to_zarr" )
456503 # Write traces
504+ zarr_root = mdio_path_or_buffer # the same path you passed earlier to create_empty
505+ data_var = f"data/chunked_{ suffix } "
506+ header_var = f"metadata/chunked_{ suffix } _trace_headers"
507+
457508 stats = blocked_io .to_zarr (
458509 segy_file = segy ,
459510 grid = grid ,
460- data_array = data_array ,
461- header_array = header_array ,
511+ zarr_root_path = zarr_root ,
512+ data_var_path = data_var ,
513+ header_var_path = header_var ,
462514 )
463515
516+ print ("pre-write_attribute" )
464517 # Write actual stats
465518 for key , value in stats .items ():
466519 write_attribute (name = key , zarr_group = root_group , attribute = value )
467520
468- zarr .consolidate_metadata (root_group .store )
521+ print ("pre-consolidate_metadata" )
522+ zarr .consolidate_metadata (root_group .store )
0 commit comments