
Commit 78c8f8a

Begin debugging grid sparsity OOM issues.
1 parent 8e98aaf · commit 78c8f8a

File tree

4 files changed: +241 additions, -55 deletions


src/mdio/converters/segy.py

Lines changed: 18 additions & 2 deletions
@@ -375,38 +375,47 @@ def segy_to_mdio( # noqa: PLR0913, PLR0915, PLR0912
     storage_options_input = storage_options_input or {}
     storage_options_output = storage_options_output or {}
 
+    print("Opening SEG-Y...")
+
     # Open SEG-Y with MDIO's SegySpec. Endianness will be inferred.
     mdio_spec = mdio_segy_spec()
+    print("MDIO spec created")
     segy_settings = SegySettings(storage_options=storage_options_input)
     segy = SegyFile(url=segy_path, spec=mdio_spec, settings=segy_settings)
+    print("SEG-Y file opened")
 
     text_header = segy.text_header
     binary_header = segy.binary_header
     num_traces = segy.num_traces
-
+    print("num_traces", num_traces)
     # Index the dataset using a spec that interprets the user provided index headers.
     index_fields = []
     for name, byte, format_ in zip(index_names, index_bytes, index_types, strict=True):
         index_fields.append(HeaderField(name=name, byte=byte, format=format_))
     mdio_spec_grid = mdio_spec.customize(trace_header_fields=index_fields)
     segy_grid = SegyFile(url=segy_path, spec=mdio_spec_grid, settings=segy_settings)
-
+    print("SEGY grid created")
     dimensions, chunksize, index_headers = get_grid_plan(
         segy_file=segy_grid,
         return_headers=True,
         chunksize=chunksize,
         grid_overrides=grid_overrides,
     )
+    print("grid plan created")
     grid = Grid(dims=dimensions)
+    print("grid created")
     grid_density_qc(grid, num_traces)
+    print("grid density qc done")
     grid.build_map(index_headers)
 
     # Check grid validity by ensuring every trace's header-index is within dimension bounds
     valid_mask = np.ones(grid.num_traces, dtype=bool)
+    print("valid_mask shape", valid_mask.shape)
     for d_idx in range(len(grid.header_index_arrays)):
         coords = grid.header_index_arrays[d_idx]
         valid_mask &= coords < grid.shape[d_idx]
     valid_count = int(np.count_nonzero(valid_mask))
+    print("valid_count", valid_count)
     if valid_count != num_traces:
         for dim_name in grid.dim_names:
             dim_min = grid.get_min(dim_name)

@@ -417,6 +426,8 @@ def segy_to_mdio( # noqa: PLR0913, PLR0915, PLR0912
 
     import gc
 
+    # raise Exception("Stop here")
+
     del valid_mask
     gc.collect()
 

@@ -453,6 +464,8 @@ def segy_to_mdio( # noqa: PLR0913, PLR0915, PLR0912
     )
     config = MDIOCreateConfig(path=mdio_path_or_buffer, grid=grid, variables=[var_conf])
 
+    print("Creating empty...")
+
     root_group = create_empty(
         config,
         overwrite=overwrite,

@@ -464,7 +477,10 @@ def segy_to_mdio( # noqa: PLR0913, PLR0915, PLR0912
     data_array = data_group[f"chunked_{suffix}"]
     header_array = meta_group[f"chunked_{suffix}_trace_headers"]
 
+    print("Creating live mask...")
+
     live_mask_array = meta_group["live_mask"]
+    print(live_mask_array.shape)
     # 'live_mask_array' has the same first N-1 dims as 'grid.shape[:-1]'
     # Build a ChunkIterator over the live_mask (no sample axis)
     from mdio.core.indexing import ChunkIterator
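
Note on the grid validity check instrumented above: it only keeps traces whose header-derived index falls inside the grid bounds on every indexed dimension, building the mask one dimension at a time so peak memory stays at a single boolean array of num_traces elements. A minimal NumPy sketch of that masking step — the header_index_arrays and shape attributes come from the diff, but the values below are made up for illustration:

import numpy as np

# Hypothetical stand-ins for grid.header_index_arrays and grid.shape (values made up).
header_index_arrays = [np.array([0, 1, 5, 2]), np.array([3, 9, 1, 4])]
grid_shape = (4, 8, 1500)  # last axis is the sample axis and is not indexed here

num_traces = len(header_index_arrays[0])
valid_mask = np.ones(num_traces, dtype=bool)
for d_idx, coords in enumerate(header_index_arrays):
    valid_mask &= coords < grid_shape[d_idx]  # keep traces inside this dimension's bounds

valid_count = int(np.count_nonzero(valid_mask))
print(valid_count)  # 2 -> only traces 0 and 3 fall inside the 4 x 8 grid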

src/mdio/segy/geometry.py

Lines changed: 77 additions & 48 deletions
@@ -202,18 +202,22 @@ def create_counter(
     total_depth: int,
     unique_headers: dict[str, NDArray],
     header_names: list[str],
-) -> dict[str, dict]:
-    """Helper function to create dictionary tree for counting trace key for auto index."""
-    if depth == total_depth:
-        return 0
-
-    counter = {}
-
-    header_key = header_names[depth]
-    for header in unique_headers[header_key]:
-        counter[header] = create_counter(depth + 1, total_depth, unique_headers, header_names)
-
-    return counter
+) -> dict[tuple, int]:
+    """Helper function to create flat counter dictionary for counting trace keys for auto index.
+
+    This is a memory-efficient version that returns an empty dict since we now process
+    traces directly in create_trace_index without pre-allocating the counter structure.
+
+    Args:
+        depth: Current recursion depth (unused in new implementation)
+        total_depth: Total depth of headers (unused in new implementation)
+        unique_headers: Dictionary of unique header values (unused in new implementation)
+        header_names: List of header names (unused in new implementation)
+
+    Returns:
+        Empty dictionary - actual counting happens in create_trace_index
+    """
+    return {}
 
 
 def create_trace_index(

@@ -223,63 +227,88 @@ def create_trace_index(
     header_names: list[str],
     dtype: DTypeLike = np.int16,
 ) -> NDArray | None:
-    """Update dictionary counter tree for counting trace key for auto index."""
+    """Memory-efficient trace index creation that processes traces in a single pass.
+
+    Args:
+        depth: Number of header dimensions to process
+        counter: Counter dictionary (unused in new implementation)
+        index_headers: numpy array with index headers
+        header_names: List of header field names
+        dtype: numpy type for value of created trace header
+
+    Returns:
+        HeaderArray with added 'trace' field containing trace indices, or None if depth is 0
+    """
     if depth == 0:
         # If there's no hierarchical depth, no tracing needed.
         return None
 
-    # Add index header
+    # Add trace field
     trace_no_field = np.zeros(index_headers.shape, dtype=dtype)
     index_headers = rfn.append_fields(index_headers, "trace", trace_no_field, usemask=False)
-
-    # Extract the relevant columns upfront
-    headers = [index_headers[name] for name in header_names[:depth]]
-    for idx, idx_values in enumerate(zip(*headers, strict=True)):
-        if depth == 1:
-            counter[idx_values[0]] += 1
-            index_headers["trace"][idx] = counter[idx_values[0]]
-        else:
-            sub_counter = counter
-            for idx_value in idx_values[:-1]:
-                sub_counter = sub_counter[idx_value]
-            sub_counter[idx_values[-1]] += 1
-            index_headers["trace"][idx] = sub_counter[idx_values[-1]]
-
+
+    # Use a flat dictionary with tuple keys instead of nested dictionaries
+    # This avoids pre-allocating memory for all possible combinations
+    flat_counter = {}
+
+    # Only use the first 'depth' header names
+    relevant_header_names = header_names[:depth]
+
+    # Process each trace in a single pass
+    for idx in range(len(index_headers)):
+        # Create tuple key from header values for this trace
+        key = tuple(index_headers[name][idx] for name in relevant_header_names)
+
+        # Increment counter for this combination and assign trace number
+        flat_counter[key] = flat_counter.get(key, 0) + 1
+        index_headers["trace"][idx] = flat_counter[key]
+
     return index_headers
 
 
 def analyze_non_indexed_headers(index_headers: HeaderArray, dtype: DTypeLike = np.int16) -> NDArray:
     """Check input headers for SEG-Y input to help determine geometry.
 
-    This function reads in trace_qc_count headers and finds the unique cable values. Then, it
-    checks to make sure channel numbers for different cables do not overlap.
+    This function reads in trace_qc_count headers and creates trace indices efficiently.
+    Uses a memory-efficient approach that doesn't pre-allocate large nested dictionaries.
 
     Args:
         index_headers: numpy array with index headers
         dtype: numpy type for value of created trace header.
 
     Returns:
-        Dict container header name as key and numpy array of values as value
+        HeaderArray with added 'trace' field containing trace indices
     """
-    # Find unique cable ids
     t_start = time.perf_counter()
-    unique_headers = {}
-    total_depth = 0
-    header_names = []
-    for header_key in index_headers.dtype.names:
-        if header_key != "trace":
-            unique_headers[header_key] = np.sort(np.unique(index_headers[header_key]))
-            header_names.append(header_key)
-            total_depth += 1
-
-    counter = create_counter(0, total_depth, unique_headers, header_names)
-
-    index_headers = create_trace_index(
-        total_depth, counter, index_headers, header_names, dtype=dtype
-    )
-
+
+    # Get header names excluding 'trace' if it already exists
+    header_names = [name for name in index_headers.dtype.names if name != "trace"]
+
+    if not header_names:
+        # No headers to process, just add trace numbers sequentially
+        trace_no_field = np.arange(1, len(index_headers) + 1, dtype=dtype)
+        index_headers = rfn.append_fields(index_headers, "trace", trace_no_field, usemask=False)
+        return index_headers
+
+    # Create trace field
+    trace_no_field = np.zeros(index_headers.shape, dtype=dtype)
+    index_headers = rfn.append_fields(index_headers, "trace", trace_no_field, usemask=False)
+
+    # Use a flat dictionary with tuple keys instead of nested dictionaries
+    # This avoids pre-allocating memory for all possible combinations
+    counter = {}
+
+    # Process each trace in a single pass
+    for idx in range(len(index_headers)):
+        # Create tuple key from header values for this trace
+        key = tuple(index_headers[name][idx] for name in header_names)
+
+        # Increment counter for this combination and assign trace number
+        counter[key] = counter.get(key, 0) + 1
+        index_headers["trace"][idx] = counter[key]
+
     t_stop = time.perf_counter()
-    logger.debug("Time spent generating trace index: %.4f s", t_start - t_stop)
+    logger.debug("Time spent generating trace index: %.4f s", t_stop - t_start)
     return index_headers
 
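Note on the rewrite above: the old code pre-built a nested counter tree with one entry for every possible combination of unique header values, while the new code keeps a single flat dict keyed by the tuple of header values seen on each trace, so memory grows only with the combinations that actually occur. A minimal, self-contained sketch of that flat-counter idea — the field names "cable" and "channel" and the toy data are illustrative, not taken from this file:

import numpy as np
from numpy.lib import recfunctions as rfn

# Toy structured header array; the field names are illustrative only.
headers = np.array(
    [(101, 1), (101, 2), (102, 1), (101, 1)],
    dtype=[("cable", "i4"), ("channel", "i4")],
)
headers = rfn.append_fields(headers, "trace", np.zeros(len(headers), dtype=np.int16), usemask=False)

counter: dict[tuple, int] = {}
for idx in range(len(headers)):
    key = tuple(headers[name][idx] for name in ("cable", "channel"))
    counter[key] = counter.get(key, 0) + 1  # count only combinations that actually occur
    headers["trace"][idx] = counter[key]

print(headers["trace"])  # [1 1 1 2] -> the repeated (101, 1) pair gets trace number 2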

src/mdio/segy/parsers.py

Lines changed: 102 additions & 1 deletion
@@ -39,7 +39,9 @@ def parse_index_headers(
         current block. Array is of type byte_type except IBM32 which is mapped to FLOAT32.
     """
     trace_count = segy_file.num_traces
+    print(f"trace_count: {trace_count}")
     n_blocks = int(ceil(trace_count / block_size))
+    print(f"n_blocks: {n_blocks}")
 
     trace_ranges = []
     for idx in range(n_blocks):

@@ -69,5 +71,104 @@ def parse_index_headers(
     # This executes the lazy work.
     headers: list[HeaderArray] = list(lazy_work)
 
+    print("Concatenating headers...", flush=True)
+    # raise Exception("Stop here")
+    # ret = memory_efficient_concatenate(headers)
+    ret = np.concatenate(headers)
+    print("Finished!", flush=True)
     # Merge blocks before return
-    return np.concatenate(headers)
+    return ret
+
+
+def memory_efficient_concatenate(headers: list[HeaderArray]) -> HeaderArray:
+    """Memory-efficient concatenation for many small header arrays.
+
+    Pre-allocates the target array and copies data in place to avoid
+    the memory fragmentation and intermediate allocations that occur
+    with np.concatenate on many small arrays.
+
+    Args:
+        headers: List of HeaderArray objects to concatenate
+
+    Returns:
+        Single concatenated HeaderArray
+    """
+
+    # Heartbeat 1: Function entry
+    with open("heartbeat_1_entry.txt", "w") as f:
+        f.write("Entered memory_efficient_concatenate\n")
+        f.flush()
+
+    if not headers:
+        raise ValueError("Cannot concatenate empty list of arrays")
+
+    # Heartbeat 2: Before size calculation
+    with open("heartbeat_2_calculating_size.txt", "w") as f:
+        f.write(f"Starting size calculation for {len(headers)} arrays\n")
+        f.flush()
+
+    # Calculate total size and get array metadata
+    total_length = sum(len(arr) for arr in headers)
+    first_array = headers[0]
+    target_dtype = first_array.dtype
+
+    # Heartbeat 3: Before allocation
+    estimated_size_mb = total_length * target_dtype.itemsize / 1024**2
+    with open("heartbeat_3_before_allocation.txt", "w") as f:
+        f.write(f"About to allocate {total_length:,} elements, estimated {estimated_size_mb:.1f} MB\n")
+        f.flush()
+
+    print(f"Pre-allocating result array: {total_length:,} elements, "
+          f"dtype={target_dtype}, estimated size={estimated_size_mb:.1f} MB")
+
+    # Pre-allocate the final array - this is the key optimization
+    result = np.empty(total_length, dtype=target_dtype)
+
+    # Heartbeat 4: After allocation
+    actual_size_mb = result.nbytes / 1024**2
+    with open("heartbeat_4_allocated.txt", "w") as f:
+        f.write(f"Successfully allocated array, actual size {actual_size_mb:.1f} MB\n")
+        f.flush()
+
+    # Copy arrays sequentially into pre-allocated space
+    current_pos = 0
+    batch_size = 100  # Process in batches to provide progress feedback
+
+    # Heartbeat 5: Before copying loop
+    with open("heartbeat_5_start_copying.txt", "w") as f:
+        f.write("Starting copy loop\n")
+        f.flush()
+
+    for i in range(0, len(headers), batch_size):
+        batch_end = min(i + batch_size, len(headers))
+
+        # Process this batch
+        for j in range(i, batch_end):
+            arr = headers[j]
+            if arr is None:  # Skip if already processed
+                continue
+
+            end_pos = current_pos + len(arr)
+
+            # Direct copy into pre-allocated space - no intermediate allocations
+            result[current_pos:end_pos] = arr
+            current_pos = end_pos
+
+            # Help garbage collector by clearing reference
+            headers[j] = None
+
+        # Progress update and heartbeat for major milestones
+        if batch_end % (5 * batch_size) == 0 or batch_end == len(headers):
+            progress_pct = (batch_end / len(headers)) * 100
+            with open(f"heartbeat_6_progress_{int(progress_pct)}.txt", "w") as f:
+                f.write(f"Progress: {progress_pct:.1f}% ({batch_end:,}/{len(headers):,})\n")
+                f.flush()
+            print(f"Concatenation progress: {progress_pct:.1f}% ({batch_end:,}/{len(headers):,} arrays)")
+
+    # Heartbeat 7: Completion
+    with open("heartbeat_7_complete.txt", "w") as f:
+        f.write(f"Concatenation complete. Final size: {result.nbytes / 1024**2:.1f} MB\n")
+        f.flush()
+
+    print(f"Concatenation complete. Final array size: {result.nbytes / 1024**2:.1f} MB")
+    return result
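
Note on memory_efficient_concatenate above (currently commented out in parse_index_headers in favour of np.concatenate): it pre-allocates one destination array, copies each block into place, and drops references as it goes, so there is only a single large allocation; the heartbeat files exist purely for debugging. A stripped-down sketch of the same pre-allocate-and-copy pattern, using a hypothetical preallocated_concat helper name and toy data:

import numpy as np

def preallocated_concat(blocks: list) -> np.ndarray:
    """Concatenate 1-D blocks sharing a dtype into one pre-allocated array."""
    if not blocks:
        raise ValueError("Cannot concatenate an empty list of arrays")
    total = sum(len(b) for b in blocks)
    out = np.empty(total, dtype=blocks[0].dtype)  # single allocation up front
    pos = 0
    for i, block in enumerate(blocks):
        out[pos:pos + len(block)] = block  # copy in place, no intermediate arrays
        pos += len(block)
        blocks[i] = None  # drop the reference so the block can be freed early
    return out

# Example: three small blocks merged into one contiguous array.
parts = [np.arange(3), np.arange(3, 5), np.arange(5, 9)]
print(preallocated_concat(parts))  # [0 1 2 3 4 5 6 7 8]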
