@@ -202,18 +202,22 @@ def create_counter(
202202 total_depth : int ,
203203 unique_headers : dict [str , NDArray ],
204204 header_names : list [str ],
205- ) -> dict [str , dict ]:
206- """Helper function to create dictionary tree for counting trace key for auto index."""
207- if depth == total_depth :
208- return 0
209-
210- counter = {}
211-
212- header_key = header_names [depth ]
213- for header in unique_headers [header_key ]:
214- counter [header ] = create_counter (depth + 1 , total_depth , unique_headers , header_names )
215-
216- return counter
205+ ) -> dict [tuple , int ]:
206+ """Helper function to create flat counter dictionary for counting trace keys for auto index.
207+
208+ This is a memory-efficient version that returns an empty dict since we now process
209+ traces directly in create_trace_index without pre-allocating the counter structure.
210+
211+ Args:
212+ depth: Current recursion depth (unused in new implementation)
213+ total_depth: Total depth of headers (unused in new implementation)
214+ unique_headers: Dictionary of unique header values (unused in new implementation)
215+ header_names: List of header names (unused in new implementation)
216+
217+ Returns:
218+ Empty dictionary - actual counting happens in create_trace_index
219+ """
220+ return {}
217221
218222
219223def create_trace_index (
@@ -223,63 +227,88 @@ def create_trace_index(
223227 header_names : list [str ],
224228 dtype : DTypeLike = np .int16 ,
225229) -> NDArray | None :
226- """Update dictionary counter tree for counting trace key for auto index."""
230+ """Memory-efficient trace index creation that processes traces in a single pass.
231+
232+ Args:
233+ depth: Number of header dimensions to process
234+ counter: Counter dictionary (unused in new implementation)
235+ index_headers: numpy array with index headers
236+ header_names: List of header field names
237+ dtype: numpy type for value of created trace header
238+
239+ Returns:
240+ HeaderArray with added 'trace' field containing trace indices, or None if depth is 0
241+ """
227242 if depth == 0 :
228243 # If there's no hierarchical depth, no tracing needed.
229244 return None
230245
231- # Add index header
246+ # Add trace field
232247 trace_no_field = np .zeros (index_headers .shape , dtype = dtype )
233248 index_headers = rfn .append_fields (index_headers , "trace" , trace_no_field , usemask = False )
234-
235- # Extract the relevant columns upfront
236- headers = [index_headers [name ] for name in header_names [:depth ]]
237- for idx , idx_values in enumerate (zip (* headers , strict = True )):
238- if depth == 1 :
239- counter [idx_values [0 ]] += 1
240- index_headers ["trace" ][idx ] = counter [idx_values [0 ]]
241- else :
242- sub_counter = counter
243- for idx_value in idx_values [:- 1 ]:
244- sub_counter = sub_counter [idx_value ]
245- sub_counter [idx_values [- 1 ]] += 1
246- index_headers ["trace" ][idx ] = sub_counter [idx_values [- 1 ]]
247-
249+
250+ # Use a flat dictionary with tuple keys instead of nested dictionaries
251+ # This avoids pre-allocating memory for all possible combinations
252+ flat_counter = {}
253+
254+ # Only use the first 'depth' header names
255+ relevant_header_names = header_names [:depth ]
256+
257+ # Process each trace in a single pass
258+ for idx in range (len (index_headers )):
259+ # Create tuple key from header values for this trace
260+ key = tuple (index_headers [name ][idx ] for name in relevant_header_names )
261+
262+ # Increment counter for this combination and assign trace number
263+ flat_counter [key ] = flat_counter .get (key , 0 ) + 1
264+ index_headers ["trace" ][idx ] = flat_counter [key ]
265+
248266 return index_headers
249267
250268
251269def analyze_non_indexed_headers (index_headers : HeaderArray , dtype : DTypeLike = np .int16 ) -> NDArray :
252270 """Check input headers for SEG-Y input to help determine geometry.
253271
254- This function reads in trace_qc_count headers and finds the unique cable values. Then, it
255- checks to make sure channel numbers for different cables do not overlap .
272+ This function reads in trace_qc_count headers and creates trace indices efficiently.
273+ Uses a memory-efficient approach that doesn't pre-allocate large nested dictionaries .
256274
257275 Args:
258276 index_headers: numpy array with index headers
259277 dtype: numpy type for value of created trace header.
260278
261279 Returns:
262- Dict container header name as key and numpy array of values as value
280+ HeaderArray with added 'trace' field containing trace indices
263281 """
264- # Find unique cable ids
265282 t_start = time .perf_counter ()
266- unique_headers = {}
267- total_depth = 0
268- header_names = []
269- for header_key in index_headers .dtype .names :
270- if header_key != "trace" :
271- unique_headers [header_key ] = np .sort (np .unique (index_headers [header_key ]))
272- header_names .append (header_key )
273- total_depth += 1
274-
275- counter = create_counter (0 , total_depth , unique_headers , header_names )
276-
277- index_headers = create_trace_index (
278- total_depth , counter , index_headers , header_names , dtype = dtype
279- )
280-
283+
284+ # Get header names excluding 'trace' if it already exists
285+ header_names = [name for name in index_headers .dtype .names if name != "trace" ]
286+
287+ if not header_names :
288+ # No headers to process, just add trace numbers sequentially
289+ trace_no_field = np .arange (1 , len (index_headers ) + 1 , dtype = dtype )
290+ index_headers = rfn .append_fields (index_headers , "trace" , trace_no_field , usemask = False )
291+ return index_headers
292+
293+ # Create trace field
294+ trace_no_field = np .zeros (index_headers .shape , dtype = dtype )
295+ index_headers = rfn .append_fields (index_headers , "trace" , trace_no_field , usemask = False )
296+
297+ # Use a flat dictionary with tuple keys instead of nested dictionaries
298+ # This avoids pre-allocating memory for all possible combinations
299+ counter = {}
300+
301+ # Process each trace in a single pass
302+ for idx in range (len (index_headers )):
303+ # Create tuple key from header values for this trace
304+ key = tuple (index_headers [name ][idx ] for name in header_names )
305+
306+ # Increment counter for this combination and assign trace number
307+ counter [key ] = counter .get (key , 0 ) + 1
308+ index_headers ["trace" ][idx ] = counter [key ]
309+
281310 t_stop = time .perf_counter ()
282- logger .debug ("Time spent generating trace index: %.4f s" , t_start - t_stop )
311+ logger .debug ("Time spent generating trace index: %.4f s" , t_stop - t_start )
283312 return index_headers
284313
285314
0 commit comments