@@ -75,6 +75,25 @@ def is_null(x):
75
75
76
76
77
77
def null_for_np_type (dtype ):
78
+ """
79
+ Return the appropriate null value for a given numpy dtype.
80
+
81
+ Parameters
82
+ ----------
83
+ dtype : np.dtype
84
+ NumPy data type to get null value for
85
+
86
+ Returns
87
+ -------
88
+ scalar
89
+ NaT for datetime/timedelta types, np.nan for other types
90
+
91
+ Notes
92
+ -----
93
+ For datetime64 and timedelta64 dtypes (kind 'm' or 'M'), returns
94
+ the appropriate NaT (Not a Time) value. For all other dtypes,
95
+ returns np.nan.
96
+ """
78
97
if dtype .kind in "mM" :
79
98
return np .array (["NaT" ], dtype = dtype )[0 ]
80
99
else :
@@ -132,6 +151,25 @@ def _get_initial_value(
132
151
133
152
134
153
def _njit_scalar_reduce(func):
    """
    Wrap a scalar reduction kernel as a numba-compiled static method.

    Parameters
    ----------
    func : callable
        Plain Python function to be compiled by numba.

    Returns
    -------
    staticmethod
        ``func`` compiled with ``nb.njit`` against ``_SCALAR_SIGNATURES``
        and ``nogil=True``, wrapped in ``staticmethod``.

    Notes
    -----
    The explicit signatures are taken from the module-level
    ``_SCALAR_SIGNATURES`` (defined outside this function). Passing
    ``nogil=True`` asks numba to release the GIL while the compiled
    code runs.
    """
    # Build the configured decorator first, then apply it to the kernel.
    compile_with_signatures = nb.njit(_SCALAR_SIGNATURES, nogil=True)
    compiled_kernel = compile_with_signatures(func)
    return staticmethod(compiled_kernel)
136
174
137
175
@@ -280,6 +318,26 @@ def _nb_reduce_arr_list_in_parallel(
280
318
def reduction_return_type_and_empty_result_for_op_and_type (
281
319
dtype , op : Literal ["count" , "min" , "max" , "sum" , "sum_square" , "mean" ]
282
320
):
321
+ """
322
+ Determine the return dtype and empty result value for a reduction operation.
323
+
324
+ Parameters
325
+ ----------
326
+ dtype : np.dtype
327
+ Input array dtype
328
+ op : {"count", "min", "max", "sum", "sum_square", "mean"}
329
+ Reduction operation to perform
330
+
331
+ Returns
332
+ -------
333
+ tuple
334
+ (return_dtype, empty_result_value) for the given operation and input dtype
335
+
336
+ Notes
337
+ -----
338
+ This function defines the type promotion rules and empty result values
339
+ for various reduction operations on different input dtypes.
340
+ """
283
341
if op == "count" :
284
342
return np .int64 , 0
285
343
elif op in ("min" , "max" ):
@@ -311,6 +369,28 @@ def reduction_return_type_and_empty_result_for_op_and_type(
311
369
312
370
313
371
def _nullify_below_mincount (result , count , min_count ):
372
+ """
373
+ Set result elements to null where count is below minimum threshold.
374
+
375
+ Parameters
376
+ ----------
377
+ result : np.ndarray
378
+ Result array to modify
379
+ count : np.ndarray
380
+ Count of valid values for each result element
381
+ min_count : int
382
+ Minimum number of non-null values required
383
+
384
+ Returns
385
+ -------
386
+ np.ndarray
387
+ Modified result array with nullified values
388
+
389
+ Notes
390
+ -----
391
+ For integer dtypes, signed or unsigned (kind 'i' or 'u'), uses MIN_INT as null value.
392
+ For all other dtypes, uses np.nan as null value.
393
+ """
314
394
if result .dtype .kind in "ui" :
315
395
null = MIN_INT
316
396
else :
@@ -340,6 +420,62 @@ def _chunk_arr_into_arr_list(
340
420
axis : Optional [int ],
341
421
mask : Optional [np .ndarray ] = None ,
342
422
) -> NumbaList :
423
+ """
424
+ Split arrays into chunks for parallel processing in reduction operations.
425
+
426
+ Parameters
427
+ ----------
428
+ values : np.ndarray
429
+ Input array to be chunked. Must be 1D or 2D.
430
+ multi_threading : bool
431
+ If True, split array into multiple chunks for parallel processing.
432
+ If False, return single chunk (no parallelization).
433
+ axis : int or None
434
+ Reduction axis. For 2D arrays:
435
+ - axis=0: transpose array so reduction operates along columns
436
+ - axis=1: keep array as-is, reduction operates along rows
437
+ - axis=None: flatten to 1D for scalar reduction
438
+ mask : np.ndarray, optional
439
+ Boolean mask indicating null values. If provided, will be split
440
+ consistently with values array.
441
+
442
+ Returns
443
+ -------
444
+ tuple
445
+ - arr_list : NumbaList
446
+ List of array chunks ready for parallel processing
447
+ - mask_list : NumbaList
448
+ List of corresponding mask chunks (empty if mask=None)
449
+ - final_length : int
450
+ Length of the final reduction dimension. 0 for 1D arrays,
451
+ number of columns/rows for 2D arrays.
452
+
453
+ Notes
454
+ -----
455
+ Thread count is determined automatically based on array size when
456
+ multi_threading=True, with a maximum of 6 threads and minimum of 1.
457
+ Arrays smaller than 1 million elements use single threading.
458
+
459
+ For 1D arrays, the array is split into n_threads chunks along axis 0.
460
+ For 2D arrays, the array is either transposed (axis=0) or used as-is
461
+ (axis=1) to prepare for row-wise or column-wise reductions.
462
+
463
+ Raises
464
+ ------
465
+ ValueError
466
+ If input array has more than 2 dimensions.
467
+
468
+ Examples
469
+ --------
470
+ >>> arr = np.array([[1, 2, 3], [4, 5, 6]])
471
+ >>> arr_list, mask_list, final_length = _chunk_arr_into_arr_list(
472
+ ... arr, multi_threading=False, axis=0
473
+ ... )
474
+ >>> final_length
475
+ 3
476
+ >>> len(arr_list)
477
+ 3 # transposed, so 3 columns become 3 arrays
478
+ """
343
479
ndim = values .ndim
344
480
if multi_threading :
345
481
# TODO: be smarter about this choice. numba is handling the distribution of the compute
@@ -560,6 +696,27 @@ def nb_reduce(
560
696
561
697
562
698
def _cast_to_timelike (arr , to_dtype ):
699
+ """
700
+ Convert a float array to timelike (datetime/timedelta) dtype.
701
+
702
+ Parameters
703
+ ----------
704
+ arr : np.ndarray
705
+ Float array to convert
706
+ to_dtype : np.dtype
707
+ Target datetime or timedelta dtype
708
+
709
+ Returns
710
+ -------
711
+ np.ndarray
712
+ Array converted to timelike dtype with NaN values replaced by MIN_INT
713
+
714
+ Notes
715
+ -----
716
+ This function is used to convert float arrays back to timelike dtypes
717
+ after reduction operations. NaN values are replaced with MIN_INT, in place
718
+ on the input ``arr`` itself, before conversion to preserve null representation in integer-based time types.
719
+ """
563
720
isnan = np .isnan (arr )
564
721
if isnan .any ():
565
722
arr [isnan ] = MIN_INT
0 commit comments