Skip to content

Commit 1dcec15

Browse files
committed
Add docs for some methods in nanops_numba.py
1 parent d3d3c11 commit 1dcec15

File tree

1 file changed

+157
-0
lines changed

1 file changed

+157
-0
lines changed

pandas/core/nanops_numba.py

Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,25 @@ def is_null(x):
7575

7676

7777
def null_for_np_type(dtype):
78+
"""
79+
Return the appropriate null value for a given numpy dtype.
80+
81+
Parameters
82+
----------
83+
dtype : np.dtype
84+
NumPy data type to get null value for
85+
86+
Returns
87+
-------
88+
scalar
89+
NaT for datetime/timedelta types, np.nan for other types
90+
91+
Notes
92+
-----
93+
For datetime64 and timedelta64 dtypes (kind 'M' and 'm' respectively), returns
94+
the appropriate NaT (Not a Time) value. For all other dtypes,
95+
returns np.nan.
96+
"""
7897
if dtype.kind in "mM":
7998
return np.array(["NaT"], dtype=dtype)[0]
8099
else:
@@ -132,6 +151,25 @@ def _get_initial_value(
132151

133152

134153
def _njit_scalar_reduce(func):
154+
"""
155+
Decorator to create numba-compiled scalar reduction functions.
156+
157+
Parameters
158+
----------
159+
func : callable
160+
Scalar reduction function taking two arguments
161+
162+
Returns
163+
-------
164+
staticmethod
165+
Numba-compiled version of the function with standard signatures
166+
167+
Notes
168+
-----
169+
This decorator compiles the function with predefined signatures for
170+
common numeric types (float64, uint64, int64) and enables nogil mode
171+
for better performance in multithreaded environments.
172+
"""
135173
return staticmethod(nb.njit(_SCALAR_SIGNATURES, nogil=True)(func))
136174

137175

@@ -280,6 +318,26 @@ def _nb_reduce_arr_list_in_parallel(
280318
def reduction_return_type_and_empty_result_for_op_and_type(
281319
dtype, op: Literal["count", "min", "max", "sum", "sum_square", "mean"]
282320
):
321+
"""
322+
Determine the return dtype and empty result value for a reduction operation.
323+
324+
Parameters
325+
----------
326+
dtype : np.dtype
327+
Input array dtype
328+
op : {"count", "min", "max", "sum", "sum_square", "mean"}
329+
Reduction operation to perform
330+
331+
Returns
332+
-------
333+
tuple
334+
(return_dtype, empty_result_value) for the given operation and input dtype
335+
336+
Notes
337+
-----
338+
This function defines the type promotion rules and empty result values
339+
for various reduction operations on different input dtypes.
340+
"""
283341
if op == "count":
284342
return np.int64, 0
285343
elif op in ("min", "max"):
@@ -311,6 +369,28 @@ def reduction_return_type_and_empty_result_for_op_and_type(
311369

312370

313371
def _nullify_below_mincount(result, count, min_count):
372+
"""
373+
Set result elements to null where count is below minimum threshold.
374+
375+
Parameters
376+
----------
377+
result : np.ndarray
378+
Result array to modify
379+
count : np.ndarray
380+
Count of valid values for each result element
381+
min_count : int
382+
Minimum number of non-null values required
383+
384+
Returns
385+
-------
386+
np.ndarray
387+
Modified result array with nullified values
388+
389+
Notes
390+
-----
391+
For integer dtypes (signed or unsigned, kind 'i' or 'u'), uses MIN_INT as null value.
392+
For all other dtypes, uses np.nan as null value.
393+
"""
314394
if result.dtype.kind in "ui":
315395
null = MIN_INT
316396
else:
@@ -340,6 +420,62 @@ def _chunk_arr_into_arr_list(
340420
axis: Optional[int],
341421
mask: Optional[np.ndarray] = None,
342422
) -> NumbaList:
423+
"""
424+
Split arrays into chunks for parallel processing in reduction operations.
425+
426+
Parameters
427+
----------
428+
values : np.ndarray
429+
Input array to be chunked. Must be 1D or 2D.
430+
multi_threading : bool
431+
If True, split array into multiple chunks for parallel processing.
432+
If False, return single chunk (no parallelization).
433+
axis : int or None
434+
Reduction axis. For 2D arrays:
435+
- axis=0: transpose array so reduction operates along columns
436+
- axis=1: keep array as-is, reduction operates along rows
437+
- axis=None: flatten to 1D for scalar reduction
438+
mask : np.ndarray, optional
439+
Boolean mask indicating null values. If provided, will be split
440+
consistently with values array.
441+
442+
Returns
443+
-------
444+
tuple
445+
- arr_list : NumbaList
446+
List of array chunks ready for parallel processing
447+
- mask_list : NumbaList
448+
List of corresponding mask chunks (empty if mask=None)
449+
- final_length : int
450+
Length of the final reduction dimension. 0 for 1D arrays,
451+
number of columns/rows for 2D arrays.
452+
453+
Notes
454+
-----
455+
Thread count is determined automatically based on array size when
456+
multi_threading=True, with a maximum of 6 threads and minimum of 1.
457+
Arrays smaller than 1 million elements use single threading.
458+
459+
For 1D arrays, the array is split into n_threads chunks along axis 0.
460+
For 2D arrays, the array is either transposed (axis=0) or used as-is
461+
(axis=1) to prepare for row-wise or column-wise reductions.
462+
463+
Raises
464+
------
465+
ValueError
466+
If input array has more than 2 dimensions.
467+
468+
Examples
469+
--------
470+
>>> arr = np.array([[1, 2, 3], [4, 5, 6]])
471+
>>> arr_list, mask_list, final_length = _chunk_arr_into_arr_list(
472+
... arr, multi_threading=False, axis=0
473+
... )
474+
>>> final_length
475+
3
476+
>>> len(arr_list)  # transposed, so 3 columns become 3 arrays
477+
3
478+
"""
343479
ndim = values.ndim
344480
if multi_threading:
345481
# TODO: be smarter about this choice. numba is handling the distribution of the compute
@@ -560,6 +696,27 @@ def nb_reduce(
560696

561697

562698
def _cast_to_timelike(arr, to_dtype):
699+
"""
700+
Convert a float array to timelike (datetime/timedelta) dtype.
701+
702+
Parameters
703+
----------
704+
arr : np.ndarray
705+
Float array to convert
706+
to_dtype : np.dtype
707+
Target datetime or timedelta dtype
708+
709+
Returns
710+
-------
711+
np.ndarray
712+
Array converted to timelike dtype with NaN values replaced by MIN_INT
713+
714+
Notes
715+
-----
716+
This function is used to convert float arrays back to timelike dtypes
717+
after reduction operations. NaN values are replaced with MIN_INT before
718+
conversion to preserve null representation in integer-based time types.
719+
"""
563720
isnan = np.isnan(arr)
564721
if isnan.any():
565722
arr[isnan] = MIN_INT

0 commit comments

Comments
 (0)