Add docs for some methods in nanops_numba.py

eoincondron · eoincondron · commit 1dcec15f4e62 · 2025-09-26T11:48:16.000+01:00
diff --git a/pandas/core/nanops_numba.py b/pandas/core/nanops_numba.py
@@ -75,6 +75,25 @@ def is_null(x):
 
 
 def null_for_np_type(dtype):
+    """
+    Return the appropriate null value for a given numpy dtype.
+
+    Parameters
+    ----------
+    dtype : np.dtype
+        NumPy data type to get null value for
+
+    Returns
+    -------
+    scalar
+        NaT for datetime/timedelta types, np.nan for other types
+
+    Notes
+    -----
+    For datetime64 (kind 'M') and timedelta64 (kind 'm') dtypes, returns
+    the appropriate NaT (Not a Time) value. For all other dtypes,
+    returns np.nan.
+    """
     if dtype.kind in "mM":
         return np.array(["NaT"], dtype=dtype)[0]
     else:
@@ -132,6 +151,25 @@ def _get_initial_value(
 
 
 def _njit_scalar_reduce(func):
+    """
+    Decorator to create numba-compiled scalar reduction functions.
+
+    Parameters
+    ----------
+    func : callable
+        Scalar reduction function taking two arguments
+
+    Returns
+    -------
+    staticmethod
+        Numba-compiled version of the function with standard signatures
+
+    Notes
+    -----
+    This decorator compiles the function with predefined signatures for
+    common numeric types (float64, uint64, int64) and enables nogil mode
+    for better performance in multithreaded environments.
+    """
     return staticmethod(nb.njit(_SCALAR_SIGNATURES, nogil=True)(func))
 
 
@@ -280,6 +318,26 @@ def _nb_reduce_arr_list_in_parallel(
 def reduction_return_type_and_empty_result_for_op_and_type(
     dtype, op: Literal["count", "min", "max", "sum", "sum_square", "mean"]
 ):
+    """
+    Determine the return dtype and empty result value for a reduction operation.
+
+    Parameters
+    ----------
+    dtype : np.dtype
+        Input array dtype
+    op : {"count", "min", "max", "sum", "sum_square", "mean"}
+        Reduction operation to perform
+
+    Returns
+    -------
+    tuple
+        (return_dtype, empty_result_value) for the given operation and input dtype
+
+    Notes
+    -----
+    This function defines the type promotion rules and empty result values
+    for various reduction operations on different input dtypes.
+    """
     if op == "count":
         return np.int64, 0
     elif op in ("min", "max"):
@@ -311,6 +369,28 @@ def reduction_return_type_and_empty_result_for_op_and_type(
 
 
 def _nullify_below_mincount(result, count, min_count):
+    """
+    Set result elements to null where count is below minimum threshold.
+
+    Parameters
+    ----------
+    result : np.ndarray
+        Result array to modify
+    count : np.ndarray
+        Count of valid values for each result element
+    min_count : int
+        Minimum number of non-null values required
+
+    Returns
+    -------
+    np.ndarray
+        Modified result array with nullified values
+
+    Notes
+    -----
+    For integer dtypes (signed or unsigned), uses MIN_INT as null value.
+    For all other dtypes, uses np.nan as null value.
+    """
     if result.dtype.kind in "ui":
         null = MIN_INT
     else:
@@ -340,6 +420,62 @@ def _chunk_arr_into_arr_list(
     axis: Optional[int],
     mask: Optional[np.ndarray] = None,
 ) -> NumbaList:
+    """
+    Split arrays into chunks for parallel processing in reduction operations.
+
+    Parameters
+    ----------
+    values : np.ndarray
+        Input array to be chunked. Must be 1D or 2D.
+    multi_threading : bool
+        If True, split array into multiple chunks for parallel processing.
+        If False, return single chunk (no parallelization).
+    axis : int or None
+        Reduction axis. For 2D arrays:
+        - axis=0: transpose array so reduction operates along columns
+        - axis=1: keep array as-is, reduction operates along rows
+        - axis=None: flatten to 1D for scalar reduction
+    mask : np.ndarray, optional
+        Boolean mask indicating null values. If provided, will be split
+        consistently with values array.
+
+    Returns
+    -------
+    tuple
+        - arr_list : NumbaList
+            List of array chunks ready for parallel processing
+        - mask_list : NumbaList
+            List of corresponding mask chunks (empty if mask=None)
+        - final_length : int
+            Length of the final reduction dimension. 0 for 1D arrays,
+            number of columns/rows for 2D arrays.
+
+    Notes
+    -----
+    Thread count is determined automatically based on array size when
+    multi_threading=True, with a maximum of 6 threads and minimum of 1.
+    Arrays smaller than 1 million elements use single threading.
+
+    For 1D arrays, the array is split into n_threads chunks along axis 0.
+    For 2D arrays, the array is either transposed (axis=0) or used as-is
+    (axis=1) to prepare for row-wise or column-wise reductions.
+
+    Raises
+    ------
+    ValueError
+        If input array has more than 2 dimensions.
+
+    Examples
+    --------
+    >>> arr = np.array([[1, 2, 3], [4, 5, 6]])
+    >>> arr_list, mask_list, final_length = _chunk_arr_into_arr_list(
+    ...     arr, multi_threading=False, axis=0
+    ... )
+    >>> final_length
+    3
+    >>> len(arr_list)  # transposed, so 3 columns become 3 arrays
+    3
+    """
     ndim = values.ndim
     if multi_threading:
         # TODO: be smarter about this choice. numba is handling the distribution of the compute
@@ -560,6 +696,27 @@ def nb_reduce(
 
 
 def _cast_to_timelike(arr, to_dtype):
+    """
+    Convert a float array to timelike (datetime/timedelta) dtype.
+
+    Parameters
+    ----------
+    arr : np.ndarray
+        Float array to convert
+    to_dtype : np.dtype
+        Target datetime or timedelta dtype
+
+    Returns
+    -------
+    np.ndarray
+        Array converted to timelike dtype with NaN values replaced by MIN_INT
+
+    Notes
+    -----
+    Converts float arrays back to timelike dtypes after reduction operations.
+    NaN values are overwritten in place with MIN_INT (mutating ``arr``) before
+    conversion to preserve null representation in integer-based time types.
+    """
     isnan = np.isnan(arr)
     if isnan.any():
         arr[isnan] = MIN_INT