BUG: handle overflow in group_sum (#63132)

Alvaro-Kothe · web-flow · commit 19ad2ecc2d52 · 2025-11-25T11:00:18.000-08:00
diff --git a/.gitignore b/.gitignore
@@ -21,6 +21,7 @@
 .noseids
 .ipynb_checkpoints
 .tags
+tags
 .cache/
 .vscode/
 
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -1243,6 +1243,7 @@ Groupby/resample/rolling
 - Bug in :meth:`.DataFrameGroupBy.groups` and :meth:`.SeriesGroupby.groups` that would not respect groupby argument ``dropna`` (:issue:`55919`)
 - Bug in :meth:`.DataFrameGroupBy.median` where nat values gave an incorrect result. (:issue:`57926`)
 - Bug in :meth:`.DataFrameGroupBy.quantile` when ``interpolation="nearest"`` is inconsistent with :meth:`DataFrame.quantile` (:issue:`47942`)
+- Bug in :meth:`.DataFrameGroupBy.sum` and :meth:`.SeriesGroupBy.sum` returning ``NaN`` on overflow. These methods now return ``inf`` or ``-inf`` on overflow. (:issue:`60303`)
 - Bug in :meth:`.DataFrameGroupBy` reductions where non-Boolean values were allowed for the ``numeric_only`` argument; passing a non-Boolean value will now raise (:issue:`62778`)
 - Bug in :meth:`.Resampler.interpolate` on a :class:`DataFrame` with non-uniform sampling and/or indices not aligning with the resulting resampled index would result in wrong interpolation (:issue:`21351`)
 - Bug in :meth:`.Series.rolling` when used with a :class:`.BaseIndexer` subclass and computing min/max (:issue:`46726`)
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
@@ -5,6 +5,7 @@ from cython cimport (
 )
 from libc.math cimport (
     NAN,
+    isfinite,
     sqrt,
 )
 from libc.stdlib cimport (
@@ -778,9 +779,9 @@ def group_sum(
                 if not isna_entry:
                     nobs[lab, j] += 1
 
-                    if sum_t is object:
+                    if sum_t is object or sum_t is int64_t or sum_t is uint64_t:
                         # NB: this does not use 'compensation' like the non-object
-                        #  track does.
+                        #  and non-integer track does.
                         if nobs[lab, j] == 1:
                             # i.e. we haven't added anything yet; avoid TypeError
                             #  if e.g. val is a str and sumx[lab, j] is 0
@@ -793,13 +794,29 @@ def group_sum(
                         y = val - compensation[lab, j]
                         t = sumx[lab, j] + y
                         compensation[lab, j] = t - sumx[lab, j] - y
-                        if compensation[lab, j] != compensation[lab, j]:
-                            # GH#53606
+
+                        # Handle float overflow
+                        if (
+                            sum_t is float32_t or sum_t is float64_t
+                        ) and not isfinite(compensation[lab, j]):
+                            # GH#53606; GH#60303
                             # If val is +/- infinity compensation is NaN
                             # which would lead to results being NaN instead
                             # of +/- infinity. We cannot use util.is_nan
                             # because of no gil
                             compensation[lab, j] = 0
+
+                        # Handle complex overflow
+                        if (
+                            sum_t is complex64_t or sum_t is complex128_t
+                        ) and not isfinite(compensation[lab, j].real):
+                            compensation[lab, j].real = 0
+
+                        if (
+                            sum_t is complex64_t or sum_t is complex128_t
+                        ) and not isfinite(compensation[lab, j].imag):
+                            compensation[lab, j].imag = 0
+
                         sumx[lab, j] = t
                 elif not skipna:
                     if uses_mask:
diff --git a/pandas/tests/groupby/test_libgroupby.py b/pandas/tests/groupby/test_libgroupby.py
@@ -329,3 +329,48 @@ def test_cython_group_sum_Inf_at_beginning_and_end(values, out):
         actual,
         expected,
     )
+
+
+@pytest.mark.parametrize(
+    "values, expected_values",
+    [
+        (np.finfo(np.float64).max, [[np.inf]]),
+        (np.finfo(np.float64).min, [[-np.inf]]),
+        (
+            np.complex128(np.finfo(np.float64).min + np.finfo(np.float64).max * 1j),
+            [[complex(-np.inf, np.inf)]],
+        ),
+        (
+            np.complex128(np.finfo(np.float64).max + np.finfo(np.float64).min * 1j),
+            [[complex(np.inf, -np.inf)]],
+        ),
+        (
+            np.complex128(np.finfo(np.float64).max + np.finfo(np.float64).max * 1j),
+            [[complex(np.inf, np.inf)]],
+        ),
+        (
+            np.complex128(np.finfo(np.float64).min + np.finfo(np.float64).min * 1j),
+            [[complex(-np.inf, -np.inf)]],
+        ),
+        (
+            np.complex128(3.0 + np.finfo(np.float64).min * 1j),
+            [[complex(9.0, -np.inf)]],
+        ),
+        (
+            np.complex128(np.finfo(np.float64).max + 3 * 1j),
+            [[complex(np.inf, 9.0)]],
+        ),
+    ],
+)
+def test_cython_group_sum_overflow(values, expected_values):
+    # GH-60303
+    data = np.array([[values] for _ in range(3)])
+    labels = np.array([0, 0, 0], dtype=np.intp)
+    counts = np.array([0], dtype="int64")
+
+    expected = np.array(expected_values, dtype=values.dtype)
+    actual = np.zeros_like(expected)
+
+    group_sum(actual, counts, data, labels, None, is_datetimelike=False)
+
+    tm.assert_numpy_array_equal(actual, expected)