Skip to content

Commit 19ad2ec

Browse files
authored
BUG: handle overflow in group_sum (#63132)
1 parent a7e0900 commit 19ad2ec

File tree

4 files changed

+68
-4
lines changed

4 files changed

+68
-4
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
.noseids
2222
.ipynb_checkpoints
2323
.tags
24+
tags
2425
.cache/
2526
.vscode/
2627

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1243,6 +1243,7 @@ Groupby/resample/rolling
12431243
- Bug in :meth:`.DataFrameGroupBy.groups` and :meth:`.SeriesGroupBy.groups` that would not respect groupby argument ``dropna`` (:issue:`55919`)
12441244
- Bug in :meth:`.DataFrameGroupBy.median` where nat values gave an incorrect result. (:issue:`57926`)
12451245
- Bug in :meth:`.DataFrameGroupBy.quantile` when ``interpolation="nearest"`` is inconsistent with :meth:`DataFrame.quantile` (:issue:`47942`)
1246+
- Bug in :meth:`.DataFrameGroupBy.sum` and :meth:`.SeriesGroupBy.sum` returning ``NaN`` on overflow. These methods now return ``inf`` or ``-inf`` on overflow. (:issue:`60303`)
12461247
- Bug in :meth:`.DataFrameGroupBy` reductions where non-Boolean values were allowed for the ``numeric_only`` argument; passing a non-Boolean value will now raise (:issue:`62778`)
12471248
- Bug in :meth:`.Resampler.interpolate` on a :class:`DataFrame` with non-uniform sampling and/or indices not aligning with the resulting resampled index would result in wrong interpolation (:issue:`21351`)
12481249
- Bug in :meth:`.Series.rolling` when used with a :class:`.BaseIndexer` subclass and computing min/max (:issue:`46726`)

pandas/_libs/groupby.pyx

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ from cython cimport (
55
)
66
from libc.math cimport (
77
NAN,
8+
isfinite,
89
sqrt,
910
)
1011
from libc.stdlib cimport (
@@ -778,9 +779,9 @@ def group_sum(
778779
if not isna_entry:
779780
nobs[lab, j] += 1
780781

781-
if sum_t is object:
782+
if sum_t is object or sum_t is int64_t or sum_t is uint64_t:
782783
# NB: this does not use 'compensation' like the non-object
783-
# track does.
784+
# and non-integer track does.
784785
if nobs[lab, j] == 1:
785786
# i.e. we haven't added anything yet; avoid TypeError
786787
# if e.g. val is a str and sumx[lab, j] is 0
@@ -793,13 +794,29 @@ def group_sum(
793794
y = val - compensation[lab, j]
794795
t = sumx[lab, j] + y
795796
compensation[lab, j] = t - sumx[lab, j] - y
796-
if compensation[lab, j] != compensation[lab, j]:
797-
# GH#53606
797+
798+
# Handle float overflow
799+
if (
800+
sum_t is float32_t or sum_t is float64_t
801+
) and not isfinite(compensation[lab, j]):
802+
# GH#53606; GH#60303
798803
# If val is +/- infinity or the sum overflows, compensation is NaN or +/- infinity
799804
# which would lead to results being NaN instead
800805
# of +/- infinity. We cannot use util.is_nan
801806
# because of no gil
802807
compensation[lab, j] = 0
808+
809+
# Handle complex overflow
810+
if (
811+
sum_t is complex64_t or sum_t is complex128_t
812+
) and not isfinite(compensation[lab, j].real):
813+
compensation[lab, j].real = 0
814+
815+
if (
816+
sum_t is complex64_t or sum_t is complex128_t
817+
) and not isfinite(compensation[lab, j].imag):
818+
compensation[lab, j].imag = 0
819+
803820
sumx[lab, j] = t
804821
elif not skipna:
805822
if uses_mask:

pandas/tests/groupby/test_libgroupby.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -329,3 +329,48 @@ def test_cython_group_sum_Inf_at_beginning_and_end(values, out):
329329
actual,
330330
expected,
331331
)
332+
333+
334+
@pytest.mark.parametrize(
335+
"values, expected_values",
336+
[
337+
(np.finfo(np.float64).max, [[np.inf]]),
338+
(np.finfo(np.float64).min, [[-np.inf]]),
339+
(
340+
np.complex128(np.finfo(np.float64).min + np.finfo(np.float64).max * 1j),
341+
[[complex(-np.inf, np.inf)]],
342+
),
343+
(
344+
np.complex128(np.finfo(np.float64).max + np.finfo(np.float64).min * 1j),
345+
[[complex(np.inf, -np.inf)]],
346+
),
347+
(
348+
np.complex128(np.finfo(np.float64).max + np.finfo(np.float64).max * 1j),
349+
[[complex(np.inf, np.inf)]],
350+
),
351+
(
352+
np.complex128(np.finfo(np.float64).min + np.finfo(np.float64).min * 1j),
353+
[[complex(-np.inf, -np.inf)]],
354+
),
355+
(
356+
np.complex128(3.0 + np.finfo(np.float64).min * 1j),
357+
[[complex(9.0, -np.inf)]],
358+
),
359+
(
360+
np.complex128(np.finfo(np.float64).max + 3 * 1j),
361+
[[complex(np.inf, 9.0)]],
362+
),
363+
],
364+
)
365+
def test_cython_group_sum_overflow(values, expected_values):
366+
# GH-60303
367+
data = np.array([[values] for _ in range(3)])
368+
labels = np.array([0, 0, 0], dtype=np.intp)
369+
counts = np.array([0], dtype="int64")
370+
371+
expected = np.array(expected_values, dtype=values.dtype)
372+
actual = np.zeros_like(expected)
373+
374+
group_sum(actual, counts, data, labels, None, is_datetimelike=False)
375+
376+
tm.assert_numpy_array_equal(actual, expected)

0 commit comments

Comments
 (0)