Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
.noseids
.ipynb_checkpoints
.tags
tags
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why is this needed?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

tags is an index file generated by ctags, and I use it to navigate the Cython code. I think this file should be added to the .gitignore since it's auto-generated and shouldn't go into version control.

If you don't want this file going into this PR, I can remove it and create a separate PR for it.

.cache/
.vscode/

Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1237,6 +1237,7 @@ Groupby/resample/rolling
- Bug in :meth:`.DataFrameGroupBy.groups` and :meth:`.SeriesGroupby.groups` that would not respect groupby argument ``dropna`` (:issue:`55919`)
- Bug in :meth:`.DataFrameGroupBy.median` where nat values gave an incorrect result. (:issue:`57926`)
- Bug in :meth:`.DataFrameGroupBy.quantile` when ``interpolation="nearest"`` is inconsistent with :meth:`DataFrame.quantile` (:issue:`47942`)
- Bug in :meth:`.DataFrameGroupBy.sum` and :meth:`.SeriesGroupBy.sum` returning ``NaN`` on overflow. These methods now return ``inf`` or ``-inf`` on overflow. (:issue:`60303`)
- Bug in :meth:`.DataFrameGroupBy` reductions where non-Boolean values were allowed for the ``numeric_only`` argument; passing a non-Boolean value will now raise (:issue:`62778`)
- Bug in :meth:`.Resampler.interpolate` on a :class:`DataFrame` with non-uniform sampling and/or indices not aligning with the resulting resampled index would result in wrong interpolation (:issue:`21351`)
- Bug in :meth:`.Series.rolling` when used with a :class:`.BaseIndexer` subclass and computing min/max (:issue:`46726`)
Expand Down
25 changes: 21 additions & 4 deletions pandas/_libs/groupby.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ from cython cimport (
)
from libc.math cimport (
NAN,
isfinite,
sqrt,
)
from libc.stdlib cimport (
Expand Down Expand Up @@ -778,9 +779,9 @@ def group_sum(
if not isna_entry:
nobs[lab, j] += 1

if sum_t is object:
if sum_t is object or sum_t is int64_t or sum_t is uint64_t:
# NB: this does not use 'compensation' like the non-object
# track does.
# and non-integer track does.
if nobs[lab, j] == 1:
# i.e. we haven't added anything yet; avoid TypeError
# if e.g. val is a str and sumx[lab, j] is 0
Expand All @@ -793,13 +794,29 @@ def group_sum(
y = val - compensation[lab, j]
t = sumx[lab, j] + y
compensation[lab, j] = t - sumx[lab, j] - y
if compensation[lab, j] != compensation[lab, j]:
# GH#53606

# Handle float overflow
if (
sum_t is float32_t or sum_t is float64_t
) and not isfinite(compensation[lab, j]):
# GH#53606; GH#60303
# If val is +/- infinity compensation is NaN
# which would lead to results being NaN instead
# of +/- infinity. We cannot use util.is_nan
# because of no gil
compensation[lab, j] = 0

# Handle complex overflow
if (
sum_t is complex64_t or sum_t is complex128_t
) and not isfinite(compensation[lab, j].real):
compensation[lab, j].real = 0

if (
sum_t is complex64_t or sum_t is complex128_t
) and not isfinite(compensation[lab, j].imag):
compensation[lab, j].imag = 0

sumx[lab, j] = t
elif not skipna:
if uses_mask:
Expand Down
45 changes: 45 additions & 0 deletions pandas/tests/groupby/test_libgroupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -329,3 +329,48 @@ def test_cython_group_sum_Inf_at_beginning_and_end(values, out):
actual,
expected,
)


@pytest.mark.parametrize(
    "values, expected_values",
    [
        # Real-valued overflow in either direction.
        (np.finfo(np.float64).max, [[np.inf]]),
        (np.finfo(np.float64).min, [[-np.inf]]),
        # Complex overflow in both components, every sign combination.
        (
            np.complex128(complex(np.finfo(np.float64).min, np.finfo(np.float64).max)),
            [[complex(-np.inf, np.inf)]],
        ),
        (
            np.complex128(complex(np.finfo(np.float64).max, np.finfo(np.float64).min)),
            [[complex(np.inf, -np.inf)]],
        ),
        (
            np.complex128(complex(np.finfo(np.float64).max, np.finfo(np.float64).max)),
            [[complex(np.inf, np.inf)]],
        ),
        (
            np.complex128(complex(np.finfo(np.float64).min, np.finfo(np.float64).min)),
            [[complex(-np.inf, -np.inf)]],
        ),
        # Complex overflow in one component only; the other stays finite.
        (
            np.complex128(complex(3.0, np.finfo(np.float64).min)),
            [[complex(9.0, -np.inf)]],
        ),
        (
            np.complex128(complex(np.finfo(np.float64).max, 3)),
            [[complex(np.inf, 9.0)]],
        ),
    ],
)
def test_cython_group_sum_overflow(values, expected_values):
    """
    Check that ``group_sum`` yields +/-inf rather than NaN when a sum overflows.

    The scalar ``values`` is repeated three times in a single group, so any
    component at the float64 extreme overflows while a finite component such
    as 3.0 simply sums to 9.0.
    """
    # GH-60303
    n_rows = 3
    data = np.full((n_rows, 1), values)
    labels = np.zeros(n_rows, dtype=np.intp)
    counts = np.zeros(1, dtype="int64")

    expected = np.asarray(expected_values, dtype=values.dtype)
    actual = np.zeros(expected.shape, dtype=expected.dtype)

    group_sum(actual, counts, data, labels, None, is_datetimelike=False)

    tm.assert_numpy_array_equal(actual, expected)
Loading