Skip to content
Closed
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/reference/testing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ Exceptions and warnings
errors.MergeError
errors.NoBufferPresent
errors.NullFrequencyError
errors.NullKeyWarning
errors.NumbaUtilError
errors.NumExprClobberingError
errors.OptionError
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -9148,7 +9148,7 @@ def groupby(
sort: bool = True,
group_keys: bool = True,
observed: bool = True,
dropna: bool = True,
dropna: bool | lib.NoDefault = lib.no_default,
) -> DataFrameGroupBy:
from pandas.core.groupby.generic import DataFrameGroupBy

Expand Down
17 changes: 12 additions & 5 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -486,6 +486,12 @@ def __repr__(self) -> str:
# TODO: Better repr for GroupBy object
return object.__repr__(self)

@property
def dropna(self) -> bool:
    """
    Whether null group keys are dropped from the result.

    When ``dropna`` was not specified by the caller (the stored value is
    ``lib.no_default``), this resolves to ``True``.
    """
    return True if self._dropna is lib.no_default else self._dropna
Comment on lines +489 to +493
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I know the implementation is trivial, but this is redundant with Grouper. I'm not sure we can get around it while still being a class property, but should the default value be referenced as a constant defined just once?


@final
@property
def groups(self) -> dict[Hashable, Index]:
Expand Down Expand Up @@ -1053,7 +1059,7 @@ def __init__(
sort: bool = True,
group_keys: bool = True,
observed: bool = False,
dropna: bool = True,
dropna: bool | lib.NoDefault = lib.no_default,
) -> None:
self._selection = selection

Expand All @@ -1064,7 +1070,7 @@ def __init__(
self.keys = keys
self.sort = sort
self.group_keys = group_keys
self.dropna = dropna
self._dropna = dropna

if grouper is None:
grouper, exclusions, obj = get_grouper(
Expand All @@ -1073,7 +1079,7 @@ def __init__(
level=level,
sort=sort,
observed=observed,
dropna=self.dropna,
dropna=self._dropna,
)

self.observed = observed
Expand Down Expand Up @@ -2664,7 +2670,8 @@ def _value_counts(
groupings,
sort=False,
observed=self.observed,
dropna=self.dropna,
# TODO: Should we pass through lib.no_default?
dropna=self._dropna,
)
result_series = cast(Series, gb.size())
result_series.name = name
Expand Down Expand Up @@ -2695,7 +2702,7 @@ def _value_counts(
indexed_group_size = result_series.groupby(
result_series.index.droplevel(levels),
sort=self.sort,
dropna=self.dropna,
dropna=self._dropna,
# GH#43999 - deprecation of observed=False
observed=False,
).transform("sum")
Expand Down
64 changes: 49 additions & 15 deletions pandas/core/groupby/grouper.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,18 @@
TYPE_CHECKING,
final,
)
import warnings

import numpy as np

from pandas._libs import lib
from pandas._libs.tslibs import OutOfBoundsDatetime
from pandas.errors import InvalidIndexError
from pandas.errors import (
InvalidIndexError,
NullKeyWarning,
)
from pandas.util._decorators import cache_readonly
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.common import (
is_list_like,
Expand Down Expand Up @@ -55,6 +61,13 @@
from pandas.core.generic import NDFrame


# Shared text for the NullKeyWarning emitted when null group keys are found and
# the caller did not pass ``dropna`` explicitly. ``dropna=True`` drops null keys
# (the current default); ``dropna=False`` keeps them.
_NULL_KEY_MESSAGE = (
    "`dropna` is not specified but grouper encountered null group keys. These keys "
    "will be dropped from the result by default. To keep null keys, set "
    "`dropna=False`, or to hide this warning and drop null keys, set `dropna=True`."
)
Comment on lines +66 to +70
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this a standard approach for a warning message that could be hit from two lines of code?



class Grouper:
"""
A Grouper allows the user to specify a groupby instruction for an object.
Expand Down Expand Up @@ -246,7 +259,7 @@ class Grouper:
"""

sort: bool
dropna: bool
dropna: bool | lib.NoDefault
_grouper: Index | None

_attributes: tuple[str, ...] = ("key", "level", "freq", "sort", "dropna")
Expand All @@ -264,7 +277,7 @@ def __init__(
level=None,
freq=None,
sort: bool = False,
dropna: bool = True,
dropna: bool | lib.NoDefault = lib.no_default,
) -> None:
self.key = key
self.level = level
Expand Down Expand Up @@ -442,7 +455,7 @@ def __init__(
sort: bool = True,
observed: bool = False,
in_axis: bool = False,
dropna: bool = True,
dropna: bool | lib.NoDefault = lib.no_default,
uniques: ArrayLike | None = None,
) -> None:
self.level = level
Expand Down Expand Up @@ -599,6 +612,12 @@ def codes(self) -> npt.NDArray[np.signedinteger]:
def uniques(self) -> ArrayLike:
return self._codes_and_uniques[1]

@property
def dropna(self) -> bool:
    """
    Effective ``dropna`` setting for this grouping.

    The stored ``_dropna`` may be the ``lib.no_default`` sentinel, meaning the
    caller did not specify a value; in that case ``True`` is reported.
    """
    unspecified = self._dropna is lib.no_default
    return True if unspecified else self._dropna

@cache_readonly
def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
uniques: ArrayLike
Expand All @@ -617,11 +636,11 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
else:
ucodes = np.arange(len(categories))

has_dropped_na = False
if not self._dropna:
na_mask = cat.isna()
if np.any(na_mask):
has_dropped_na = True
has_na_values = False
na_mask = cat.isna()
if np.any(na_mask):
has_na_values = True
if not self.dropna:
if self._sort:
# NA goes at the end, gets `largest non-NA code + 1`
na_code = len(categories)
Expand All @@ -637,11 +656,18 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
)
codes = cat.codes

if has_dropped_na:
if not self._sort:
# NA code is based on first appearance, increment higher codes
codes = np.where(codes >= na_code, codes + 1, codes)
codes = np.where(na_mask, na_code, codes)
if has_na_values:
if not self.dropna:
if not self._sort:
# NA code is based on first appearance, increment higher codes
codes = np.where(codes >= na_code, codes + 1, codes)
codes = np.where(na_mask, na_code, codes)
elif self._dropna is lib.no_default:
warnings.warn(
_NULL_KEY_MESSAGE,
NullKeyWarning,
stacklevel=find_stack_level(),
)

return codes, uniques

Expand All @@ -660,8 +686,16 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
# error: Incompatible types in assignment (expression has type "Union[
# ndarray[Any, Any], Index]", variable has type "Categorical")
codes, uniques = algorithms.factorize( # type: ignore[assignment]
self.grouping_vector, sort=self._sort, use_na_sentinel=self._dropna
self.grouping_vector, sort=self._sort, use_na_sentinel=self.dropna
)
# TODO: Is `min(codes)` or `-1 in codes` faster?
if self._dropna is lib.no_default and (codes == -1).any():
warnings.warn(
_NULL_KEY_MESSAGE,
NullKeyWarning,
stacklevel=find_stack_level(),
)

return codes, uniques

@cache_readonly
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1972,7 +1972,7 @@ def groupby(
sort: bool = True,
group_keys: bool = True,
observed: bool = False,
dropna: bool = True,
dropna: bool | lib.NoDefault = lib.no_default,
) -> SeriesGroupBy:
from pandas.core.groupby.generic import SeriesGroupBy

Expand Down
24 changes: 24 additions & 0 deletions pandas/errors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -878,6 +878,29 @@ class CategoricalConversionWarning(Warning):
"""


class NullKeyWarning(Warning):
    """
    Warning emitted when grouping on null/NA keys with the default ``dropna``.

    This warning helps ensure data integrity and alerts users to potential issues
    during grouping/aggregating when the default value of ``dropna`` would lead to
    null keys being dropped from the output.

    The warning is only emitted when ``dropna`` is left unspecified; passing
    ``dropna=True`` or ``dropna=False`` explicitly silences it.

    For more information, see the discussion of PDEP-11 (GH#53094).

    See Also
    --------
    DataFrame.groupby : Group DataFrame using a mapper or by a Series of columns.
    DataFrame.pivot_table : Create a spreadsheet-style pivot table as a DataFrame.

    Examples
    --------
    >>> df = pd.DataFrame({"A": ["a", None], "B": [1, 2]})
    >>> df.groupby(["A"]).sum()  # doctest: +SKIP
    ... # NullKeyWarning: ...
    """


class LossySetitemError(Exception):
"""
Raised when trying to do a __setitem__ on an np.ndarray that is not lossless.
Expand Down Expand Up @@ -927,6 +950,7 @@ class InvalidComparison(Exception):
"MergeError",
"NoBufferPresent",
"NullFrequencyError",
"NullKeyWarning",
"NumExprClobberingError",
"NumbaUtilError",
"OptionError",
Expand Down
65 changes: 65 additions & 0 deletions pandas/tests/groupby/test_groupby_dropna.py
Original file line number Diff line number Diff line change
Expand Up @@ -384,6 +384,71 @@ def test_groupby_nan_included():
assert list(result.keys())[0:2] == ["g1", "g2"]


@pytest.mark.parametrize(
    "by",
    [
        pytest.param("group", id="column"),
        pytest.param(pd.Series(["g1", np.nan, "g1", "g2", np.nan]), id="Series"),
        pytest.param(
            pd.Series(["g1", np.nan, "g1", "g2", np.nan]).astype("category"),
            id="Categorical",
        ),
        pytest.param("_index", id="index"),
        pytest.param(["group", "group2"], id="multikey"),
    ],
)
@pytest.mark.parametrize("dropna", [True, False, None])
def test_groupby_nan_included_warns(by, dropna):
    """
    Null group keys warn only when ``dropna`` is left unspecified.

    ``dropna=None`` in the parametrization means "do not pass ``dropna``".
    """
    # GH 61339
    frame = pd.DataFrame(
        {
            "group": ["g1", np.nan, "g1", "g2", np.nan],
            "group2": ["g1", "g2", np.nan, "g2", np.nan],
            "B": [0, 1, 2, 3, 4],
        }
    )
    # The isinstance guard avoids comparing a Series/list ``by`` against a string.
    if isinstance(by, str) and by == "_index":
        # "_index" is a placeholder: move "group" into the index and group on it
        frame = frame.set_index("group")
        by = "group"

    dropna_given = dropna is not None
    groupby_kwargs = {"dropna": dropna} if dropna_given else {}
    expected_warning = None if dropna_given else pd.errors.NullKeyWarning

    with tm.assert_produces_warning(expected_warning):
        grouped = frame.groupby(by, **groupby_kwargs)
        # Accessing ``indices`` is what makes the grouper compute its codes
        result = grouped.indices  # noqa:F841


@pytest.mark.parametrize("by_type", ["level", "argument"])
@pytest.mark.parametrize("dropna", [True, False, None])
def test_groupby_series_nan_included_warns(by_type, dropna):
    """
    Series.groupby warns on null keys only when ``dropna`` is not passed.

    The null key is supplied either through the index (``level=0``) or as an
    explicit ``by`` list. ``dropna=None`` means "do not pass ``dropna``".
    """
    # GH 61339
    keys = ["a", "a", "b", np.nan]
    ser = pd.Series([1, 2, 3, 3])

    if by_type == "level":
        ser = ser.set_axis(keys, axis=0)
        groupby_kwargs = {"level": 0}
    elif by_type == "argument":
        groupby_kwargs = {"by": keys}

    if dropna is None:
        expected_warning = pd.errors.NullKeyWarning
    else:
        groupby_kwargs["dropna"] = dropna
        expected_warning = None

    with tm.assert_produces_warning(expected_warning):
        ser.groupby(**groupby_kwargs).sum()


def test_groupby_drop_nan_with_multi_index():
# GH 39895
df = pd.DataFrame([[np.nan, 0, 1]], columns=["a", "b", "c"])
Expand Down
6 changes: 4 additions & 2 deletions pandas/tests/groupby/test_missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,9 @@ def test_min_count(func, min_count, value):
def test_indices_with_missing():
# GH 9304
df = DataFrame({"a": [1, 1, np.nan], "b": [2, 3, 4], "c": [5, 6, 7]})
g = df.groupby(["a", "b"])
result = g.indices
# GH 61339
with tm.assert_produces_warning(pd.errors.NullKeyWarning):
g = df.groupby(["a", "b"])
result = g.indices
expected = {(1.0, 2): np.array([0]), (1.0, 3): np.array([1])}
assert result == expected
Loading