Skip to content

Commit 47cabb2

Browse files
committed
implement dropna null key warning
1 parent d1c5053 commit 47cabb2

File tree

3 files changed

+62
-21
lines changed

3 files changed

+62
-21
lines changed

pandas/core/frame.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9148,7 +9148,7 @@ def groupby(
91489148
sort: bool = True,
91499149
group_keys: bool = True,
91509150
observed: bool = True,
9151-
dropna: bool = True,
9151+
dropna: bool | lib.NoDefault = lib.no_default,
91529152
) -> DataFrameGroupBy:
91539153
from pandas.core.groupby.generic import DataFrameGroupBy
91549154

pandas/core/groupby/groupby.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -486,6 +486,12 @@ def __repr__(self) -> str:
486486
# TODO: Better repr for GroupBy object
487487
return object.__repr__(self)
488488

489+
# Effective `dropna` setting for this groupby object.
# `self._dropna` may hold the `lib.no_default` sentinel (the constructor's
# default when the caller did not pass `dropna`); that case is reported as
# True here. Any other stored value is returned unchanged.
@property
490+
def dropna(self) -> bool:
491+
if self._dropna is lib.no_default:
492+
return True
493+
return self._dropna
494+
489495
@final
490496
@property
491497
def groups(self) -> dict[Hashable, Index]:
@@ -1053,7 +1059,7 @@ def __init__(
10531059
sort: bool = True,
10541060
group_keys: bool = True,
10551061
observed: bool = False,
1056-
dropna: bool = True,
1062+
dropna: bool | lib.NoDefault = lib.no_default,
10571063
) -> None:
10581064
self._selection = selection
10591065

@@ -1064,7 +1070,7 @@ def __init__(
10641070
self.keys = keys
10651071
self.sort = sort
10661072
self.group_keys = group_keys
1067-
self.dropna = dropna
1073+
self._dropna = dropna
10681074

10691075
if grouper is None:
10701076
grouper, exclusions, obj = get_grouper(
@@ -1073,7 +1079,7 @@ def __init__(
10731079
level=level,
10741080
sort=sort,
10751081
observed=observed,
1076-
dropna=self.dropna,
1082+
dropna=self._dropna,
10771083
)
10781084

10791085
self.observed = observed
@@ -2664,7 +2670,8 @@ def _value_counts(
26642670
groupings,
26652671
sort=False,
26662672
observed=self.observed,
2667-
dropna=self.dropna,
2673+
# TODO: Should we pass through lib.no_default?
2674+
dropna=self._dropna,
26682675
)
26692676
result_series = cast(Series, gb.size())
26702677
result_series.name = name
@@ -2695,7 +2702,7 @@ def _value_counts(
26952702
indexed_group_size = result_series.groupby(
26962703
result_series.index.droplevel(levels),
26972704
sort=self.sort,
2698-
dropna=self.dropna,
2705+
dropna=self._dropna,
26992706
# GH#43999 - deprecation of observed=False
27002707
observed=False,
27012708
).transform("sum")

pandas/core/groupby/grouper.py

Lines changed: 49 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,18 @@
99
TYPE_CHECKING,
1010
final,
1111
)
12+
import warnings
1213

1314
import numpy as np
1415

16+
from pandas._libs import lib
1517
from pandas._libs.tslibs import OutOfBoundsDatetime
16-
from pandas.errors import InvalidIndexError
18+
from pandas.errors import (
19+
InvalidIndexError,
20+
NullKeyWarning,
21+
)
1722
from pandas.util._decorators import cache_readonly
23+
from pandas.util._exceptions import find_stack_level
1824

1925
from pandas.core.dtypes.common import (
2026
is_list_like,
@@ -55,6 +61,13 @@
5561
from pandas.core.generic import NDFrame
5662

5763

64+
# Text of the NullKeyWarning emitted when `dropna` was left unspecified and the
# grouping keys contain nulls. An unspecified `dropna` behaves as True (null
# keys are dropped), so the advice must point users at `dropna=False` to keep
# null keys and at `dropna=True` to silence the warning while still dropping.
_NULL_KEY_MESSAGE = (
    "`dropna` is not specified but grouper encountered null group keys. These keys "
    "will be dropped from the result by default. To keep null keys, set "
    "`dropna=False`, or to hide this warning and drop null keys, set `dropna=True`."
)
69+
70+
5871
class Grouper:
5972
"""
6073
A Grouper allows the user to specify a groupby instruction for an object.
@@ -246,7 +259,7 @@ class Grouper:
246259
"""
247260

248261
sort: bool
249-
dropna: bool
262+
dropna: bool | lib.NoDefault
250263
_grouper: Index | None
251264

252265
_attributes: tuple[str, ...] = ("key", "level", "freq", "sort", "dropna")
@@ -264,7 +277,7 @@ def __init__(
264277
level=None,
265278
freq=None,
266279
sort: bool = False,
267-
dropna: bool = True,
280+
dropna: bool | lib.NoDefault = lib.no_default,
268281
) -> None:
269282
self.key = key
270283
self.level = level
@@ -442,7 +455,7 @@ def __init__(
442455
sort: bool = True,
443456
observed: bool = False,
444457
in_axis: bool = False,
445-
dropna: bool = True,
458+
dropna: bool | lib.NoDefault = lib.no_default,
446459
uniques: ArrayLike | None = None,
447460
) -> None:
448461
self.level = level
@@ -599,6 +612,12 @@ def codes(self) -> npt.NDArray[np.signedinteger]:
599612
def uniques(self) -> ArrayLike:
600613
return self._codes_and_uniques[1]
601614

615+
# Effective `dropna` setting for this grouping: True when `self._dropna` is
# the `lib.no_default` sentinel, otherwise the stored value unchanged.
# NOTE(review): `_dropna` is presumably assigned from the constructor's
# `dropna` argument (default `lib.no_default`) -- the assignment is not
# visible in this diff; confirm. Code that must tell "unspecified" apart from
# an explicit True (e.g. to decide whether to warn) checks `self._dropna`
# directly instead of this property.
@property
616+
def dropna(self) -> bool:
617+
if self._dropna is lib.no_default:
618+
return True
619+
return self._dropna
620+
602621
@cache_readonly
603622
def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
604623
uniques: ArrayLike
@@ -617,11 +636,11 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
617636
else:
618637
ucodes = np.arange(len(categories))
619638

620-
has_dropped_na = False
621-
if not self._dropna:
622-
na_mask = cat.isna()
623-
if np.any(na_mask):
624-
has_dropped_na = True
639+
has_na_values = False
640+
na_mask = cat.isna()
641+
if np.any(na_mask):
642+
has_na_values = True
643+
if not self.dropna:
625644
if self._sort:
626645
# NA goes at the end, gets `largest non-NA code + 1`
627646
na_code = len(categories)
@@ -637,11 +656,18 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
637656
)
638657
codes = cat.codes
639658

640-
if has_dropped_na:
641-
if not self._sort:
642-
# NA code is based on first appearance, increment higher codes
643-
codes = np.where(codes >= na_code, codes + 1, codes)
644-
codes = np.where(na_mask, na_code, codes)
659+
if has_na_values:
660+
if not self.dropna:
661+
if not self._sort:
662+
# NA code is based on first appearance, increment higher codes
663+
codes = np.where(codes >= na_code, codes + 1, codes)
664+
codes = np.where(na_mask, na_code, codes)
665+
elif self._dropna is lib.no_default:
666+
warnings.warn(
667+
_NULL_KEY_MESSAGE,
668+
NullKeyWarning,
669+
stacklevel=find_stack_level(),
670+
)
645671

646672
return codes, uniques
647673

@@ -660,8 +686,16 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
660686
# error: Incompatible types in assignment (expression has type "Union[
661687
# ndarray[Any, Any], Index]", variable has type "Categorical")
662688
codes, uniques = algorithms.factorize( # type: ignore[assignment]
663-
self.grouping_vector, sort=self._sort, use_na_sentinel=self._dropna
689+
self.grouping_vector, sort=self._sort, use_na_sentinel=self.dropna
664690
)
691+
# TODO: Is `min(codes)` or `-1 in codes` faster?
692+
if self._dropna is lib.no_default and (codes == -1).any():
693+
warnings.warn(
694+
_NULL_KEY_MESSAGE,
695+
NullKeyWarning,
696+
stacklevel=find_stack_level(),
697+
)
698+
665699
return codes, uniques
666700

667701
@cache_readonly

0 commit comments

Comments
 (0)