implement dropna null key warning

tehunter · tehunter · commit 47cabb285ea5 · 2025-04-24T20:39:28.000Z
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -9148,7 +9148,7 @@ def groupby(
         sort: bool = True,
         group_keys: bool = True,
         observed: bool = True,
-        dropna: bool = True,
+        dropna: bool | lib.NoDefault = lib.no_default,
     ) -> DataFrameGroupBy:
         from pandas.core.groupby.generic import DataFrameGroupBy
 
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -486,6 +486,12 @@ def __repr__(self) -> str:
         # TODO: Better repr for GroupBy object
         return object.__repr__(self)
 
+    @property
+    def dropna(self) -> bool:
+        if self._dropna is lib.no_default:
+            return True
+        return self._dropna
+
     @final
     @property
     def groups(self) -> dict[Hashable, Index]:
@@ -1053,7 +1059,7 @@ def __init__(
         sort: bool = True,
         group_keys: bool = True,
         observed: bool = False,
-        dropna: bool = True,
+        dropna: bool | lib.NoDefault = lib.no_default,
     ) -> None:
         self._selection = selection
 
@@ -1064,7 +1070,7 @@ def __init__(
         self.keys = keys
         self.sort = sort
         self.group_keys = group_keys
-        self.dropna = dropna
+        self._dropna = dropna
 
         if grouper is None:
             grouper, exclusions, obj = get_grouper(
@@ -1073,7 +1079,7 @@ def __init__(
                 level=level,
                 sort=sort,
                 observed=observed,
-                dropna=self.dropna,
+                dropna=self._dropna,
             )
 
         self.observed = observed
@@ -2664,7 +2670,8 @@ def _value_counts(
             groupings,
             sort=False,
             observed=self.observed,
-            dropna=self.dropna,
+            # TODO: Should we pass through lib.no_default?
+            dropna=self._dropna,
         )
         result_series = cast(Series, gb.size())
         result_series.name = name
@@ -2695,7 +2702,7 @@ def _value_counts(
             indexed_group_size = result_series.groupby(
                 result_series.index.droplevel(levels),
                 sort=self.sort,
-                dropna=self.dropna,
+                dropna=self._dropna,
                 # GH#43999 - deprecation of observed=False
                 observed=False,
             ).transform("sum")
diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py
@@ -9,12 +9,18 @@
     TYPE_CHECKING,
     final,
 )
+import warnings
 
 import numpy as np
 
+from pandas._libs import lib
 from pandas._libs.tslibs import OutOfBoundsDatetime
-from pandas.errors import InvalidIndexError
+from pandas.errors import (
+    InvalidIndexError,
+    NullKeyWarning,
+)
 from pandas.util._decorators import cache_readonly
+from pandas.util._exceptions import find_stack_level
 
 from pandas.core.dtypes.common import (
     is_list_like,
@@ -55,6 +61,13 @@
     from pandas.core.generic import NDFrame
 
 
+_NULL_KEY_MESSAGE = (
+    "`dropna` is not specified but grouper encountered null group keys. These keys "
+    "will be dropped from the result by default. To keep null keys, set `dropna=True`, "
+    "or to hide this warning and drop null keys, set `dropna=False`."
+)
+
+
 class Grouper:
     """
     A Grouper allows the user to specify a groupby instruction for an object.
@@ -246,7 +259,7 @@ class Grouper:
     """
 
     sort: bool
-    dropna: bool
+    dropna: bool | lib.NoDefault
     _grouper: Index | None
 
     _attributes: tuple[str, ...] = ("key", "level", "freq", "sort", "dropna")
@@ -264,7 +277,7 @@ def __init__(
         level=None,
         freq=None,
         sort: bool = False,
-        dropna: bool = True,
+        dropna: bool | lib.NoDefault = lib.no_default,
     ) -> None:
         self.key = key
         self.level = level
@@ -442,7 +455,7 @@ def __init__(
         sort: bool = True,
         observed: bool = False,
         in_axis: bool = False,
-        dropna: bool = True,
+        dropna: bool | lib.NoDefault = lib.no_default,
         uniques: ArrayLike | None = None,
     ) -> None:
         self.level = level
@@ -599,6 +612,12 @@ def codes(self) -> npt.NDArray[np.signedinteger]:
     def uniques(self) -> ArrayLike:
         return self._codes_and_uniques[1]
 
+    @property
+    def dropna(self) -> bool:
+        if self._dropna is lib.no_default:
+            return True
+        return self._dropna
+
     @cache_readonly
     def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
         uniques: ArrayLike
@@ -617,11 +636,11 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
             else:
                 ucodes = np.arange(len(categories))
 
-            has_dropped_na = False
-            if not self._dropna:
-                na_mask = cat.isna()
-                if np.any(na_mask):
-                    has_dropped_na = True
+            has_na_values = False
+            na_mask = cat.isna()
+            if np.any(na_mask):
+                has_na_values = True
+                if not self.dropna:
                     if self._sort:
                         # NA goes at the end, gets `largest non-NA code + 1`
                         na_code = len(categories)
@@ -637,11 +656,18 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
             )
             codes = cat.codes
 
-            if has_dropped_na:
-                if not self._sort:
-                    # NA code is based on first appearance, increment higher codes
-                    codes = np.where(codes >= na_code, codes + 1, codes)
-                codes = np.where(na_mask, na_code, codes)
+            if has_na_values:
+                if not self.dropna:
+                    if not self._sort:
+                        # NA code is based on first appearance, increment higher codes
+                        codes = np.where(codes >= na_code, codes + 1, codes)
+                    codes = np.where(na_mask, na_code, codes)
+                elif self._dropna is lib.no_default:
+                    warnings.warn(
+                        _NULL_KEY_MESSAGE,
+                        NullKeyWarning,
+                        stacklevel=find_stack_level(),
+                    )
 
             return codes, uniques
 
@@ -660,8 +686,16 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
             # error: Incompatible types in assignment (expression has type "Union[
             # ndarray[Any, Any], Index]", variable has type "Categorical")
             codes, uniques = algorithms.factorize(  # type: ignore[assignment]
-                self.grouping_vector, sort=self._sort, use_na_sentinel=self._dropna
+                self.grouping_vector, sort=self._sort, use_na_sentinel=self.dropna
             )
+            # TODO: Is `min(codes)` or `-1 in codes` faster?
+            if self._dropna is lib.no_default and (codes == -1).any():
+                warnings.warn(
+                    _NULL_KEY_MESSAGE,
+                    NullKeyWarning,
+                    stacklevel=find_stack_level(),
+                )
+
         return codes, uniques
 
     @cache_readonly