9
9
TYPE_CHECKING ,
10
10
final ,
11
11
)
12
+ import warnings
12
13
13
14
import numpy as np
14
15
16
+ from pandas ._libs import lib
15
17
from pandas ._libs .tslibs import OutOfBoundsDatetime
16
- from pandas .errors import InvalidIndexError
18
+ from pandas .errors import (
19
+ InvalidIndexError ,
20
+ NullKeyWarning ,
21
+ )
17
22
from pandas .util ._decorators import cache_readonly
23
+ from pandas .util ._exceptions import find_stack_level
18
24
19
25
from pandas .core .dtypes .common import (
20
26
is_list_like ,
55
61
from pandas .core .generic import NDFrame
56
62
57
63
64
# Text of the NullKeyWarning raised when `dropna` was left unspecified and the
# grouping keys contain nulls. With `dropna` unspecified the grouper behaves as
# `dropna=True` (null keys are dropped); `dropna=False` keeps them as a group.
_NULL_KEY_MESSAGE = (
    "`dropna` is not specified but grouper encountered null group keys. These keys "
    "will be dropped from the result by default. To keep null keys, set "
    "`dropna=False`, or to hide this warning and drop null keys, set "
    "`dropna=True`."
)
69
+
70
+
58
71
class Grouper :
59
72
"""
60
73
A Grouper allows the user to specify a groupby instruction for an object.
@@ -246,7 +259,7 @@ class Grouper:
246
259
"""
247
260
248
261
sort : bool
249
- dropna : bool
262
+ dropna : bool | lib . NoDefault
250
263
_grouper : Index | None
251
264
252
265
_attributes : tuple [str , ...] = ("key" , "level" , "freq" , "sort" , "dropna" )
@@ -264,7 +277,7 @@ def __init__(
264
277
level = None ,
265
278
freq = None ,
266
279
sort : bool = False ,
267
- dropna : bool = True ,
280
+ dropna : bool | lib . NoDefault = lib . no_default ,
268
281
) -> None :
269
282
self .key = key
270
283
self .level = level
@@ -442,7 +455,7 @@ def __init__(
442
455
sort : bool = True ,
443
456
observed : bool = False ,
444
457
in_axis : bool = False ,
445
- dropna : bool = True ,
458
+ dropna : bool | lib . NoDefault = lib . no_default ,
446
459
uniques : ArrayLike | None = None ,
447
460
) -> None :
448
461
self .level = level
@@ -599,6 +612,12 @@ def codes(self) -> npt.NDArray[np.signedinteger]:
599
612
def uniques (self ) -> ArrayLike :
600
613
return self ._codes_and_uniques [1 ]
601
614
615
@property
def dropna(self) -> bool:
    """
    Effective ``dropna`` setting for this grouping.

    Returns
    -------
    bool
        ``True`` when ``dropna`` was left unspecified (``lib.no_default``),
        otherwise the stored value unchanged.
    """
    configured = self._dropna
    # An unspecified `dropna` falls back to the default of dropping null keys.
    return True if configured is lib.no_default else configured
620
+
602
621
@cache_readonly
603
622
def _codes_and_uniques (self ) -> tuple [npt .NDArray [np .signedinteger ], ArrayLike ]:
604
623
uniques : ArrayLike
@@ -617,11 +636,11 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
617
636
else :
618
637
ucodes = np .arange (len (categories ))
619
638
620
- has_dropped_na = False
621
- if not self . _dropna :
622
- na_mask = cat . isna ()
623
- if np . any ( na_mask ):
624
- has_dropped_na = True
639
+ has_na_values = False
640
+ na_mask = cat . isna ()
641
+ if np . any ( na_mask ):
642
+ has_na_values = True
643
+ if not self . dropna :
625
644
if self ._sort :
626
645
# NA goes at the end, gets `largest non-NA code + 1`
627
646
na_code = len (categories )
@@ -637,11 +656,18 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
637
656
)
638
657
codes = cat .codes
639
658
640
- if has_dropped_na :
641
- if not self ._sort :
642
- # NA code is based on first appearance, increment higher codes
643
- codes = np .where (codes >= na_code , codes + 1 , codes )
644
- codes = np .where (na_mask , na_code , codes )
659
+ if has_na_values :
660
+ if not self .dropna :
661
+ if not self ._sort :
662
+ # NA code is based on first appearance, increment higher codes
663
+ codes = np .where (codes >= na_code , codes + 1 , codes )
664
+ codes = np .where (na_mask , na_code , codes )
665
+ elif self ._dropna is lib .no_default :
666
+ warnings .warn (
667
+ _NULL_KEY_MESSAGE ,
668
+ NullKeyWarning ,
669
+ stacklevel = find_stack_level (),
670
+ )
645
671
646
672
return codes , uniques
647
673
@@ -660,8 +686,16 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
660
686
# error: Incompatible types in assignment (expression has type "Union[
661
687
# ndarray[Any, Any], Index]", variable has type "Categorical")
662
688
codes , uniques = algorithms .factorize ( # type: ignore[assignment]
663
- self .grouping_vector , sort = self ._sort , use_na_sentinel = self ._dropna
689
+ self .grouping_vector , sort = self ._sort , use_na_sentinel = self .dropna
664
690
)
691
+ # TODO: Is `min(codes)` or `-1 in codes` faster?
692
+ if self ._dropna is lib .no_default and (codes == - 1 ).any ():
693
+ warnings .warn (
694
+ _NULL_KEY_MESSAGE ,
695
+ NullKeyWarning ,
696
+ stacklevel = find_stack_level (),
697
+ )
698
+
665
699
return codes , uniques
666
700
667
701
@cache_readonly
0 commit comments