Skip to content

Commit 9a76ad6

Browse files
committed
DEPR: Categorical with values not present in categories
1 parent 3940df8 commit 9a76ad6

File tree

26 files changed

+222
-91
lines changed

26 files changed

+222
-91
lines changed

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -611,6 +611,7 @@ Other Deprecations
611611
- Deprecated :meth:`Timestamp.utcfromtimestamp`, use ``Timestamp.fromtimestamp(ts, "UTC")`` instead (:issue:`56680`)
612612
- Deprecated :meth:`Timestamp.utcnow`, use ``Timestamp.now("UTC")`` instead (:issue:`56680`)
613613
- Deprecated ``pd.core.internals.api.maybe_infer_ndim`` (:issue:`40226`)
614+
- Deprecated constructing or casting to :class:`Categorical` with non-NA values that are not present in the specified ``dtype.categories`` (:issue:`40996`)
614615
- Deprecated allowing non-keyword arguments in :meth:`DataFrame.all`, :meth:`DataFrame.min`, :meth:`DataFrame.max`, :meth:`DataFrame.sum`, :meth:`DataFrame.prod`, :meth:`DataFrame.mean`, :meth:`DataFrame.median`, :meth:`DataFrame.sem`, :meth:`DataFrame.var`, :meth:`DataFrame.std`, :meth:`DataFrame.skew`, :meth:`DataFrame.kurt`, :meth:`Series.all`, :meth:`Series.min`, :meth:`Series.max`, :meth:`Series.sum`, :meth:`Series.prod`, :meth:`Series.mean`, :meth:`Series.median`, :meth:`Series.sem`, :meth:`Series.var`, :meth:`Series.std`, :meth:`Series.skew`, and :meth:`Series.kurt`. (:issue:`57087`)
615616
- Deprecated allowing non-keyword arguments in :meth:`Series.to_markdown` except ``buf``. (:issue:`57280`)
616617
- Deprecated allowing non-keyword arguments in :meth:`Series.to_string` except ``buf``. (:issue:`57280`)

pandas/core/arrays/categorical.py

Lines changed: 57 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
cast,
1212
overload,
1313
)
14+
import warnings
1415

1516
import numpy as np
1617

@@ -23,6 +24,7 @@
2324
)
2425
from pandas._libs.arrays import NDArrayBacked
2526
from pandas.compat.numpy import function as nv
27+
from pandas.util._exceptions import find_stack_level
2628
from pandas.util._validators import validate_bool_kwarg
2729

2830
from pandas.core.dtypes.cast import (
@@ -479,7 +481,11 @@ def __init__(
479481
elif isinstance(values.dtype, CategoricalDtype):
480482
old_codes = extract_array(values)._codes
481483
codes = recode_for_categories(
482-
old_codes, values.dtype.categories, dtype.categories, copy=copy
484+
old_codes,
485+
values.dtype.categories,
486+
dtype.categories,
487+
copy=copy,
488+
warn=True,
483489
)
484490

485491
else:
@@ -535,7 +541,13 @@ def _from_scalars(cls, scalars, *, dtype: DtypeObj) -> Self:
535541
# The _from_scalars strictness doesn't make much sense in this case.
536542
raise NotImplementedError
537543

538-
res = cls._from_sequence(scalars, dtype=dtype)
544+
with warnings.catch_warnings():
545+
warnings.filterwarnings(
546+
"ignore",
547+
"Constructing a Categorical with a dtype and values",
548+
FutureWarning,
549+
)
550+
res = cls._from_sequence(scalars, dtype=dtype)
539551

540552
# if there are any non-category elements in scalars, these will be
541553
# converted to NAs in res.
@@ -576,6 +588,15 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike:
576588
dtype = self.dtype.update_dtype(dtype)
577589
self = self.copy() if copy else self
578590
result = self._set_dtype(dtype, copy=False)
591+
wrong = result.isna() & ~self.isna()
592+
if wrong.any():
593+
warnings.warn(
594+
"Constructing a Categorical with a dtype and values containing "
595+
"non-null entries not in that dtype's categories is deprecated "
596+
"and will raise in a future version.",
597+
FutureWarning,
598+
stacklevel=find_stack_level(),
599+
)
579600

580601
elif isinstance(dtype, ExtensionDtype):
581602
return super().astype(dtype, copy=copy)
@@ -670,14 +691,16 @@ def _from_inferred_categories(
670691
if known_categories:
671692
# Recode from observation order to dtype.categories order.
672693
categories = dtype.categories
673-
codes = recode_for_categories(inferred_codes, cats, categories, copy=False)
694+
codes = recode_for_categories(
695+
inferred_codes, cats, categories, copy=False, warn=True
696+
)
674697
elif not cats.is_monotonic_increasing:
675698
# Sort categories and recode for unknown categories.
676699
unsorted = cats.copy()
677700
categories = cats.sort_values()
678701

679702
codes = recode_for_categories(
680-
inferred_codes, unsorted, categories, copy=False
703+
inferred_codes, unsorted, categories, copy=False, warn=True
681704
)
682705
dtype = CategoricalDtype(categories, ordered=False)
683706
else:
@@ -1156,7 +1179,7 @@ def set_categories(
11561179
codes = cat._codes
11571180
else:
11581181
codes = recode_for_categories(
1159-
cat.codes, cat.categories, new_dtype.categories, copy=False
1182+
cat.codes, cat.categories, new_dtype.categories, copy=False, warn=False
11601183
)
11611184
NDArrayBacked.__init__(cat, codes, new_dtype)
11621185
return cat
@@ -3004,11 +3027,25 @@ def _get_codes_for_values(
30043027
If `values` is known to be a Categorical, use recode_for_categories instead.
30053028
"""
30063029
codes = categories.get_indexer_for(values)
3030+
wrong = (codes == -1) & ~isna(values)
3031+
if wrong.any():
3032+
warnings.warn(
3033+
"Constructing a Categorical with a dtype and values containing "
3034+
"non-null entries not in that dtype's categories is deprecated "
3035+
"and will raise in a future version.",
3036+
FutureWarning,
3037+
stacklevel=find_stack_level(),
3038+
)
30073039
return coerce_indexer_dtype(codes, categories)
30083040

30093041

30103042
def recode_for_categories(
3011-
codes: np.ndarray, old_categories, new_categories, *, copy: bool
3043+
codes: np.ndarray,
3044+
old_categories,
3045+
new_categories,
3046+
*,
3047+
copy: bool = True,
3048+
warn: bool = False,
30123049
) -> np.ndarray:
30133050
"""
30143051
Convert a set of codes to a new set of categories
@@ -3019,6 +3056,8 @@ def recode_for_categories(
30193056
old_categories, new_categories : Index
30203057
copy : bool, default True
30213058
Whether to copy if the codes are unchanged.
3059+
warn : bool, default False
3060+
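Whether to issue a FutureWarning when an entry of old_categories is not present in new_categories, since codes referring to it are mapped to -1 (NA).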
30223061
30233062
Returns
30243063
-------
@@ -3043,9 +3082,18 @@ def recode_for_categories(
30433082
return codes.copy()
30443083
return codes
30453084

3046-
indexer = coerce_indexer_dtype(
3047-
new_categories.get_indexer_for(old_categories), new_categories
3048-
)
3085+
codes_in_old_cats = new_categories.get_indexer_for(old_categories)
3086+
if warn:
3087+
wrong = codes_in_old_cats == -1
3088+
if wrong.any():
3089+
warnings.warn(
3090+
"Constructing a Categorical with a dtype and values containing "
3091+
"non-null entries not in that dtype's categories is deprecated "
3092+
"and will raise in a future version.",
3093+
FutureWarning,
3094+
stacklevel=find_stack_level(),
3095+
)
3096+
indexer = coerce_indexer_dtype(codes_in_old_cats, new_categories)
30493097
new_codes = take_nd(indexer, codes, fill_value=-1)
30503098
return new_codes
30513099

pandas/core/groupby/ops.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -721,7 +721,7 @@ def groups(self) -> dict[Hashable, Index]:
721721
return self.groupings[0].groups
722722
result_index, ids = self.result_index_and_ids
723723
values = result_index._values
724-
categories = Categorical(ids, categories=range(len(result_index)))
724+
categories = Categorical.from_codes(ids, categories=range(len(result_index)))
725725
result = {
726726
# mypy is not aware that group has to be an integer
727727
values[group]: self.axis.take(axis_ilocs) # type: ignore[call-overload]

pandas/core/indexes/category.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -258,6 +258,12 @@ def _is_dtype_compat(self, other: Index) -> Categorical:
258258
else:
259259
values = other
260260

261+
codes = self.categories.get_indexer(values)
262+
if ((codes == -1) & ~values.isna()).any():
263+
# GH#37667 see test_equals_non_category
264+
raise TypeError(
265+
"categories must match existing categories when appending"
266+
)
261267
cat = Categorical(other, dtype=self.dtype)
262268
other = CategoricalIndex(cat)
263269
if not other.isin(values).all():

pandas/tests/arrays/categorical/test_api.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -289,8 +289,16 @@ def test_set_categories(self):
289289
],
290290
)
291291
def test_set_categories_many(self, values, categories, new_categories, ordered):
292-
c = Categorical(values, categories)
293-
expected = Categorical(values, new_categories, ordered)
292+
msg = "Constructing a Categorical with a dtype and values containing"
293+
294+
warn1 = FutureWarning if set(values).difference(categories) else None
295+
with tm.assert_produces_warning(warn1, match=msg):
296+
c = Categorical(values, categories)
297+
298+
warn2 = FutureWarning if set(values).difference(new_categories) else None
299+
with tm.assert_produces_warning(warn2, match=msg):
300+
expected = Categorical(values, new_categories, ordered)
301+
294302
result = c.set_categories(new_categories, ordered=ordered)
295303
tm.assert_categorical_equal(result, expected)
296304

pandas/tests/arrays/categorical/test_astype.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -121,8 +121,11 @@ def test_astype_category(self, dtype_ordered, ordered):
121121

122122
# non-standard categories
123123
dtype = CategoricalDtype(list("adc"), dtype_ordered)
124-
result = cat.astype(dtype)
125-
expected = Categorical(data, dtype=dtype)
124+
msg = "Constructing a Categorical with a dtype and values containing"
125+
with tm.assert_produces_warning(FutureWarning, match=msg):
126+
result = cat.astype(dtype)
127+
with tm.assert_produces_warning(FutureWarning, match=msg):
128+
expected = Categorical(data, dtype=dtype)
126129
tm.assert_categorical_equal(result, expected)
127130

128131
if dtype_ordered is False:

pandas/tests/arrays/categorical/test_constructors.py

Lines changed: 44 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -228,14 +228,15 @@ def test_constructor(self):
228228
# two arrays
229229
# - when the first is an integer dtype and the second is not
230230
# - when the resulting codes are all -1/NaN
231-
with tm.assert_produces_warning(None):
231+
msg = "Constructing a Categorical with a dtype and values containing"
232+
with tm.assert_produces_warning(FutureWarning, match=msg):
232233
Categorical([0, 1, 2, 0, 1, 2], categories=["a", "b", "c"])
233234

234-
with tm.assert_produces_warning(None):
235+
with tm.assert_produces_warning(FutureWarning, match=msg):
235236
Categorical([0, 1, 2, 0, 1, 2], categories=[3, 4, 5])
236237

237238
# the next ones are from the old docs
238-
with tm.assert_produces_warning(None):
239+
with tm.assert_produces_warning(FutureWarning, match=msg):
239240
Categorical([0, 1, 2, 0, 1, 2], [1, 2, 3])
240241
cat = Categorical([1, 2], categories=[1, 2, 3])
241242

@@ -247,12 +248,16 @@ def test_constructor_with_existing_categories(self):
247248
# GH25318: constructing with pd.Series used to bogusly skip recoding
248249
# categories
249250
c0 = Categorical(["a", "b", "c", "a"])
250-
c1 = Categorical(["a", "b", "c", "a"], categories=["b", "c"])
251+
msg = "Constructing a Categorical with a dtype and values containing"
252+
with tm.assert_produces_warning(FutureWarning, match=msg):
253+
c1 = Categorical(["a", "b", "c", "a"], categories=["b", "c"])
251254

252-
c2 = Categorical(c0, categories=c1.categories)
255+
with tm.assert_produces_warning(FutureWarning, match=msg):
256+
c2 = Categorical(c0, categories=c1.categories)
253257
tm.assert_categorical_equal(c1, c2)
254258

255-
c3 = Categorical(Series(c0), categories=c1.categories)
259+
with tm.assert_produces_warning(FutureWarning, match=msg):
260+
c3 = Categorical(Series(c0), categories=c1.categories)
256261
tm.assert_categorical_equal(c1, c3)
257262

258263
def test_constructor_not_sequence(self):
@@ -430,10 +435,13 @@ def test_constructor_dtype_and_others_raises(self):
430435

431436
@pytest.mark.parametrize("categories", [None, ["a", "b"], ["a", "c"]])
432437
def test_constructor_str_category(self, categories, ordered):
433-
result = Categorical(
434-
["a", "b"], categories=categories, ordered=ordered, dtype="category"
435-
)
436-
expected = Categorical(["a", "b"], categories=categories, ordered=ordered)
438+
warn = FutureWarning if categories == ["a", "c"] else None
439+
msg = "Constructing a Categorical with a dtype and values containing"
440+
with tm.assert_produces_warning(warn, match=msg):
441+
result = Categorical(
442+
["a", "b"], categories=categories, ordered=ordered, dtype="category"
443+
)
444+
expected = Categorical(["a", "b"], categories=categories, ordered=ordered)
437445
tm.assert_categorical_equal(result, expected)
438446

439447
def test_constructor_str_unknown(self):
@@ -450,10 +458,12 @@ def test_constructor_np_strs(self):
450458
def test_constructor_from_categorical_with_dtype(self):
451459
dtype = CategoricalDtype(["a", "b", "c"], ordered=True)
452460
values = Categorical(["a", "b", "d"])
453-
result = Categorical(values, dtype=dtype)
461+
msg = "Constructing a Categorical with a dtype and values containing"
462+
with tm.assert_produces_warning(FutureWarning, match=msg):
463+
result = Categorical(values, dtype=dtype)
454464
# We use dtype.categories, not values.categories
455465
expected = Categorical(
456-
["a", "b", "d"], categories=["a", "b", "c"], ordered=True
466+
["a", "b", None], categories=["a", "b", "c"], ordered=True
457467
)
458468
tm.assert_categorical_equal(result, expected)
459469

@@ -470,16 +480,19 @@ def test_constructor_from_categorical_with_unknown_dtype(self):
470480
def test_constructor_from_categorical_string(self):
471481
values = Categorical(["a", "b", "d"])
472482
# use categories, ordered
473-
result = Categorical(
474-
values, categories=["a", "b", "c"], ordered=True, dtype="category"
475-
)
483+
msg = "Constructing a Categorical with a dtype and values containing"
484+
with tm.assert_produces_warning(FutureWarning, match=msg):
485+
result = Categorical(
486+
values, categories=["a", "b", "c"], ordered=True, dtype="category"
487+
)
476488
expected = Categorical(
477-
["a", "b", "d"], categories=["a", "b", "c"], ordered=True
489+
["a", "b", None], categories=["a", "b", "c"], ordered=True
478490
)
479491
tm.assert_categorical_equal(result, expected)
480492

481493
# No string
482-
result = Categorical(values, categories=["a", "b", "c"], ordered=True)
494+
with tm.assert_produces_warning(FutureWarning, match=msg):
495+
result = Categorical(values, categories=["a", "b", "c"], ordered=True)
483496
tm.assert_categorical_equal(result, expected)
484497

485498
def test_constructor_with_categorical_categories(self):
@@ -661,17 +674,25 @@ def test_from_inferred_categories_dtype(self):
661674
cats = ["a", "b", "d"]
662675
codes = np.array([0, 1, 0, 2], dtype="i8")
663676
dtype = CategoricalDtype(["c", "b", "a"], ordered=True)
664-
result = Categorical._from_inferred_categories(cats, codes, dtype)
677+
msg = "Constructing a Categorical with a dtype and values containing"
678+
with tm.assert_produces_warning(
679+
FutureWarning, match=msg, check_stacklevel=False
680+
):
681+
result = Categorical._from_inferred_categories(cats, codes, dtype)
665682
expected = Categorical(
666-
["a", "b", "a", "d"], categories=["c", "b", "a"], ordered=True
683+
["a", "b", "a", None], categories=["c", "b", "a"], ordered=True
667684
)
668685
tm.assert_categorical_equal(result, expected)
669686

670687
def test_from_inferred_categories_coerces(self):
671688
cats = ["1", "2", "bad"]
672689
codes = np.array([0, 0, 1, 2], dtype="i8")
673690
dtype = CategoricalDtype([1, 2])
674-
result = Categorical._from_inferred_categories(cats, codes, dtype)
691+
msg = "Constructing a Categorical with a dtype and values containing"
692+
with tm.assert_produces_warning(
693+
FutureWarning, match=msg, check_stacklevel=False
694+
):
695+
result = Categorical._from_inferred_categories(cats, codes, dtype)
675696
expected = Categorical([1, 1, 2, np.nan])
676697
tm.assert_categorical_equal(result, expected)
677698

@@ -722,7 +743,9 @@ def test_interval(self):
722743

723744
# extra
724745
values = pd.interval_range(8, 11, periods=3)
725-
cat = Categorical(values, categories=idx)
746+
msg = "Constructing a Categorical with a dtype and values containing"
747+
with tm.assert_produces_warning(FutureWarning, match=msg):
748+
cat = Categorical(values, categories=idx)
726749
expected_codes = np.array([8, 9, -1], dtype="int8")
727750
tm.assert_numpy_array_equal(cat.codes, expected_codes)
728751
tm.assert_index_equal(cat.categories, idx)

0 commit comments

Comments
 (0)