diff --git a/doc/source/whatsnew/v2.3.3.rst b/doc/source/whatsnew/v2.3.3.rst
index aaed7544d9975..0a029f683b6cb 100644
--- a/doc/source/whatsnew/v2.3.3.rst
+++ b/doc/source/whatsnew/v2.3.3.rst
@@ -18,6 +18,16 @@ Most changes in this release are related to :class:`StringDtype` which will
 become the default string dtype in pandas 3.0. See
 :ref:`whatsnew_230.upcoming_changes` for more details.
 
+.. _whatsnew_233.string_fixes.improvements:
+
+Improvements
+^^^^^^^^^^^^
+- Update :meth:`DataFrame.select_dtypes` to keep selecting ``str`` columns when
+  specifying ``include=["object"]`` for backwards compatibility. In a future
+  release, this will be deprecated and code for pandas 3+ should be updated to
+  do ``include=["str"]`` (:issue:`61916`)
+
+
 .. _whatsnew_233.string_fixes.bugs:
 
 Bug fixes
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
index ccc80368b4106..a0f2d0447ea8c 100644
--- a/pandas/core/dtypes/cast.py
+++ b/pandas/core/dtypes/cast.py
@@ -865,7 +865,9 @@ def invalidate_string_dtypes(dtype_set: set[DtypeObj]) -> None:
         np.dtype(" np.ndarray:
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index f61e231736e31..414f02f41e9f0 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -5150,10 +5150,14 @@ def check_int_infer_dtype(dtypes):
         def dtype_predicate(dtype: DtypeObj, dtypes_set) -> bool:
             # GH 46870: BooleanDtype._is_numeric == True but should be excluded
             dtype = dtype if not isinstance(dtype, ArrowDtype) else dtype.numpy_dtype
-            return issubclass(dtype.type, tuple(dtypes_set)) or (
-                np.number in dtypes_set
-                and getattr(dtype, "_is_numeric", False)
-                and not is_bool_dtype(dtype)
+            return (
+                issubclass(dtype.type, tuple(dtypes_set))
+                or (
+                    np.number in dtypes_set
+                    and getattr(dtype, "_is_numeric", False)
+                    and not is_bool_dtype(dtype)
+                )
+                or (dtype.type is str and np.object_ in dtypes_set)
             )
 
         def predicate(arr: ArrayLike) -> bool:
diff --git a/pandas/tests/frame/methods/test_select_dtypes.py b/pandas/tests/frame/methods/test_select_dtypes.py
index 0354e9df3d168..d3e28d328c8fd 100644
--- a/pandas/tests/frame/methods/test_select_dtypes.py
+++ b/pandas/tests/frame/methods/test_select_dtypes.py
@@ -102,6 +102,10 @@ def test_select_dtypes_include_using_list_like(self, using_infer_string):
         ri = df.select_dtypes(include=[str])
         tm.assert_frame_equal(ri, ei)
 
+        ri = df.select_dtypes(include=["object"])
+        ei = df[["a"]]
+        tm.assert_frame_equal(ri, ei)
+
     def test_select_dtypes_exclude_using_list_like(self):
         df = DataFrame(
             {
@@ -309,17 +313,15 @@ def test_select_dtypes_not_an_attr_but_still_valid_dtype(self, using_infer_strin
         df["g"] = df.f.diff()
         assert not hasattr(np, "u8")
         r = df.select_dtypes(include=["i8", "O"], exclude=["timedelta"])
-        if using_infer_string:
-            e = df[["b"]]
-        else:
-            e = df[["a", "b"]]
+        # if using_infer_string:
+        # TODO warn
+        e = df[["a", "b"]]
         tm.assert_frame_equal(r, e)
 
         r = df.select_dtypes(include=["i8", "O", "timedelta64[ns]"])
-        if using_infer_string:
-            e = df[["b", "g"]]
-        else:
-            e = df[["a", "b", "g"]]
+        # if using_infer_string:
+        # TODO warn
+        e = df[["a", "b", "g"]]
         tm.assert_frame_equal(r, e)
 
     def test_select_dtypes_empty(self):