Backport PR pandas-dev#62323 on branch 2.3.x (String dtype: keep select_dtypes(include=object) selecting string columns) (pandas-dev#62400)

meeseeksmachine · jorisvandenbossche · web-flow · commit 0426e59e7210 · 2025-09-22T09:49:09.000-07:00
Co-authored-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
diff --git a/doc/source/whatsnew/v2.3.3.rst b/doc/source/whatsnew/v2.3.3.rst
@@ -18,6 +18,16 @@ Most changes in this release are related to :class:`StringDtype` which will
 become the default string dtype in pandas 3.0. See
 :ref:`whatsnew_230.upcoming_changes` for more details.
 
+.. _whatsnew_233.string_fixes.improvements:
+
+Improvements
+^^^^^^^^^^^^
+- Update :meth:`DataFrame.select_dtypes` to keep selecting ``str`` columns when
+  specifying ``include=["object"]`` for backwards compatibility. In a future
+  release, this will be deprecated and code for pandas 3+ should be updated to
+  do ``include=["str"]`` (:issue:`61916`)
+
+
 .. _whatsnew_233.string_fixes.bugs:
 
 Bug fixes
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
@@ -966,7 +966,9 @@ def invalidate_string_dtypes(dtype_set: set[DtypeObj]) -> None:
         np.dtype("<U").type,  # type: ignore[arg-type]
     }
     if non_string_dtypes != dtype_set:
-        raise TypeError("string dtypes are not allowed, use 'object' instead")
+        raise TypeError(
+            "numpy string dtypes are not allowed, use 'str' or 'object' instead"
+        )
 
 
 def coerce_indexer_dtype(indexer, categories) -> np.ndarray:
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -144,6 +144,7 @@
     TimedeltaArray,
 )
 from pandas.core.arrays.sparse import SparseFrameAccessor
+from pandas.core.arrays.string_ import StringDtype
 from pandas.core.construction import (
     ensure_wrapped_if_datetimelike,
     sanitize_array,
@@ -5080,10 +5081,19 @@ def check_int_infer_dtype(dtypes):
         def dtype_predicate(dtype: DtypeObj, dtypes_set) -> bool:
             # GH 46870: BooleanDtype._is_numeric == True but should be excluded
             dtype = dtype if not isinstance(dtype, ArrowDtype) else dtype.numpy_dtype
-            return issubclass(dtype.type, tuple(dtypes_set)) or (
-                np.number in dtypes_set
-                and getattr(dtype, "_is_numeric", False)
-                and not is_bool_dtype(dtype)
+            return (
+                issubclass(dtype.type, tuple(dtypes_set))
+                or (
+                    np.number in dtypes_set
+                    and getattr(dtype, "_is_numeric", False)
+                    and not is_bool_dtype(dtype)
+                )
+                # backwards compat for the default `str` dtype being selected by object
+                or (
+                    isinstance(dtype, StringDtype)
+                    and dtype.na_value is np.nan
+                    and np.object_ in dtypes_set
+                )
             )
 
         def predicate(arr: ArrayLike) -> bool:
diff --git a/pandas/tests/frame/methods/test_select_dtypes.py b/pandas/tests/frame/methods/test_select_dtypes.py
@@ -102,6 +102,10 @@ def test_select_dtypes_include_using_list_like(self, using_infer_string):
             ri = df.select_dtypes(include=[str])
             tm.assert_frame_equal(ri, ei)
 
+        ri = df.select_dtypes(include=["object"])
+        ei = df[["a"]]
+        tm.assert_frame_equal(ri, ei)
+
     def test_select_dtypes_exclude_using_list_like(self):
         df = DataFrame(
             {
@@ -309,17 +313,15 @@ def test_select_dtypes_not_an_attr_but_still_valid_dtype(self, using_infer_strin
         df["g"] = df.f.diff()
         assert not hasattr(np, "u8")
         r = df.select_dtypes(include=["i8", "O"], exclude=["timedelta"])
-        if using_infer_string:
-            e = df[["b"]]
-        else:
-            e = df[["a", "b"]]
+        # if using_infer_string:
+        #     TODO warn
+        e = df[["a", "b"]]
         tm.assert_frame_equal(r, e)
 
         r = df.select_dtypes(include=["i8", "O", "timedelta64[ns]"])
-        if using_infer_string:
-            e = df[["b", "g"]]
-        else:
-            e = df[["a", "b", "g"]]
+        # if using_infer_string:
+        #     TODO warn
+        e = df[["a", "b", "g"]]
         tm.assert_frame_equal(r, e)
 
     def test_select_dtypes_empty(self):
@@ -483,3 +485,26 @@ def test_select_dtypes_no_view(self):
         result = df.select_dtypes(include=["number"])
         result.iloc[0, 0] = 0
         tm.assert_frame_equal(df, df_orig)
+
+    def test_select_dtype_object_and_str(self, using_infer_string):
+        # https://github.com/pandas-dev/pandas/issues/61916
+        df = DataFrame(
+            {
+                "a": ["a", "b", "c"],
+                "b": [1, 2, 3],
+                "c": pd.array(["a", "b", "c"], dtype="string"),
+            }
+        )
+
+        # with "object" -> only select the object or default str dtype column
+        result = df.select_dtypes(include=["object"])
+        expected = df[["a"]]
+        tm.assert_frame_equal(result, expected)
+
+        # with "string" -> select both the default 'str' and the nullable 'string'
+        result = df.select_dtypes(include=["string"])
+        if using_infer_string:
+            expected = df[["a", "c"]]
+        else:
+            expected = df[["c"]]
+        tm.assert_frame_equal(result, expected)

Original file line number	Diff line number	Diff line change
`@@ -966,7 +966,9 @@ def invalidate_string_dtypes(dtype_set: set[DtypeObj]) -> None:`
`966`	`966`	`np.dtype("<U").type, # type: ignore[arg-type]`
`967`	`967`	`}`
`968`	`968`	`if non_string_dtypes != dtype_set:`
`969`		`- raise TypeError("string dtypes are not allowed, use 'object' instead")`
	`969`	`+ raise TypeError(`
	`970`	`+ "numpy string dtypes are not allowed, use 'str' or 'object' instead"`
	`971`	`+ )`
`970`	`972`
`971`	`973`
`972`	`974`	`def coerce_indexer_dtype(indexer, categories) -> np.ndarray:`