Merge branch '2.3.x' into backport-61909

rhshadrach · web-flow · commit 66caaaea3045 · 2025-08-18T16:11:09.000-05:00
diff --git a/doc/source/user_guide/migration-3-strings.rst b/doc/source/user_guide/migration-3-strings.rst
@@ -118,12 +118,17 @@ through the ``str`` accessor will work the same:
 Overview of behavior differences and how to address them
 ---------------------------------------------------------
 
-The dtype is no longer object dtype
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The dtype is no longer a numpy "object" dtype
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 When inferring or reading string data, the data type of the resulting DataFrame
 column or Series will silently start being the new ``"str"`` dtype instead of
-``"object"`` dtype, and this can have some impact on your code.
+the numpy ``"object"`` dtype, and this can have some impact on your code.
+
+The new string dtype is a pandas data type ("extension dtype"), and no longer a
+numpy ``np.dtype`` instance. Therefore, passing the dtype of a string column to
+numpy functions will no longer work (e.g. passing it to a ``dtype=`` argument
+of a numpy function, or using ``np.issubdtype`` to check the dtype).
 
 Checking the dtype
 ^^^^^^^^^^^^^^^^^^
diff --git a/doc/source/whatsnew/v2.3.2.rst b/doc/source/whatsnew/v2.3.2.rst
@@ -22,7 +22,11 @@ become the default string dtype in pandas 3.0. See
 
 Bug fixes
 ^^^^^^^^^
--
+- Fixed :meth:`~DataFrame.to_json` with ``orient="table"`` to correctly use the
+  "string" type in the JSON Table Schema for :class:`StringDtype` columns
+  (:issue:`61889`)
+- Fixed :meth:`~Series.str.match`, :meth:`~Series.str.fullmatch` and :meth:`~Series.str.contains`
+  with compiled regex for the Arrow-backed string dtype (:issue:`61964`, :issue:`61942`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_232.contributors:
diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py
@@ -301,23 +301,29 @@ def _str_contains(
 
     def _str_match(
         self,
-        pat: str,
+        pat: str | re.Pattern,
         case: bool = True,
         flags: int = 0,
         na: Scalar | lib.NoDefault = lib.no_default,
     ):
-        if not pat.startswith("^"):
+        if isinstance(pat, re.Pattern):
+            # GH#61952
+            pat = pat.pattern
+        if isinstance(pat, str) and not pat.startswith("^"):
             pat = f"^{pat}"
         return self._str_contains(pat, case, flags, na, regex=True)
 
     def _str_fullmatch(
         self,
-        pat,
+        pat: str | re.Pattern,
         case: bool = True,
         flags: int = 0,
         na: Scalar | lib.NoDefault = lib.no_default,
     ):
-        if not pat.endswith("$") or pat.endswith("\\$"):
+        if isinstance(pat, re.Pattern):
+            # GH#61952
+            pat = pat.pattern
+        if isinstance(pat, str) and (not pat.endswith("$") or pat.endswith("\\$")):
             pat = f"{pat}$"
         return self._str_match(pat, case, flags, na)
 
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -2215,8 +2215,16 @@ def _repr_categories(self) -> list[str]:
         )
         from pandas.io.formats import format as fmt
 
+        formatter = None
+        if self.categories.dtype == "str":
+            # the extension array formatter defaults to boxed=True in format_array
+            # override here to boxed=False to be consistent with QUOTE_NONNUMERIC
+            formatter = cast(ExtensionArray, self.categories._values)._formatter(
+                boxed=False
+            )
+
         format_array = partial(
-            fmt.format_array, formatter=None, quoting=QUOTE_NONNUMERIC
+            fmt.format_array, formatter=formatter, quoting=QUOTE_NONNUMERIC
         )
         if len(self.categories) > max_categories:
             num = max_categories // 2
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
@@ -355,6 +355,8 @@ def _str_contains(
     ):
         if flags:
             return super()._str_contains(pat, case, flags, na, regex)
+        if isinstance(pat, re.Pattern):
+            pat = pat.pattern
 
         return ArrowStringArrayMixin._str_contains(self, pat, case, flags, na, regex)
 
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
@@ -2575,6 +2575,22 @@ def __getitem__(self, key):
         return super().__getitem__(key)
 
     def __setitem__(self, key, value) -> None:
+        if not PYPY and using_copy_on_write():
+            if sys.getrefcount(self.obj) <= 2:
+                warnings.warn(
+                    _chained_assignment_msg, ChainedAssignmentError, stacklevel=2
+                )
+        elif not PYPY and not using_copy_on_write():
+            ctr = sys.getrefcount(self.obj)
+            ref_count = 2
+            if not warn_copy_on_write() and _check_cacher(self.obj):
+                # see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221
+                ref_count += 1
+            if ctr <= ref_count:
+                warnings.warn(
+                    _chained_assignment_warning_msg, FutureWarning, stacklevel=2
+                )
+
         if self.ndim == 2 and not self._axes_are_unique:
             # GH#33041 fall back to .loc
             if not isinstance(key, tuple) or not all(is_scalar(x) for x in key):
@@ -2599,6 +2615,25 @@ def _convert_key(self, key):
                 raise ValueError("iAt based indexing can only have integer indexers")
         return key
 
+    def __setitem__(self, key, value) -> None:
+        if not PYPY and using_copy_on_write():
+            if sys.getrefcount(self.obj) <= 2:
+                warnings.warn(
+                    _chained_assignment_msg, ChainedAssignmentError, stacklevel=2
+                )
+        elif not PYPY and not using_copy_on_write():
+            ctr = sys.getrefcount(self.obj)
+            ref_count = 2
+            if not warn_copy_on_write() and _check_cacher(self.obj):
+                # see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221
+                ref_count += 1
+            if ctr <= ref_count:
+                warnings.warn(
+                    _chained_assignment_warning_msg, FutureWarning, stacklevel=2
+                )
+
+        return super().__setitem__(key, value)
+
 
 def _tuplify(ndim: int, loc: Hashable) -> tuple[Hashable | slice, ...]:
     """
diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py
@@ -1353,8 +1353,8 @@ def match(self, pat: str, case: bool = True, flags: int = 0, na=lib.no_default):
 
         Parameters
         ----------
-        pat : str
-            Character sequence.
+        pat : str or compiled regex
+            Character sequence or regular expression.
         case : bool, default True
             If True, case sensitive.
         flags : int, default 0 (no flags)
diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py
@@ -245,14 +245,15 @@ def rep(x, r):
 
     def _str_match(
         self,
-        pat: str,
+        pat: str | re.Pattern,
         case: bool = True,
         flags: int = 0,
         na: Scalar | lib.NoDefault = lib.no_default,
     ):
         if not case:
             flags |= re.IGNORECASE
-
+        if isinstance(pat, re.Pattern):
+            pat = pat.pattern
         regex = re.compile(pat, flags=flags)
 
         f = lambda x: regex.match(x) is not None
@@ -267,7 +268,8 @@ def _str_fullmatch(
     ):
         if not case:
             flags |= re.IGNORECASE
-
+        if isinstance(pat, re.Pattern):
+            pat = pat.pattern
         regex = re.compile(pat, flags=flags)
 
         f = lambda x: regex.fullmatch(x) is not None
diff --git a/pandas/io/json/_table_schema.py b/pandas/io/json/_table_schema.py
@@ -90,8 +90,6 @@ def as_json_table_type(x: DtypeObj) -> str:
         return "datetime"
     elif lib.is_np_dtype(x, "m"):
         return "duration"
-    elif isinstance(x, ExtensionDtype):
-        return "any"
     elif is_string_dtype(x):
         return "string"
     else:
@@ -197,7 +195,7 @@ def convert_json_field_to_pandas_type(field) -> str | CategoricalDtype:
     """
     typ = field["type"]
     if typ == "string":
-        return "object"
+        return field.get("extDtype", None)
     elif typ == "integer":
         return field.get("extDtype", "int64")
     elif typ == "number":
diff --git a/pandas/tests/arrays/categorical/test_repr.py b/pandas/tests/arrays/categorical/test_repr.py
@@ -19,16 +19,11 @@
 class TestCategoricalReprWithFactor:
     def test_print(self, using_infer_string):
         factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True)
-        if using_infer_string:
-            expected = [
-                "['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']",
-                "Categories (3, str): [a < b < c]",
-            ]
-        else:
-            expected = [
-                "['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']",
-                "Categories (3, object): ['a' < 'b' < 'c']",
-            ]
+        dtype = "str" if using_infer_string else "object"
+        expected = [
+            "['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']",
+            f"Categories (3, {dtype}): ['a' < 'b' < 'c']",
+        ]
         expected = "\n".join(expected)
         actual = repr(factor)
         assert actual == expected
diff --git a/pandas/tests/copy_view/test_chained_assignment_deprecation.py b/pandas/tests/copy_view/test_chained_assignment_deprecation.py
@@ -172,3 +172,83 @@ def test_frame_setitem(indexer, using_copy_on_write):
     with option_context("chained_assignment", "warn"):
         with tm.raises_chained_assignment_error(extra_warnings=extra_warnings):
             df[0:3][indexer] = 10
+
+
+@pytest.mark.parametrize(
+    "indexer", [0, [0, 1], slice(0, 2), np.array([True, False, True])]
+)
+def test_series_iloc_setitem(indexer):
+    df = DataFrame({"a": [1, 2, 3], "b": 1})
+
+    with option_context("chained_assignment", "warn"):
+        with tm.raises_chained_assignment_error():
+            df["a"].iloc[indexer] = 0
+
+
+@pytest.mark.parametrize(
+    "indexer", [0, [0, 1], slice(0, 2), np.array([True, False, True])]
+)
+def test_frame_iloc_setitem(indexer, using_copy_on_write):
+    df = DataFrame({"a": [1, 2, 3, 4, 5], "b": 1})
+
+    extra_warnings = () if using_copy_on_write else (SettingWithCopyWarning,)
+
+    with option_context("chained_assignment", "warn"):
+        with tm.raises_chained_assignment_error(extra_warnings=extra_warnings):
+            df[0:3].iloc[indexer] = 10
+
+
+@pytest.mark.parametrize(
+    "indexer", [0, [0, 1], slice(0, 2), np.array([True, False, True])]
+)
+def test_series_loc_setitem(indexer):
+    df = DataFrame({"a": [1, 2, 3], "b": 1})
+
+    with option_context("chained_assignment", "warn"):
+        with tm.raises_chained_assignment_error():
+            df["a"].loc[indexer] = 0
+
+
+@pytest.mark.parametrize(
+    "indexer", [0, [0, 1], (0, "a"), slice(0, 2), np.array([True, False, True])]
+)
+def test_frame_loc_setitem(indexer, using_copy_on_write):
+    df = DataFrame({"a": [1, 2, 3, 4, 5], "b": 1})
+
+    extra_warnings = () if using_copy_on_write else (SettingWithCopyWarning,)
+
+    with option_context("chained_assignment", "warn"):
+        with tm.raises_chained_assignment_error(extra_warnings=extra_warnings):
+            df[0:3].loc[indexer] = 10
+
+
+def test_series_at_setitem():
+    df = DataFrame({"a": [1, 2, 3], "b": 1})
+
+    with option_context("chained_assignment", "warn"):
+        with tm.raises_chained_assignment_error():
+            df["a"].at[0] = 0
+
+
+def test_frame_at_setitem():
+    df = DataFrame({"a": [1, 2, 3, 4, 5], "b": 1})
+
+    with option_context("chained_assignment", "warn"):
+        with tm.raises_chained_assignment_error():
+            df[0:3].at[0, "a"] = 10
+
+
+def test_series_iat_setitem():
+    df = DataFrame({"a": [1, 2, 3], "b": 1})
+
+    with option_context("chained_assignment", "warn"):
+        with tm.raises_chained_assignment_error():
+            df["a"].iat[0] = 0
+
+
+def test_frame_iat_setitem():
+    df = DataFrame({"a": [1, 2, 3, 4, 5], "b": 1})
+
+    with option_context("chained_assignment", "warn"):
+        with tm.raises_chained_assignment_error():
+            df[0:3].iat[0, 0] = 10
diff --git a/pandas/tests/indexes/categorical/test_category.py b/pandas/tests/indexes/categorical/test_category.py
@@ -1,8 +1,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas._libs import index as libindex
 from pandas._libs.arrays import NDArrayBacked
 
@@ -196,7 +194,6 @@ def test_unique(self, data, categories, expected_data, ordered):
         expected = CategoricalIndex(expected_data, dtype=dtype)
         tm.assert_index_equal(idx.unique(), expected)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="repr doesn't roundtrip")
     def test_repr_roundtrip(self):
         ci = CategoricalIndex(["a", "b"], categories=["a", "b"], ordered=True)
         str(ci)
diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py
@@ -69,7 +69,7 @@ def test_build_table_schema(self, df_schema, using_infer_string):
             "primaryKey": ["idx"],
         }
         if using_infer_string:
-            expected["fields"][2] = {"name": "B", "type": "any", "extDtype": "str"}
+            expected["fields"][2] = {"name": "B", "type": "string", "extDtype": "str"}
         assert result == expected
         result = build_table_schema(df_schema)
         assert "pandas_version" in result
@@ -119,10 +119,10 @@ def test_multiindex(self, df_schema, using_infer_string):
         if using_infer_string:
             expected["fields"][0] = {
                 "name": "level_0",
-                "type": "any",
+                "type": "string",
                 "extDtype": "str",
             }
-            expected["fields"][3] = {"name": "B", "type": "any", "extDtype": "str"}
+            expected["fields"][3] = {"name": "B", "type": "string", "extDtype": "str"}
         assert result == expected
 
         df.index.names = ["idx0", None]
@@ -305,7 +305,7 @@ def test_to_json(self, df_table, using_infer_string):
         ]
 
         if using_infer_string:
-            fields[2] = {"name": "B", "type": "any", "extDtype": "str"}
+            fields[2] = {"name": "B", "type": "string", "extDtype": "str"}
 
         schema = {"fields": fields, "primaryKey": ["idx"]}
         data = [
@@ -544,7 +544,7 @@ def test_convert_pandas_type_to_json_field_categorical(self, kind, ordered):
                 },
                 CategoricalDtype(categories=["a", "b", "c"], ordered=True),
             ),
-            ({"type": "string"}, "object"),
+            ({"type": "string"}, None),
         ],
     )
     def test_convert_json_field_to_pandas_type(self, inp, exp):
diff --git a/pandas/tests/io/json/test_json_table_schema_ext_dtype.py b/pandas/tests/io/json/test_json_table_schema_ext_dtype.py
diff --git a/pandas/tests/series/test_formats.py b/pandas/tests/series/test_formats.py
diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py
diff --git a/pandas/tests/tslibs/test_parsing.py b/pandas/tests/tslibs/test_parsing.py
diff --git a/pandas/tests/util/test_assert_series_equal.py b/pandas/tests/util/test_assert_series_equal.py