Skip to content

Commit 66caaae

Browse files
authored
Merge branch '2.3.x' into backport-61909
2 parents 7f6206c + 7981a43 commit 66caaae

File tree

18 files changed

+227
-79
lines changed

18 files changed

+227
-79
lines changed

doc/source/user_guide/migration-3-strings.rst

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -118,12 +118,17 @@ through the ``str`` accessor will work the same:
118118
Overview of behavior differences and how to address them
119119
---------------------------------------------------------
120120

121-
The dtype is no longer object dtype
122-
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
121+
The dtype is no longer a numpy "object" dtype
122+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
123123

124124
When inferring or reading string data, the data type of the resulting DataFrame
125125
column or Series will silently start being the new ``"str"`` dtype instead of
126-
``"object"`` dtype, and this can have some impact on your code.
126+
the numpy ``"object"`` dtype, and this can have some impact on your code.
127+
128+
The new string dtype is a pandas data type ("extension dtype"), and no longer a
129+
numpy ``np.dtype`` instance. Therefore, passing the dtype of a string column to
130+
numpy functions will no longer work (e.g. passing it to a ``dtype=`` argument
131+
of a numpy function, or using ``np.issubdtype`` to check the dtype).
127132

128133
Checking the dtype
129134
^^^^^^^^^^^^^^^^^^

doc/source/whatsnew/v2.3.2.rst

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,11 @@ become the default string dtype in pandas 3.0. See
2222

2323
Bug fixes
2424
^^^^^^^^^
25-
-
25+
- Fixed :meth:`~DataFrame.to_json` with ``orient="table"`` to correctly use the
26+
"string" type in the JSON Table Schema for :class:`StringDtype` columns
27+
(:issue:`61889`)
28+
- Fixed :meth:`Series.str.match`, :meth:`Series.str.fullmatch` and :meth:`Series.str.contains`
29+
with compiled regex for the Arrow-backed string dtype (:issue:`61964`, :issue:`61942`)
2630

2731
.. ---------------------------------------------------------------------------
2832
.. _whatsnew_232.contributors:

pandas/core/arrays/_arrow_string_mixins.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -301,23 +301,29 @@ def _str_contains(
301301

302302
def _str_match(
303303
self,
304-
pat: str,
304+
pat: str | re.Pattern,
305305
case: bool = True,
306306
flags: int = 0,
307307
na: Scalar | lib.NoDefault = lib.no_default,
308308
):
309-
if not pat.startswith("^"):
309+
if isinstance(pat, re.Pattern):
310+
# GH#61952
311+
pat = pat.pattern
312+
if isinstance(pat, str) and not pat.startswith("^"):
310313
pat = f"^{pat}"
311314
return self._str_contains(pat, case, flags, na, regex=True)
312315

313316
def _str_fullmatch(
314317
self,
315-
pat,
318+
pat: str | re.Pattern,
316319
case: bool = True,
317320
flags: int = 0,
318321
na: Scalar | lib.NoDefault = lib.no_default,
319322
):
320-
if not pat.endswith("$") or pat.endswith("\\$"):
323+
if isinstance(pat, re.Pattern):
324+
# GH#61952
325+
pat = pat.pattern
326+
if isinstance(pat, str) and (not pat.endswith("$") or pat.endswith("\\$")):
321327
pat = f"{pat}$"
322328
return self._str_match(pat, case, flags, na)
323329

pandas/core/arrays/categorical.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2215,8 +2215,16 @@ def _repr_categories(self) -> list[str]:
22152215
)
22162216
from pandas.io.formats import format as fmt
22172217

2218+
formatter = None
2219+
if self.categories.dtype == "str":
2220+
# the extension array formatter defaults to boxed=True in format_array
2221+
# override here to boxed=False to be consistent with QUOTE_NONNUMERIC
2222+
formatter = cast(ExtensionArray, self.categories._values)._formatter(
2223+
boxed=False
2224+
)
2225+
22182226
format_array = partial(
2219-
fmt.format_array, formatter=None, quoting=QUOTE_NONNUMERIC
2227+
fmt.format_array, formatter=formatter, quoting=QUOTE_NONNUMERIC
22202228
)
22212229
if len(self.categories) > max_categories:
22222230
num = max_categories // 2

pandas/core/arrays/string_arrow.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -355,6 +355,8 @@ def _str_contains(
355355
):
356356
if flags:
357357
return super()._str_contains(pat, case, flags, na, regex)
358+
if isinstance(pat, re.Pattern):
359+
pat = pat.pattern
358360

359361
return ArrowStringArrayMixin._str_contains(self, pat, case, flags, na, regex)
360362

pandas/core/indexing.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2575,6 +2575,22 @@ def __getitem__(self, key):
25752575
return super().__getitem__(key)
25762576

25772577
def __setitem__(self, key, value) -> None:
2578+
if not PYPY and using_copy_on_write():
2579+
if sys.getrefcount(self.obj) <= 2:
2580+
warnings.warn(
2581+
_chained_assignment_msg, ChainedAssignmentError, stacklevel=2
2582+
)
2583+
elif not PYPY and not using_copy_on_write():
2584+
ctr = sys.getrefcount(self.obj)
2585+
ref_count = 2
2586+
if not warn_copy_on_write() and _check_cacher(self.obj):
2587+
# see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221
2588+
ref_count += 1
2589+
if ctr <= ref_count:
2590+
warnings.warn(
2591+
_chained_assignment_warning_msg, FutureWarning, stacklevel=2
2592+
)
2593+
25782594
if self.ndim == 2 and not self._axes_are_unique:
25792595
# GH#33041 fall back to .loc
25802596
if not isinstance(key, tuple) or not all(is_scalar(x) for x in key):
@@ -2599,6 +2615,25 @@ def _convert_key(self, key):
25992615
raise ValueError("iAt based indexing can only have integer indexers")
26002616
return key
26012617

2618+
def __setitem__(self, key, value) -> None:
2619+
if not PYPY and using_copy_on_write():
2620+
if sys.getrefcount(self.obj) <= 2:
2621+
warnings.warn(
2622+
_chained_assignment_msg, ChainedAssignmentError, stacklevel=2
2623+
)
2624+
elif not PYPY and not using_copy_on_write():
2625+
ctr = sys.getrefcount(self.obj)
2626+
ref_count = 2
2627+
if not warn_copy_on_write() and _check_cacher(self.obj):
2628+
# see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221
2629+
ref_count += 1
2630+
if ctr <= ref_count:
2631+
warnings.warn(
2632+
_chained_assignment_warning_msg, FutureWarning, stacklevel=2
2633+
)
2634+
2635+
return super().__setitem__(key, value)
2636+
26022637

26032638
def _tuplify(ndim: int, loc: Hashable) -> tuple[Hashable | slice, ...]:
26042639
"""

pandas/core/strings/accessor.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1353,8 +1353,8 @@ def match(self, pat: str, case: bool = True, flags: int = 0, na=lib.no_default):
13531353
13541354
Parameters
13551355
----------
1356-
pat : str
1357-
Character sequence.
1356+
pat : str or compiled regex
1357+
Character sequence or regular expression.
13581358
case : bool, default True
13591359
If True, case sensitive.
13601360
flags : int, default 0 (no flags)

pandas/core/strings/object_array.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -245,14 +245,15 @@ def rep(x, r):
245245

246246
def _str_match(
247247
self,
248-
pat: str,
248+
pat: str | re.Pattern,
249249
case: bool = True,
250250
flags: int = 0,
251251
na: Scalar | lib.NoDefault = lib.no_default,
252252
):
253253
if not case:
254254
flags |= re.IGNORECASE
255-
255+
if isinstance(pat, re.Pattern):
256+
pat = pat.pattern
256257
regex = re.compile(pat, flags=flags)
257258

258259
f = lambda x: regex.match(x) is not None
@@ -267,7 +268,8 @@ def _str_fullmatch(
267268
):
268269
if not case:
269270
flags |= re.IGNORECASE
270-
271+
if isinstance(pat, re.Pattern):
272+
pat = pat.pattern
271273
regex = re.compile(pat, flags=flags)
272274

273275
f = lambda x: regex.fullmatch(x) is not None

pandas/io/json/_table_schema.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -90,8 +90,6 @@ def as_json_table_type(x: DtypeObj) -> str:
9090
return "datetime"
9191
elif lib.is_np_dtype(x, "m"):
9292
return "duration"
93-
elif isinstance(x, ExtensionDtype):
94-
return "any"
9593
elif is_string_dtype(x):
9694
return "string"
9795
else:
@@ -197,7 +195,7 @@ def convert_json_field_to_pandas_type(field) -> str | CategoricalDtype:
197195
"""
198196
typ = field["type"]
199197
if typ == "string":
200-
return "object"
198+
return field.get("extDtype", None)
201199
elif typ == "integer":
202200
return field.get("extDtype", "int64")
203201
elif typ == "number":

pandas/tests/arrays/categorical/test_repr.py

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -19,16 +19,11 @@
1919
class TestCategoricalReprWithFactor:
2020
def test_print(self, using_infer_string):
2121
factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True)
22-
if using_infer_string:
23-
expected = [
24-
"['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']",
25-
"Categories (3, str): [a < b < c]",
26-
]
27-
else:
28-
expected = [
29-
"['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']",
30-
"Categories (3, object): ['a' < 'b' < 'c']",
31-
]
22+
dtype = "str" if using_infer_string else "object"
23+
expected = [
24+
"['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']",
25+
f"Categories (3, {dtype}): ['a' < 'b' < 'c']",
26+
]
3227
expected = "\n".join(expected)
3328
actual = repr(factor)
3429
assert actual == expected

0 commit comments

Comments
 (0)