Skip to content

Commit 6ffc7a9

Browse files
committed
Merge remote-tracking branch 'upstream/main' into deps/optional/bump
2 parents 1119bc9 + 2891172 commit 6ffc7a9

File tree

9 files changed

+231
-24
lines changed

9 files changed

+231
-24
lines changed

doc/source/whatsnew/index.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ Version 2.3
2424
.. toctree::
2525
:maxdepth: 2
2626

27+
v2.3.3
2728
v2.3.2
2829
v2.3.1
2930
v2.3.0

doc/source/whatsnew/v2.3.3.rst

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
.. _whatsnew_233:
2+
3+
What's new in 2.3.3 (September XX, 2025)
4+
----------------------------------------
5+
6+
These are the changes in pandas 2.3.3. See :ref:`release` for a full changelog
7+
including other versions of pandas.
8+
9+
{{ header }}
10+
11+
.. ---------------------------------------------------------------------------
12+
.. _whatsnew_233.string_fixes:
13+
14+
Improvements and fixes for the StringDtype
15+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
16+
17+
Most changes in this release are related to :class:`StringDtype`, which will
18+
become the default string dtype in pandas 3.0. See
19+
:ref:`whatsnew_230.upcoming_changes` for more details.
20+
21+
.. _whatsnew_233.string_fixes.bugs:
22+
23+
Bug fixes
24+
^^^^^^^^^
25+
- Fix regression in :meth:`~Series.str.contains`, :meth:`~Series.str.match` and :meth:`~Series.str.fullmatch`
26+
with a compiled regex and custom flags (:issue:`62240`)
27+
28+
.. ---------------------------------------------------------------------------
29+
.. _whatsnew_233.contributors:
30+
31+
Contributors
32+
~~~~~~~~~~~~

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -965,6 +965,7 @@ Indexing
965965
- Bug in :meth:`Series.__setitem__` raising ``LossySetitemError`` when assigning a boolean series with a boolean indexer (:issue:`57338`)
966966
- Bug in printing :attr:`Index.names` and :attr:`MultiIndex.levels` would not escape single quotes (:issue:`60190`)
967967
- Bug in reindexing of :class:`DataFrame` with :class:`PeriodDtype` columns in case of consolidated block (:issue:`60980`, :issue:`60273`)
968+
- Bug in :meth:`DataFrame.loc.__getitem__` and :meth:`DataFrame.iloc.__getitem__` with a :class:`CategoricalDtype` column with integer categories raising when trying to index a row containing a ``NaN`` entry (:issue:`58954`)
968969
- Bug in :meth:`Index.__getitem__` incorrectly raising with a 0-dim ``np.ndarray`` key (:issue:`55601`)
969970

970971
Missing

pandas/core/arrays/_arrow_string_mixins.py

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -300,29 +300,23 @@ def _str_contains(
300300

301301
def _str_match(
302302
self,
303-
pat: str | re.Pattern,
303+
pat: str,
304304
case: bool = True,
305305
flags: int = 0,
306306
na: Scalar | lib.NoDefault = lib.no_default,
307307
):
308-
if isinstance(pat, re.Pattern):
309-
# GH#61952
310-
pat = pat.pattern
311-
if isinstance(pat, str) and not pat.startswith("^"):
308+
if not pat.startswith("^"):
312309
pat = f"^{pat}"
313310
return self._str_contains(pat, case, flags, na, regex=True)
314311

315312
def _str_fullmatch(
316313
self,
317-
pat: str | re.Pattern,
314+
pat: str,
318315
case: bool = True,
319316
flags: int = 0,
320317
na: Scalar | lib.NoDefault = lib.no_default,
321318
):
322-
if isinstance(pat, re.Pattern):
323-
# GH#61952
324-
pat = pat.pattern
325-
if isinstance(pat, str) and (not pat.endswith("$") or pat.endswith("\\$")):
319+
if not pat.endswith("$") or pat.endswith("\\$"):
326320
pat = f"{pat}$"
327321
return self._str_match(pat, case, flags, na)
328322

pandas/core/arrays/string_arrow.py

Lines changed: 54 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@
5454
ArrayLike,
5555
Dtype,
5656
NpDtype,
57+
Scalar,
5758
npt,
5859
)
5960

@@ -335,8 +336,6 @@ def astype(self, dtype, copy: bool = True):
335336
_str_startswith = ArrowStringArrayMixin._str_startswith
336337
_str_endswith = ArrowStringArrayMixin._str_endswith
337338
_str_pad = ArrowStringArrayMixin._str_pad
338-
_str_match = ArrowStringArrayMixin._str_match
339-
_str_fullmatch = ArrowStringArrayMixin._str_fullmatch
340339
_str_lower = ArrowStringArrayMixin._str_lower
341340
_str_upper = ArrowStringArrayMixin._str_upper
342341
_str_strip = ArrowStringArrayMixin._str_strip
@@ -351,6 +350,28 @@ def astype(self, dtype, copy: bool = True):
351350
_str_len = ArrowStringArrayMixin._str_len
352351
_str_slice = ArrowStringArrayMixin._str_slice
353352

353+
@staticmethod
354+
def _is_re_pattern_with_flags(pat: str | re.Pattern) -> bool:
355+
# check if `pat` is a compiled regex pattern with flags that are not
356+
# supported by pyarrow
357+
return (
358+
isinstance(pat, re.Pattern)
359+
and (pat.flags & ~(re.IGNORECASE | re.UNICODE)) != 0
360+
)
361+
362+
@staticmethod
363+
def _preprocess_re_pattern(pat: re.Pattern, case: bool) -> tuple[str, bool, int]:
364+
pattern = pat.pattern
365+
flags = pat.flags
366+
# flags is not supported by pyarrow, but `case` is -> extract and remove
367+
if flags & re.IGNORECASE:
368+
case = False
369+
flags = flags & ~re.IGNORECASE
370+
# when creating a pattern with re.compile and a string, it automatically
371+
# gets a UNICODE flag, while pyarrow assumes unicode for strings anyway
372+
flags = flags & ~re.UNICODE
373+
return pattern, case, flags
374+
354375
def _str_contains(
355376
self,
356377
pat,
@@ -359,13 +380,42 @@ def _str_contains(
359380
na=lib.no_default,
360381
regex: bool = True,
361382
):
362-
if flags:
383+
if flags or self._is_re_pattern_with_flags(pat):
363384
return super()._str_contains(pat, case, flags, na, regex)
364385
if isinstance(pat, re.Pattern):
365-
pat = pat.pattern
386+
# TODO flags passed separately by user are ignored
387+
pat, case, flags = self._preprocess_re_pattern(pat, case)
366388

367389
return ArrowStringArrayMixin._str_contains(self, pat, case, flags, na, regex)
368390

391+
def _str_match(
392+
self,
393+
pat: str | re.Pattern,
394+
case: bool = True,
395+
flags: int = 0,
396+
na: Scalar | lib.NoDefault = lib.no_default,
397+
):
398+
if flags or self._is_re_pattern_with_flags(pat):
399+
return super()._str_match(pat, case, flags, na)
400+
if isinstance(pat, re.Pattern):
401+
pat, case, flags = self._preprocess_re_pattern(pat, case)
402+
403+
return ArrowStringArrayMixin._str_match(self, pat, case, flags, na)
404+
405+
def _str_fullmatch(
406+
self,
407+
pat: str | re.Pattern,
408+
case: bool = True,
409+
flags: int = 0,
410+
na: Scalar | lib.NoDefault = lib.no_default,
411+
):
412+
if flags or self._is_re_pattern_with_flags(pat):
413+
return super()._str_fullmatch(pat, case, flags, na)
414+
if isinstance(pat, re.Pattern):
415+
pat, case, flags = self._preprocess_re_pattern(pat, case)
416+
417+
return ArrowStringArrayMixin._str_fullmatch(self, pat, case, flags, na)
418+
369419
def _str_replace(
370420
self,
371421
pat: str | re.Pattern,

pandas/core/internals/managers.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@
5050
is_list_like,
5151
)
5252
from pandas.core.dtypes.dtypes import (
53+
CategoricalDtype,
5354
DatetimeTZDtype,
5455
ExtensionDtype,
5556
SparseDtype,
@@ -1138,7 +1139,24 @@ def fast_xs(self, loc: int) -> SingleBlockManager:
11381139
# Such assignment may incorrectly coerce NaT to None
11391140
# result[blk.mgr_locs] = blk._slice((slice(None), loc))
11401141
for i, rl in enumerate(blk.mgr_locs):
1141-
result[rl] = blk.iget((i, loc))
1142+
item = blk.iget((i, loc))
1143+
if (
1144+
result.dtype.kind in "iub"
1145+
and lib.is_float(item)
1146+
and isna(item)
1147+
and isinstance(blk.dtype, CategoricalDtype)
1148+
):
1149+
# GH#58954 caused bc interleaved_dtype is wrong for Categorical
1150+
# TODO(GH#38240) this will be unnecessary
1151+
# Note that doing this in a try/except would work for the
1152+
# integer case, but not for bool, which will cast the NaN
1153+
# entry to True.
1154+
if result.dtype.kind == "b":
1155+
new_dtype = object
1156+
else:
1157+
new_dtype = np.float64
1158+
result = result.astype(new_dtype)
1159+
result[rl] = item
11421160

11431161
if isinstance(dtype, ExtensionDtype):
11441162
cls = dtype.construct_array_type()

pandas/core/strings/object_array.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -262,8 +262,7 @@ def _str_match(
262262
):
263263
if not case:
264264
flags |= re.IGNORECASE
265-
if isinstance(pat, re.Pattern):
266-
pat = pat.pattern
265+
267266
regex = re.compile(pat, flags=flags)
268267

269268
f = lambda x: regex.match(x) is not None
@@ -278,8 +277,7 @@ def _str_fullmatch(
278277
):
279278
if not case:
280279
flags |= re.IGNORECASE
281-
if isinstance(pat, re.Pattern):
282-
pat = pat.pattern
280+
283281
regex = re.compile(pat, flags=flags)
284282

285283
f = lambda x: regex.fullmatch(x) is not None

pandas/tests/indexing/test_categorical.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -571,3 +571,25 @@ def test_getitem_categorical_with_nan(self):
571571
df = DataFrame(ser)
572572
assert df.loc[np.nan, 0] == 2
573573
assert df.loc[np.nan][0] == 2
574+
575+
def test_getitem_row_categorical_with_nan(self):
576+
# GH#58954
577+
df = DataFrame({"a": [1, 2], "b": CategoricalIndex([1, None])})
578+
579+
res = df.iloc[1]
580+
expected = Series([2, np.nan], index=df.columns, name=1)
581+
tm.assert_series_equal(res, expected)
582+
583+
res = df.loc[1]
584+
tm.assert_series_equal(res, expected)
585+
586+
def test_getitem_row_categorical_with_nan_bool(self):
587+
# GH#58954
588+
df = DataFrame({"a": [True, False], "b": CategoricalIndex([False, None])})
589+
590+
res = df.iloc[1]
591+
expected = Series([False, np.nan], index=df.columns, dtype=object, name=1)
592+
tm.assert_series_equal(res, expected)
593+
594+
res = df.loc[1]
595+
tm.assert_series_equal(res, expected)

pandas/tests/strings/test_find_replace.py

Lines changed: 96 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -283,13 +283,60 @@ def test_contains_nan(any_string_dtype):
283283

284284
def test_contains_compiled_regex(any_string_dtype):
285285
# GH#61942
286-
ser = Series(["foo", "bar", "baz"], dtype=any_string_dtype)
286+
expected_dtype = (
287+
np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
288+
)
289+
290+
ser = Series(["foo", "bar", "Baz"], dtype=any_string_dtype)
291+
287292
pat = re.compile("ba.")
288293
result = ser.str.contains(pat)
294+
expected = Series([False, True, False], dtype=expected_dtype)
295+
tm.assert_series_equal(result, expected)
296+
297+
# TODO this currently works for pyarrow-backed dtypes but raises for python
298+
if any_string_dtype == "string" and any_string_dtype.storage == "pyarrow":
299+
result = ser.str.contains(pat, case=False)
300+
expected = Series([False, True, True], dtype=expected_dtype)
301+
tm.assert_series_equal(result, expected)
302+
else:
303+
with pytest.raises(
304+
ValueError, match="cannot process flags argument with a compiled pattern"
305+
):
306+
ser.str.contains(pat, case=False)
307+
308+
pat = re.compile("ba.", flags=re.IGNORECASE)
309+
result = ser.str.contains(pat)
310+
expected = Series([False, True, True], dtype=expected_dtype)
311+
tm.assert_series_equal(result, expected)
312+
313+
# TODO should this be supported?
314+
with pytest.raises(
315+
ValueError, match="cannot process flags argument with a compiled pattern"
316+
):
317+
ser.str.contains(pat, flags=re.IGNORECASE)
318+
289319

320+
def test_contains_compiled_regex_flags(any_string_dtype):
321+
# ensure other (than ignorecase) flags are respected
290322
expected_dtype = (
291323
np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
292324
)
325+
326+
ser = Series(["foobar", "foo\nbar", "Baz"], dtype=any_string_dtype)
327+
328+
pat = re.compile("^ba")
329+
result = ser.str.contains(pat)
330+
expected = Series([False, False, False], dtype=expected_dtype)
331+
tm.assert_series_equal(result, expected)
332+
333+
pat = re.compile("^ba", flags=re.MULTILINE)
334+
result = ser.str.contains(pat)
335+
expected = Series([False, True, False], dtype=expected_dtype)
336+
tm.assert_series_equal(result, expected)
337+
338+
pat = re.compile("^ba", flags=re.MULTILINE | re.IGNORECASE)
339+
result = ser.str.contains(pat)
293340
expected = Series([False, True, True], dtype=expected_dtype)
294341
tm.assert_series_equal(result, expected)
295342

@@ -833,14 +880,36 @@ def test_match_case_kwarg(any_string_dtype):
833880

834881
def test_match_compiled_regex(any_string_dtype):
835882
# GH#61952
836-
values = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype)
837-
result = values.str.match(re.compile(r"ab"), case=False)
838883
expected_dtype = (
839884
np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
840885
)
886+
887+
values = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype)
888+
889+
result = values.str.match(re.compile("ab"))
890+
expected = Series([True, False, True, False], dtype=expected_dtype)
891+
tm.assert_series_equal(result, expected)
892+
893+
# TODO this currently works for pyarrow-backed dtypes but raises for python
894+
if any_string_dtype == "string" and any_string_dtype.storage == "pyarrow":
895+
result = values.str.match(re.compile("ab"), case=False)
896+
expected = Series([True, True, True, True], dtype=expected_dtype)
897+
tm.assert_series_equal(result, expected)
898+
else:
899+
with pytest.raises(
900+
ValueError, match="cannot process flags argument with a compiled pattern"
901+
):
902+
values.str.match(re.compile("ab"), case=False)
903+
904+
result = values.str.match(re.compile("ab", flags=re.IGNORECASE))
841905
expected = Series([True, True, True, True], dtype=expected_dtype)
842906
tm.assert_series_equal(result, expected)
843907

908+
with pytest.raises(
909+
ValueError, match="cannot process flags argument with a compiled pattern"
910+
):
911+
values.str.match(re.compile("ab"), flags=re.IGNORECASE)
912+
844913

845914
# --------------------------------------------------------------------------------------
846915
# str.fullmatch
@@ -913,14 +982,36 @@ def test_fullmatch_case_kwarg(any_string_dtype):
913982

914983
def test_fullmatch_compiled_regex(any_string_dtype):
915984
# GH#61952
916-
values = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype)
917-
result = values.str.fullmatch(re.compile(r"ab"), case=False)
918985
expected_dtype = (
919986
np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
920987
)
988+
989+
values = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype)
990+
991+
result = values.str.fullmatch(re.compile("ab"))
992+
expected = Series([True, False, False, False], dtype=expected_dtype)
993+
tm.assert_series_equal(result, expected)
994+
995+
# TODO this currently works for pyarrow-backed dtypes but raises for python
996+
if any_string_dtype == "string" and any_string_dtype.storage == "pyarrow":
997+
result = values.str.fullmatch(re.compile("ab"), case=False)
998+
expected = Series([True, True, False, False], dtype=expected_dtype)
999+
tm.assert_series_equal(result, expected)
1000+
else:
1001+
with pytest.raises(
1002+
ValueError, match="cannot process flags argument with a compiled pattern"
1003+
):
1004+
values.str.fullmatch(re.compile("ab"), case=False)
1005+
1006+
result = values.str.fullmatch(re.compile("ab", flags=re.IGNORECASE))
9211007
expected = Series([True, True, False, False], dtype=expected_dtype)
9221008
tm.assert_series_equal(result, expected)
9231009

1010+
with pytest.raises(
1011+
ValueError, match="cannot process flags argument with a compiled pattern"
1012+
):
1013+
values.str.fullmatch(re.compile("ab"), flags=re.IGNORECASE)
1014+
9241015

9251016
# --------------------------------------------------------------------------------------
9261017
# str.findall

0 commit comments

Comments
 (0)