Skip to content

Commit adb39e4

Browse files
authored
BUG: ser.str.match with mismatched case/pat/flags (#63108)
1 parent c6ca221 commit adb39e4

File tree

4 files changed

+66
-15
lines changed

4 files changed

+66
-15
lines changed

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1118,6 +1118,7 @@ Conversion
11181118

11191119
Strings
11201120
^^^^^^^
1121+
- Bug in :meth:`Series.str.match` failing to raise when given a compiled ``re.Pattern`` object and conflicting ``case`` or ``flags`` arguments (:issue:`62240`)
11211122
- Bug in :meth:`Series.str.replace` raising an error on valid group references (``\1``, ``\2``, etc.) on series converted to PyArrow backend dtype (:issue:`62653`)
11221123
- Bug in :meth:`Series.str.zfill` raising ``AttributeError`` for :class:`ArrowDtype` (:issue:`61485`)
11231124
- Bug in :meth:`Series.value_counts` would not respect ``sort=False`` for series having ``string`` dtype (:issue:`55224`)

pandas/core/strings/accessor.py

Lines changed: 40 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1351,7 +1351,13 @@ def contains(
13511351
return self._wrap_result(result, fill_value=na, returns_string=False)
13521352

13531353
@forbid_nonstring_types(["bytes"])
1354-
def match(self, pat: str, case: bool = True, flags: int = 0, na=lib.no_default):
1354+
def match(
1355+
self,
1356+
pat: str | re.Pattern,
1357+
case: bool | lib.NoDefault = lib.no_default,
1358+
flags: int | lib.NoDefault = lib.no_default,
1359+
na=lib.no_default,
1360+
):
13551361
"""
13561362
Determine if each string starts with a match of a regular expression.
13571363
@@ -1397,6 +1403,39 @@ def match(self, pat: str, case: bool = True, flags: int = 0, na=lib.no_default):
13971403
2 False
13981404
dtype: bool
13991405
"""
1406+
if flags is not lib.no_default:
1407+
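# pat.flags normally includes re.U for str patterns, so OR it into the
1408+
# user-supplied flags before comparing them for equality with pat.flags.
# NOTE(review): a pattern compiled with re.ASCII does not carry re.U - confirm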
1409+
flags = flags | re.U
1410+
if is_re(pat):
1411+
if pat.flags != flags:
1412+
raise ValueError(
1413+
"Cannot both specify 'flags' and pass a compiled regexp "
1414+
"object with conflicting flags"
1415+
)
1416+
else:
1417+
pat = re.compile(pat, flags=flags)
1418+
# pat is now compiled and already carries these flags; reset flags to 0 so
1419+
# that any later re.compile(pat, flags=flags) on it does not raise.
1420+
flags = 0
1421+
else:
1422+
flags = 0
1423+
1424+
if case is lib.no_default:
1425+
if is_re(pat):
1426+
case = not bool(pat.flags & re.IGNORECASE)
1427+
else:
1428+
# Case-sensitive default
1429+
case = True
1430+
elif is_re(pat):
1431+
implicit_case = not bool(pat.flags & re.IGNORECASE)
1432+
if implicit_case != case:
1433+
# GH#62240
1434+
raise ValueError(
1435+
"Cannot both specify 'case' and pass a compiled regexp "
1436+
"object with conflicting case-sensitivity"
1437+
)
1438+
14001439
result = self._data.array._str_match(pat, case=case, flags=flags, na=na)
14011440
return self._wrap_result(result, fill_value=na, returns_string=False)
14021441

pandas/core/strings/object_array.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -238,7 +238,17 @@ def _str_match(
238238
if not case:
239239
flags |= re.IGNORECASE
240240

241-
regex = re.compile(pat, flags=flags)
241+
if isinstance(pat, re.Pattern):
242+
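# We need to check that flags is equal to pat.flags.
243+
# pat.flags normally includes re.U for str patterns, so OR it in here
244+
# before comparing flags with pat.flags (this is an equality check, not a regex match).
# NOTE(review): a pattern compiled with re.ASCII does not carry re.U - confirm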
245+
flags = flags | re.U
246+
247+
if flags != pat.flags:
248+
raise ValueError("Cannot pass flags that do not match pat.flags")
249+
regex = pat
250+
else:
251+
regex = re.compile(pat, flags=flags)
242252

243253
f = lambda x: regex.match(x) is not None
244254
return self._str_map(f, na_value=na, dtype=np.dtype(bool))

pandas/tests/strings/test_find_replace.py

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1004,26 +1004,27 @@ def test_match_compiled_regex(any_string_dtype):
10041004
expected = Series([True, False, True, False], dtype=expected_dtype)
10051005
tm.assert_series_equal(result, expected)
10061006

1007-
# TODO this currently works for pyarrow-backed dtypes but raises for python
1008-
if any_string_dtype == "string" and any_string_dtype.storage == "pyarrow":
1009-
result = values.str.match(re.compile("ab"), case=False)
1010-
expected = Series([True, True, True, True], dtype=expected_dtype)
1011-
tm.assert_series_equal(result, expected)
1012-
else:
1013-
with pytest.raises(
1014-
ValueError, match="cannot process flags argument with a compiled pattern"
1015-
):
1016-
values.str.match(re.compile("ab"), case=False)
1007+
msg = (
1008+
"Cannot both specify 'case' and pass a compiled "
1009+
"regexp object with conflicting case-sensitivity"
1010+
)
1011+
with pytest.raises(ValueError, match=msg):
1012+
values.str.match(re.compile("ab"), case=False)
10171013

10181014
result = values.str.match(re.compile("ab", flags=re.IGNORECASE))
10191015
expected = Series([True, True, True, True], dtype=expected_dtype)
10201016
tm.assert_series_equal(result, expected)
10211017

1022-
with pytest.raises(
1023-
ValueError, match="cannot process flags argument with a compiled pattern"
1024-
):
1018+
msg = (
1019+
"Cannot both specify 'flags' and pass a compiled "
1020+
"regexp object with conflicting flags"
1021+
)
1022+
with pytest.raises(ValueError, match=msg):
10251023
values.str.match(re.compile("ab"), flags=re.IGNORECASE)
10261024

1025+
# But if the flags match you're OK
1026+
values.str.match(re.compile("ab", flags=re.IGNORECASE), flags=re.IGNORECASE)
1027+
10271028

10281029
@pytest.mark.parametrize(
10291030
"pat, case, exp",

0 commit comments

Comments
 (0)