Skip to content

Commit 3cefa1e

Browse files
khemkaran10, Khemkaran, and jorisvandenbossche
authored
BUG: fix Series.str.fullmatch() and Series.str.match() with a compiled regex failing with arrow strings (#61964)
Co-authored-by: Khemkaran <[email protected]> Co-authored-by: Joris Van den Bossche <[email protected]>
1 parent 5774290 commit 3cefa1e

File tree

5 files changed

+41
-9
lines changed

5 files changed

+41
-9
lines changed

doc/source/whatsnew/v2.3.2.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ Bug fixes
2626
"string" type in the JSON Table Schema for :class:`StringDtype` columns
2727
(:issue:`61889`)
2828
- Boolean operations (``|``, ``&``, ``^``) with bool-dtype objects on the left and :class:`StringDtype` objects on the right now cast the string to bool, with a deprecation warning (:issue:`60234`)
29+
- Fixed :meth:`~Series.str.match` and :meth:`~Series.str.fullmatch` failing with a compiled regex
30+
for the Arrow-backed string dtype (:issue:`61964`)
2931

3032
.. ---------------------------------------------------------------------------
3133
.. _whatsnew_232.contributors:

pandas/core/arrays/_arrow_string_mixins.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -302,23 +302,29 @@ def _str_contains(
302302

303303
def _str_match(
304304
self,
305-
pat: str,
305+
pat: str | re.Pattern,
306306
case: bool = True,
307307
flags: int = 0,
308308
na: Scalar | lib.NoDefault = lib.no_default,
309309
):
310-
if not pat.startswith("^"):
310+
if isinstance(pat, re.Pattern):
311+
# GH#61952
312+
pat = pat.pattern
313+
if isinstance(pat, str) and not pat.startswith("^"):
311314
pat = f"^{pat}"
312315
return self._str_contains(pat, case, flags, na, regex=True)
313316

314317
def _str_fullmatch(
315318
self,
316-
pat,
319+
pat: str | re.Pattern,
317320
case: bool = True,
318321
flags: int = 0,
319322
na: Scalar | lib.NoDefault = lib.no_default,
320323
):
321-
if not pat.endswith("$") or pat.endswith("\\$"):
324+
if isinstance(pat, re.Pattern):
325+
# GH#61952
326+
pat = pat.pattern
327+
if isinstance(pat, str) and (not pat.endswith("$") or pat.endswith("\\$")):
322328
pat = f"{pat}$"
323329
return self._str_match(pat, case, flags, na)
324330

pandas/core/strings/accessor.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1361,8 +1361,8 @@ def match(self, pat: str, case: bool = True, flags: int = 0, na=lib.no_default):
13611361
13621362
Parameters
13631363
----------
1364-
pat : str
1365-
Character sequence.
1364+
pat : str or compiled regex
1365+
Character sequence or regular expression.
13661366
case : bool, default True
13671367
If True, case sensitive.
13681368
flags : int, default 0 (no flags)

pandas/core/strings/object_array.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -248,14 +248,15 @@ def rep(x, r):
248248

249249
def _str_match(
250250
self,
251-
pat: str,
251+
pat: str | re.Pattern,
252252
case: bool = True,
253253
flags: int = 0,
254254
na: Scalar | lib.NoDefault = lib.no_default,
255255
):
256256
if not case:
257257
flags |= re.IGNORECASE
258-
258+
if isinstance(pat, re.Pattern):
259+
pat = pat.pattern
259260
regex = re.compile(pat, flags=flags)
260261

261262
f = lambda x: regex.match(x) is not None
@@ -270,7 +271,8 @@ def _str_fullmatch(
270271
):
271272
if not case:
272273
flags |= re.IGNORECASE
273-
274+
if isinstance(pat, re.Pattern):
275+
pat = pat.pattern
274276
regex = re.compile(pat, flags=flags)
275277

276278
f = lambda x: regex.fullmatch(x) is not None

pandas/tests/strings/test_find_replace.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -818,6 +818,17 @@ def test_match_case_kwarg(any_string_dtype):
818818
tm.assert_series_equal(result, expected)
819819

820820

821+
def test_match_compiled_regex(any_string_dtype):
822+
# GH#61952
823+
values = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype)
824+
result = values.str.match(re.compile(r"ab"), case=False)
825+
expected_dtype = (
826+
np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
827+
)
828+
expected = Series([True, True, True, True], dtype=expected_dtype)
829+
tm.assert_series_equal(result, expected)
830+
831+
821832
# --------------------------------------------------------------------------------------
822833
# str.fullmatch
823834
# --------------------------------------------------------------------------------------
@@ -887,6 +898,17 @@ def test_fullmatch_case_kwarg(any_string_dtype):
887898
tm.assert_series_equal(result, expected)
888899

889900

901+
def test_fullmatch_compiled_regex(any_string_dtype):
902+
# GH#61952
903+
values = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype)
904+
result = values.str.fullmatch(re.compile(r"ab"), case=False)
905+
expected_dtype = (
906+
np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
907+
)
908+
expected = Series([True, True, False, False], dtype=expected_dtype)
909+
tm.assert_series_equal(result, expected)
910+
911+
890912
# --------------------------------------------------------------------------------------
891913
# str.findall
892914
# --------------------------------------------------------------------------------------

0 commit comments

Comments
 (0)