Skip to content

Commit fd40f9a

Browse files
Backport PR #61073 on branch 2.3.x (BUG: fix bug in str.fullmatch for Arrow backend with optional groups) (#62401)
Co-authored-by: ptth222 <[email protected]>
1 parent 8be57bc commit fd40f9a

File tree

4 files changed

+54
-6
lines changed

4 files changed

+54
-6
lines changed

doc/source/whatsnew/v2.3.3.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ Bug fixes
2525
- Fix bug in :meth:`Series.str.replace` using named capture groups (e.g., ``\g<name>``) with the Arrow-backed dtype would raise an error (:issue:`57636`)
2626
- Fix regression in :meth:`~Series.str.contains`, :meth:`~Series.str.match` and :meth:`~Series.str.fullmatch`
2727
with a compiled regex and custom flags (:issue:`62240`)
28+
- Fix :meth:`Series.str.fullmatch` not matching patterns with groups correctly for the Arrow-backed string dtype (:issue:`61072`)
2829

2930
.. ---------------------------------------------------------------------------
3031
.. _whatsnew_233.contributors:

pandas/core/arrays/_arrow_string_mixins.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -327,8 +327,12 @@ def _str_fullmatch(
327327
flags: int = 0,
328328
na: Scalar | lib.NoDefault = lib.no_default,
329329
):
330-
if not pat.endswith("$") or pat.endswith("\\$"):
331-
pat = f"{pat}$"
330+
if (not pat.endswith("$") or pat.endswith("\\$")) and not pat.startswith("^"):
331+
pat = f"^({pat})$"
332+
elif not pat.endswith("$") or pat.endswith("\\$"):
333+
pat = f"^({pat[1:]})$"
334+
elif not pat.startswith("^"):
335+
pat = f"^({pat[0:-1]})$"
332336
return self._str_match(pat, case, flags, na)
333337

334338
def _str_find(self, sub: str, start: int = 0, end: int | None = None):

pandas/tests/extension/test_arrow.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1876,23 +1876,28 @@ def test_str_match(pat, case, na, exp):
18761876

18771877
@pytest.mark.parametrize(
18781878
"pat, case, na, exp",
1879+
# Note: keep cases in sync with
1880+
# pandas/tests/strings/test_find_replace.py::test_str_fullmatch_extra_cases
18791881
[
1880-
["abc", False, None, [True, True, False, None]],
1882+
["abc", False, None, [True, False, False, None]],
18811883
["Abc", True, None, [False, False, False, None]],
18821884
["bc", True, None, [False, False, False, None]],
1883-
["ab", False, None, [True, True, False, None]],
1884-
["a[a-z]{2}", False, None, [True, True, False, None]],
1885+
["ab", False, None, [False, False, False, None]],
1886+
["a[a-z]{2}", False, None, [True, False, False, None]],
18851887
["A[a-z]{1}", True, None, [False, False, False, None]],
18861888
# GH Issue: #56652
18871889
["abc$", False, None, [True, False, False, None]],
18881890
["abc\\$", False, None, [False, True, False, None]],
18891891
["Abc$", True, None, [False, False, False, None]],
18901892
["Abc\\$", True, None, [False, False, False, None]],
1893+
# https://github.com/pandas-dev/pandas/issues/61072
1894+
["(abc)|(abx)", True, None, [True, False, False, None]],
1895+
["((abc)|(abx))", True, None, [True, False, False, None]],
18911896
],
18921897
)
18931898
def test_str_fullmatch(pat, case, na, exp):
18941899
ser = pd.Series(["abc", "abc$", "$abc", None], dtype=ArrowDtype(pa.string()))
1895-
result = ser.str.match(pat, case=case, na=na)
1900+
result = ser.str.fullmatch(pat, case=case, na=na)
18961901
expected = pd.Series(exp, dtype=ArrowDtype(pa.bool_()))
18971902
tm.assert_series_equal(result, expected)
18981903

pandas/tests/strings/test_find_replace.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1079,6 +1079,44 @@ def test_fullmatch_compiled_regex(any_string_dtype):
10791079
values.str.fullmatch(re.compile("ab"), flags=re.IGNORECASE)
10801080

10811081

1082+
@pytest.mark.parametrize(
    "pat, case, na, exp",
    # Note: keep cases in sync with
    # pandas/tests/extension/test_arrow.py::test_str_fullmatch
    [
        ["abc", False, None, [True, False, False, None]],
        ["Abc", True, None, [False, False, False, None]],
        ["bc", True, None, [False, False, False, None]],
        ["ab", False, None, [False, False, False, None]],
        ["a[a-z]{2}", False, None, [True, False, False, None]],
        ["A[a-z]{1}", True, None, [False, False, False, None]],
        # GH Issue: #56652
        ["abc$", False, None, [True, False, False, None]],
        ["abc\\$", False, None, [False, True, False, None]],
        ["Abc$", True, None, [False, False, False, None]],
        ["Abc\\$", True, None, [False, False, False, None]],
        # https://github.com/pandas-dev/pandas/issues/61072
        ["(abc)|(abx)", True, None, [True, False, False, None]],
        ["((abc)|(abx))", True, None, [True, False, False, None]],
    ],
)
def test_str_fullmatch_extra_cases(any_string_dtype, pat, case, na, exp):
    """Check ``Series.str.fullmatch`` against each parametrized pattern.

    The input series is ``["abc", "abc$", "$abc", None]`` built with
    ``any_string_dtype``; ``exp`` holds the expected element-wise result
    for ``pat`` with the given ``case`` and ``na`` arguments.
    """
    ser = Series(["abc", "abc$", "$abc", None], dtype=any_string_dtype)
    result = ser.str.fullmatch(pat, case=case, na=na)

    # Work on a copy: pytest passes the very same list object to every
    # ``any_string_dtype`` variant of a case, so mutating it in place would
    # leak the "str"-specific expectation into the other dtype runs.
    exp = list(exp)

    if any_string_dtype == "str":
        # NaN propagates as False
        exp[-1] = False
        expected_dtype = bool
    else:
        expected_dtype = (
            "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
        )
    expected = Series(exp, dtype=expected_dtype)
    tm.assert_series_equal(result, expected)
1118+
1119+
10821120
# --------------------------------------------------------------------------------------
10831121
# str.findall
10841122
# --------------------------------------------------------------------------------------

0 commit comments

Comments
 (0)