diff --git a/doc/source/whatsnew/v2.3.3.rst b/doc/source/whatsnew/v2.3.3.rst
index aaed7544d9975..1184835ff3a1a 100644
--- a/doc/source/whatsnew/v2.3.3.rst
+++ b/doc/source/whatsnew/v2.3.3.rst
@@ -25,6 +25,7 @@ Bug fixes
 
 - Fix bug in :meth:`Series.str.replace` using named capture groups (e.g., ``\g``) with the Arrow-backed dtype would raise an error (:issue:`57636`)
 - Fix regression in ``~Series.str.contains``, ``~Series.str.match`` and ``~Series.str.fullmatch`` with a compiled regex and custom flags (:issue:`62240`)
+- Fix :meth:`Series.str.fullmatch` not matching patterns with groups correctly for the Arrow-backed string dtype (:issue:`61072`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_233.contributors:
diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py
index 74c59dd465b52..f6867934327b0 100644
--- a/pandas/core/arrays/_arrow_string_mixins.py
+++ b/pandas/core/arrays/_arrow_string_mixins.py
@@ -327,8 +327,17 @@ def _str_fullmatch(
         flags: int = 0,
         na: Scalar | lib.NoDefault = lib.no_default,
     ):
-        if not pat.endswith("$") or pat.endswith("\\$"):
-            pat = f"{pat}$"
+        # Anchor the pattern at both ends, wrapping it in a group so the anchors
+        # bind to the whole pattern rather than to a single alternation branch.
+        # An anchor the caller already supplied is stripped before wrapping.
+        # NOTE(review): a pattern that already has both anchors (e.g. "^a|b$")
+        # is passed through unwrapped.
+        if (not pat.endswith("$") or pat.endswith("\\$")) and not pat.startswith("^"):
+            pat = f"^({pat})$"
+        elif not pat.endswith("$") or pat.endswith("\\$"):
+            pat = f"^({pat[1:]})$"
+        elif not pat.startswith("^"):
+            pat = f"^({pat[0:-1]})$"
         return self._str_match(pat, case, flags, na)
 
     def _str_find(self, sub: str, start: int = 0, end: int | None = None):
diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
index 820425ab51949..479497497a2ea 100644
--- a/pandas/tests/extension/test_arrow.py
+++ b/pandas/tests/extension/test_arrow.py
@@ -1876,23 +1876,28 @@ def test_str_match(pat, case, na, exp):
 
 @pytest.mark.parametrize(
     "pat, case, na, exp",
+    # Note: keep cases in sync with
+    # pandas/tests/strings/test_find_replace.py::test_str_fullmatch_extra_cases
     [
-        ["abc", False, None, [True, True, False, None]],
+        ["abc", False, None, [True, False, False, None]],
         ["Abc", True, None, [False, False, False, None]],
         ["bc", True, None, [False, False, False, None]],
-        ["ab", False, None, [True, True, False, None]],
-        ["a[a-z]{2}", False, None, [True, True, False, None]],
+        ["ab", False, None, [False, False, False, None]],
+        ["a[a-z]{2}", False, None, [True, False, False, None]],
         ["A[a-z]{1}", True, None, [False, False, False, None]],
         # GH Issue: #56652
         ["abc$", False, None, [True, False, False, None]],
         ["abc\\$", False, None, [False, True, False, None]],
         ["Abc$", True, None, [False, False, False, None]],
         ["Abc\\$", True, None, [False, False, False, None]],
+        # https://github.com/pandas-dev/pandas/issues/61072
+        ["(abc)|(abx)", True, None, [True, False, False, None]],
+        ["((abc)|(abx))", True, None, [True, False, False, None]],
     ],
 )
 def test_str_fullmatch(pat, case, na, exp):
     ser = pd.Series(["abc", "abc$", "$abc", None], dtype=ArrowDtype(pa.string()))
-    result = ser.str.match(pat, case=case, na=na)
+    result = ser.str.fullmatch(pat, case=case, na=na)
     expected = pd.Series(exp, dtype=ArrowDtype(pa.bool_()))
     tm.assert_series_equal(result, expected)
 
diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py
index be59bc195b387..c0dbe9cf8ac5f 100644
--- a/pandas/tests/strings/test_find_replace.py
+++ b/pandas/tests/strings/test_find_replace.py
@@ -1079,6 +1079,44 @@ def test_fullmatch_compiled_regex(any_string_dtype):
         values.str.fullmatch(re.compile("ab"), flags=re.IGNORECASE)
 
 
+@pytest.mark.parametrize(
+    "pat, case, na, exp",
+    # Note: keep cases in sync with
+    # pandas/tests/extension/test_arrow.py::test_str_fullmatch
+    [
+        ["abc", False, None, [True, False, False, None]],
+        ["Abc", True, None, [False, False, False, None]],
+        ["bc", True, None, [False, False, False, None]],
+        ["ab", False, None, [False, False, False, None]],
+        ["a[a-z]{2}", False, None, [True, False, False, None]],
+        ["A[a-z]{1}", True, None, [False, False, False, None]],
+        # GH Issue: #56652
+        ["abc$", False, None, [True, False, False, None]],
+        ["abc\\$", False, None, [False, True, False, None]],
+        ["Abc$", True, None, [False, False, False, None]],
+        ["Abc\\$", True, None, [False, False, False, None]],
+        # https://github.com/pandas-dev/pandas/issues/61072
+        ["(abc)|(abx)", True, None, [True, False, False, None]],
+        ["((abc)|(abx))", True, None, [True, False, False, None]],
+    ],
+)
+def test_str_fullmatch_extra_cases(any_string_dtype, pat, case, na, exp):
+    ser = Series(["abc", "abc$", "$abc", None], dtype=any_string_dtype)
+    result = ser.str.fullmatch(pat, case=case, na=na)
+
+    if any_string_dtype == "str":
+        # NaN propagates as False. Build a new list instead of assigning to
+        # exp[-1]: pytest reuses the same list object for every dtype variant.
+        exp = [*exp[:-1], False]
+        expected_dtype = bool
+    else:
+        expected_dtype = (
+            "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
+        )
+    expected = Series(exp, dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+
 # --------------------------------------------------------------------------------------
 # str.findall
 # --------------------------------------------------------------------------------------