Skip to content

Fix #61072: inconsistent fullmatch results with regex alternation #61343

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 10 commits into from
Closed
58 changes: 50 additions & 8 deletions pandas/core/strings/accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -1462,15 +1462,57 @@ def fullmatch(self, pat, case: bool = True, flags: int = 0, na=lib.no_default):
matches the regular expression.
extract : Extract matched groups.

Examples
--------
>>> ser = pd.Series(["cat", "duck", "dove"])
>>> ser.str.fullmatch(r"d.+")
0 False
1 True
2 True
dtype: bool
Notes
-----
This method enforces consistent behavior between Python's string dtype
and PyArrow-backed string arrays when using regular expressions
containing alternation (|). For regex patterns with alternation operators,
the method ensures proper grouping by wrapping the pattern in parentheses
when using PyArrow-backed string arrays.
"""
is_pyarrow = False
arr = self._data.array
arr_type = type(arr).__name__
is_pyarrow = arr_type == "ArrowStringArray"
if not is_pyarrow:
is_pyarrow = "Arrow" in arr_type
if not is_pyarrow and hasattr(arr, "dtype"):
dtype_str = str(arr.dtype)
is_pyarrow = (
"pyarrow" in dtype_str.lower() or "arrow" in dtype_str.lower()
)
if is_pyarrow and "|" in pat:

def _is_fully_wrapped(pattern):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This appears to be an improper solution to me. Can you explain why the current code path fails?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have already fixed the error in the pull request, why do u think is it a improper solution?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You're introducing character-by-character parsing with a lot of branching logic.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the review. From what I can tell, the issue seems to be caused by how pyarrow handles regex alternation. Do you think there’s anything I should do on the pandas side, or is this something that should be fixed upstream?

if not (pattern.startswith("(") and pattern.endswith(")")):
return False
inner = pattern[1:-1]
level = 0
escape = False
in_char_class = False
for char in inner:
if escape:
escape = False
continue
if char == "\\":
escape = True
elif not in_char_class and char == "[":
in_char_class = True
elif in_char_class and char == "]":
in_char_class = False
elif not in_char_class:
if char == "(":
level += 1
elif char == ")":
if level == 0:
return False
level -= 1
return level == 0

if not (
pat.startswith("(") and pat.endswith(")") and _is_fully_wrapped(pat)
):
pat = f"({pat})"
result = self._data.array._str_fullmatch(pat, case=case, flags=flags, na=na)
return self._wrap_result(result, fill_value=na, returns_string=False)

Expand Down
36 changes: 36 additions & 0 deletions pandas/tests/strings/test_pyarrow_format_behavior.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import pytest

from pandas import Series


@pytest.mark.parametrize("dtype", [str])
def test_string_array(dtype):
test_series = Series(["asdf", "as"], dtype=dtype)
regex = r"((as)|(as))"
regex2 = r"(as)|(as)"
assert list(test_series.str.fullmatch(regex)) == [False, True]
assert list(test_series.str.fullmatch(regex2)) == [False, True]


@pytest.mark.parametrize(
"data, pattern, expected",
[
(["cat", "duck", "dove"], r"d.+", [False, True, True]),
],
)
def test_string_match(data, pattern, expected):
ser = Series(data)
assert list(ser.str.fullmatch(pattern)) == expected


@pytest.mark.parametrize("dtype", [str])
@pytest.mark.parametrize(
"pattern, expected",
[
(r"(foo)|((as)(df)?)", [True, True, True]),
("foo|as", [False, True, True]),
],
)
def test_string_alternation_patterns(dtype, pattern, expected):
ser = Series(["asdf", "foo", "as"], dtype=dtype)
assert list(ser.str.fullmatch(pattern)) == expected
Loading