Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions pandas/core/arrays/string_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -697,9 +697,9 @@ def _reduce(
self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
):
if name in ["any", "all"]:
if not skipna and name == "all":
nas = pc.invert(pc.is_null(self._pa_array))
arr = pc.and_kleene(nas, pc.not_equal(self._pa_array, ""))
if not skipna:
nas = pc.is_null(self._pa_array)
arr = pc.or_kleene(nas, pc.not_equal(self._pa_array, ""))
Comment on lines -701 to +702
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Previously, when converting the string array to a boolean array, a value was considered True if it was "not empty (`""`) and not null"; this diff changes that to "not empty (`""`) or null", i.e. null values are now treated as truthy.

else:
arr = pc.not_equal(self._pa_array, "")
return ArrowExtensionArray(arr)._reduce(
Expand Down
51 changes: 44 additions & 7 deletions pandas/tests/reductions/test_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -1062,25 +1062,62 @@ def test_any_all_datetimelike(self):
assert df.any().all()
assert not df.all().any()

def test_any_all_pyarrow_string(self):
def test_any_all_string_dtype(self, any_string_dtype):
# GH#54591
pytest.importorskip("pyarrow")
ser = Series(["", "a"], dtype="string[pyarrow_numpy]")
if (
isinstance(any_string_dtype, pd.StringDtype)
and any_string_dtype.na_value is pd.NA
):
# the nullable string dtype currently still raises an error
# https://github.com/pandas-dev/pandas/issues/51939
ser = Series(["a", "b"], dtype=any_string_dtype)
with pytest.raises(TypeError):
ser.any()
with pytest.raises(TypeError):
ser.all()
return

ser = Series(["", "a"], dtype=any_string_dtype)
assert ser.any()
assert not ser.all()
assert ser.any(skipna=False)
assert not ser.all(skipna=False)

ser = Series([None, "a"], dtype="string[pyarrow_numpy]")
ser = Series([np.nan, "a"], dtype=any_string_dtype)
assert ser.any()
assert ser.all()
assert not ser.all(skipna=False)
assert ser.any(skipna=False)
assert ser.all(skipna=False) # NaN is considered truthy
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The test cases that I have marked with the comment `# NaN is considered truthy` are the ones where the result would change from True to False if NaN were considered falsy instead.


ser = Series([None, ""], dtype="string[pyarrow_numpy]")
ser = Series([np.nan, ""], dtype=any_string_dtype)
assert not ser.any()
assert not ser.all()
assert ser.any(skipna=False) # NaN is considered truthy
assert not ser.all(skipna=False)

ser = Series(["a", "b"], dtype="string[pyarrow_numpy]")
ser = Series(["a", "b"], dtype=any_string_dtype)
assert ser.any()
assert ser.all()
assert ser.any(skipna=False)
assert ser.all(skipna=False)

ser = Series([], dtype=any_string_dtype)
assert not ser.any()
assert ser.all()
assert not ser.any(skipna=False)
assert ser.all(skipna=False)

ser = Series([""], dtype=any_string_dtype)
assert not ser.any()
assert not ser.all()
assert not ser.any(skipna=False)
assert not ser.all(skipna=False)

ser = Series([np.nan], dtype=any_string_dtype)
assert not ser.any()
assert ser.all()
assert ser.any(skipna=False) # NaN is considered truthy
assert ser.all(skipna=False) # NaN is considered truthy

def test_timedelta64_analytics(self):
# index min/max
Expand Down