Skip to content

Commit 8307c51

Browse files
author
Roline Stapny Saldanha
committed
Merge remote-tracking branch 'upstream/main' into sroline_issue_60923
2 parents fe66e04 + 36b8f20 commit 8307c51

File tree

8 files changed

+191
-13
lines changed

8 files changed

+191
-13
lines changed

doc/source/whatsnew/v2.3.2.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ Bug fixes
2525
- Fix :meth:`~DataFrame.to_json` with ``orient="table"`` to correctly use the
2626
"string" type in the JSON Table Schema for :class:`StringDtype` columns
2727
(:issue:`61889`)
28-
28+
- Boolean operations (``|``, ``&``, ``^``) with bool-dtype objects on the left and :class:`StringDtype` objects on the right now cast the string to bool, with a deprecation warning (:issue:`60234`)
2929

3030
.. ---------------------------------------------------------------------------
3131
.. _whatsnew_232.contributors:

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -854,6 +854,7 @@ Groupby/resample/rolling
854854
- Bug in :meth:`DataFrame.ewm` and :meth:`Series.ewm` when passed ``times`` and aggregation functions other than mean (:issue:`51695`)
855855
- Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` where the index name was not kept when the index had an :class:`ArrowDtype` timestamp dtype (:issue:`61222`)
856856
- Bug in :meth:`DataFrame.resample` changing index type to :class:`MultiIndex` when the dataframe is empty and using an upsample method (:issue:`55572`)
857+
- Bug in :meth:`DataFrameGroupBy.agg` and :meth:`SeriesGroupBy.agg` where the result had a NumPy dtype instead of a pyarrow dtype when the input values had a pyarrow dtype. (:issue:`53030`)
857858
- Bug in :meth:`DataFrameGroupBy.agg` that raises ``AttributeError`` when there is dictionary input and duplicated columns, instead of returning a DataFrame with the aggregation of all duplicate columns. (:issue:`55041`)
858859
- Bug in :meth:`DataFrameGroupBy.agg` where applying a user-defined function to an empty DataFrame returned a Series instead of an empty DataFrame. (:issue:`61503`)
859860
- Bug in :meth:`DataFrameGroupBy.apply` and :meth:`SeriesGroupBy.apply` where an empty DataFrame with ``group_keys=False`` still created the output index using the group keys. (:issue:`60471`)

pandas/core/arrays/arrow/array.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
overload,
1313
)
1414
import unicodedata
15+
import warnings
1516

1617
import numpy as np
1718

@@ -27,6 +28,7 @@
2728
pa_version_under13p0,
2829
)
2930
from pandas.util._decorators import doc
31+
from pandas.util._exceptions import find_stack_level
3032

3133
from pandas.core.dtypes.cast import (
3234
can_hold_element,
@@ -852,6 +854,25 @@ def _logical_method(self, other, op) -> Self:
852854
# integer types. Otherwise these are boolean ops.
853855
if pa.types.is_integer(self._pa_array.type):
854856
return self._evaluate_op_method(other, op, ARROW_BIT_WISE_FUNCS)
857+
elif (
858+
(
859+
pa.types.is_string(self._pa_array.type)
860+
or pa.types.is_large_string(self._pa_array.type)
861+
)
862+
and op in (roperator.ror_, roperator.rand_, roperator.rxor)
863+
and isinstance(other, np.ndarray)
864+
and other.dtype == bool
865+
):
866+
# GH#60234 backward compatibility for the move to StringDtype in 3.0
867+
op_name = op.__name__[1:].strip("_")
868+
warnings.warn(
869+
f"'{op_name}' operations between boolean dtype and {self.dtype} are "
870+
"deprecated and will raise in a future version. Explicitly "
871+
"cast the strings to a boolean dtype before operating instead.",
872+
FutureWarning,
873+
stacklevel=find_stack_level(),
874+
)
875+
return op(other, self.astype(bool))
855876
else:
856877
return self._evaluate_op_method(other, op, ARROW_LOGICAL_FUNCS)
857878

pandas/core/arrays/string_.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@
5252
missing,
5353
nanops,
5454
ops,
55+
roperator,
5556
)
5657
from pandas.core.algorithms import isin
5758
from pandas.core.array_algos import masked_reductions
@@ -390,6 +391,26 @@ class BaseStringArray(ExtensionArray):
390391

391392
dtype: StringDtype
392393

394+
# TODO(4.0): Once the deprecation here is enforced, this method can be
395+
# removed and we use the parent class method instead.
396+
def _logical_method(self, other, op):
397+
if (
398+
op in (roperator.ror_, roperator.rand_, roperator.rxor)
399+
and isinstance(other, np.ndarray)
400+
and other.dtype == bool
401+
):
402+
# GH#60234 backward compatibility for the move to StringDtype in 3.0
403+
op_name = op.__name__[1:].strip("_")
404+
warnings.warn(
405+
f"'{op_name}' operations between boolean dtype and {self.dtype} are "
406+
"deprecated and will raise in a future version. Explicitly "
407+
"cast the strings to a boolean dtype before operating instead.",
408+
FutureWarning,
409+
stacklevel=find_stack_level(),
410+
)
411+
return op(other, self.astype(bool))
412+
return NotImplemented
413+
393414
@doc(ExtensionArray.tolist)
394415
def tolist(self) -> list:
395416
if self.ndim > 1:

pandas/core/groupby/ops.py

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -44,13 +44,15 @@
4444
ensure_platform_int,
4545
ensure_uint64,
4646
is_1d_only_ea_dtype,
47+
is_string_dtype,
4748
)
4849
from pandas.core.dtypes.missing import (
4950
isna,
5051
maybe_fill,
5152
)
5253

5354
from pandas.core.arrays import Categorical
55+
from pandas.core.arrays.arrow.array import ArrowExtensionArray
5456
from pandas.core.frame import DataFrame
5557
from pandas.core.groupby import grouper
5658
from pandas.core.indexes.api import (
@@ -963,18 +965,26 @@ def agg_series(
963965
-------
964966
np.ndarray or ExtensionArray
965967
"""
968+
result = self._aggregate_series_pure_python(obj, func)
969+
npvalues = lib.maybe_convert_objects(result, try_float=False)
970+
971+
if isinstance(obj._values, ArrowExtensionArray):
972+
# When obj.dtype is a string, any object can be cast. Only do so if the
973+
# UDF returned strings or NA values.
974+
if not is_string_dtype(obj.dtype) or lib.is_string_array(
975+
npvalues, skipna=True
976+
):
977+
out = maybe_cast_pointwise_result(
978+
npvalues, obj.dtype, numeric_only=True, same_dtype=preserve_dtype
979+
)
980+
else:
981+
out = npvalues
966982

967-
if not isinstance(obj._values, np.ndarray):
983+
elif not isinstance(obj._values, np.ndarray):
968984
# we can preserve a little bit more aggressively with EA dtype
969985
# because maybe_cast_pointwise_result will do a try/except
970986
# with _from_sequence. NB we are assuming here that _from_sequence
971987
# is sufficiently strict that it casts appropriately.
972-
preserve_dtype = True
973-
974-
result = self._aggregate_series_pure_python(obj, func)
975-
976-
npvalues = lib.maybe_convert_objects(result, try_float=False)
977-
if preserve_dtype:
978988
out = maybe_cast_pointwise_result(npvalues, obj.dtype, numeric_only=True)
979989
else:
980990
out = npvalues

pandas/tests/groupby/aggregate/test_aggregate.py

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import pytest
1111

1212
from pandas.errors import SpecificationError
13+
import pandas.util._test_decorators as td
1314

1415
from pandas.core.dtypes.common import is_integer_dtype
1516

@@ -23,6 +24,7 @@
2324
to_datetime,
2425
)
2526
import pandas._testing as tm
27+
from pandas.arrays import ArrowExtensionArray
2628
from pandas.core.groupby.grouper import Grouping
2729

2830

@@ -1809,6 +1811,102 @@ def test_groupby_aggregation_func_list_multi_index_duplicate_columns():
18091811
tm.assert_frame_equal(result, expected)
18101812

18111813

1814+
@td.skip_if_no("pyarrow")
1815+
@pytest.mark.parametrize(
1816+
"input_dtype, output_dtype",
1817+
[
1818+
# With NumPy arrays, the results from the UDF would be e.g. np.float32 scalars
1819+
# which we can therefore preserve. However with PyArrow arrays, the results are
1820+
# Python scalars so we have no information about size or uint vs int.
1821+
("float[pyarrow]", "double[pyarrow]"),
1822+
("int64[pyarrow]", "int64[pyarrow]"),
1823+
("uint64[pyarrow]", "int64[pyarrow]"),
1824+
("bool[pyarrow]", "bool[pyarrow]"),
1825+
],
1826+
)
1827+
def test_agg_lambda_pyarrow_dtype_conversion(input_dtype, output_dtype):
1828+
# GH#59601
1829+
# Test PyArrow dtype conversion back to PyArrow dtype
1830+
df = DataFrame(
1831+
{
1832+
"A": ["c1", "c2", "c3", "c1", "c2", "c3"],
1833+
"B": pd.array([100, 200, 255, 0, 199, 40392], dtype=input_dtype),
1834+
}
1835+
)
1836+
gb = df.groupby("A")
1837+
result = gb.agg(lambda x: x.min())
1838+
1839+
expected = DataFrame(
1840+
{"B": pd.array([0, 199, 255], dtype=output_dtype)},
1841+
index=Index(["c1", "c2", "c3"], name="A"),
1842+
)
1843+
tm.assert_frame_equal(result, expected)
1844+
1845+
1846+
@td.skip_if_no("pyarrow")
1847+
def test_agg_lambda_complex128_dtype_conversion():
1848+
# GH#59601
1849+
df = DataFrame(
1850+
{"A": ["c1", "c2", "c3"], "B": pd.array([100, 200, 255], "int64[pyarrow]")}
1851+
)
1852+
gb = df.groupby("A")
1853+
result = gb.agg(lambda x: complex(x.sum(), x.count()))
1854+
1855+
expected = DataFrame(
1856+
{
1857+
"B": pd.array(
1858+
[complex(100, 1), complex(200, 1), complex(255, 1)], dtype="complex128"
1859+
),
1860+
},
1861+
index=Index(["c1", "c2", "c3"], name="A"),
1862+
)
1863+
tm.assert_frame_equal(result, expected)
1864+
1865+
1866+
@td.skip_if_no("pyarrow")
1867+
def test_agg_lambda_numpy_uint64_to_pyarrow_dtype_conversion():
1868+
# GH#59601
1869+
df = DataFrame(
1870+
{
1871+
"A": ["c1", "c2", "c3"],
1872+
"B": pd.array([100, 200, 255], dtype="uint64[pyarrow]"),
1873+
}
1874+
)
1875+
gb = df.groupby("A")
1876+
result = gb.agg(lambda x: np.uint64(x.sum()))
1877+
1878+
expected = DataFrame(
1879+
{
1880+
"B": pd.array([100, 200, 255], dtype="uint64[pyarrow]"),
1881+
},
1882+
index=Index(["c1", "c2", "c3"], name="A"),
1883+
)
1884+
tm.assert_frame_equal(result, expected)
1885+
1886+
1887+
@td.skip_if_no("pyarrow")
1888+
def test_agg_lambda_pyarrow_struct_to_object_dtype_conversion():
1889+
# GH#59601
1890+
import pyarrow as pa
1891+
1892+
df = DataFrame(
1893+
{
1894+
"A": ["c1", "c2", "c3"],
1895+
"B": pd.array([100, 200, 255], dtype="int64[pyarrow]"),
1896+
}
1897+
)
1898+
gb = df.groupby("A")
1899+
result = gb.agg(lambda x: {"number": 1})
1900+
1901+
arr = pa.array([{"number": 1}, {"number": 1}, {"number": 1}])
1902+
expected = DataFrame(
1903+
{"B": ArrowExtensionArray(arr)},
1904+
index=Index(["c1", "c2", "c3"], name="A"),
1905+
)
1906+
1907+
tm.assert_frame_equal(result, expected)
1908+
1909+
18121910
def test_groupby_aggregate_empty_builtin_sum():
18131911
df = DataFrame(columns=["Group", "Data"])
18141912
result = df.groupby(["Group"], as_index=False)["Data"].agg("sum")

pandas/tests/groupby/test_groupby.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2434,25 +2434,28 @@ def test_rolling_wrong_param_min_period():
24342434

24352435
def test_by_column_values_with_same_starting_value(any_string_dtype):
24362436
# GH29635
2437+
dtype = any_string_dtype
24372438
df = DataFrame(
24382439
{
24392440
"Name": ["Thomas", "Thomas", "Thomas John"],
24402441
"Credit": [1200, 1300, 900],
2441-
"Mood": Series(["sad", "happy", "happy"], dtype=any_string_dtype),
2442+
"Mood": Series(["sad", "happy", "happy"], dtype=dtype),
24422443
}
24432444
)
24442445
aggregate_details = {"Mood": Series.mode, "Credit": "sum"}
24452446

24462447
result = df.groupby(["Name"]).agg(aggregate_details)
2447-
expected_result = DataFrame(
2448+
expected = DataFrame(
24482449
{
24492450
"Mood": [["happy", "sad"], "happy"],
24502451
"Credit": [2500, 900],
24512452
"Name": ["Thomas", "Thomas John"],
2452-
}
2453+
},
24532454
).set_index("Name")
2454-
2455-
tm.assert_frame_equal(result, expected_result)
2455+
if getattr(dtype, "storage", None) == "pyarrow":
2456+
mood_values = pd.array(["happy", "sad"], dtype=dtype)
2457+
expected["Mood"] = [mood_values, "happy"]
2458+
tm.assert_frame_equal(result, expected)
24562459

24572460

24582461
def test_groupby_none_in_first_mi_level():

pandas/tests/strings/test_strings.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -787,3 +787,27 @@ def test_decode_with_dtype_none():
787787
result = ser.str.decode("utf-8", dtype=None)
788788
expected = Series(["a", "b", "c"], dtype="str")
789789
tm.assert_series_equal(result, expected)
790+
791+
792+
def test_reversed_logical_ops(any_string_dtype):
793+
# GH#60234
794+
dtype = any_string_dtype
795+
warn = None if dtype == object else FutureWarning
796+
left = Series([True, False, False, True])
797+
right = Series(["", "", "b", "c"], dtype=dtype)
798+
799+
msg = "operations between boolean dtype and"
800+
with tm.assert_produces_warning(warn, match=msg):
801+
result = left | right
802+
expected = left | right.astype(bool)
803+
tm.assert_series_equal(result, expected)
804+
805+
with tm.assert_produces_warning(warn, match=msg):
806+
result = left & right
807+
expected = left & right.astype(bool)
808+
tm.assert_series_equal(result, expected)
809+
810+
with tm.assert_produces_warning(warn, match=msg):
811+
result = left ^ right
812+
expected = left ^ right.astype(bool)
813+
tm.assert_series_equal(result, expected)

0 commit comments

Comments
 (0)