Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 84 additions & 1 deletion pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -1977,7 +1977,10 @@ def _reduce_pyarrow(self, name: str, *, skipna: bool = True, **kwargs) -> pa.Sca

data_to_reduce = self._pa_array

if name in ["any", "all"] and (
if name == "count":
return super().count()

elif name in ["any", "all"] and (
pa.types.is_integer(pa_type)
or pa.types.is_floating(pa_type)
or pa.types.is_duration(pa_type)
Expand Down Expand Up @@ -2156,6 +2159,86 @@ def _reduce_calc(
else:
return pa_result

def sum(self, *, skipna: bool = True, **kwargs):
    """
    Reduce the array with the "sum" operation.

    Parameters
    ----------
    skipna : bool, default True
        Forwarded unchanged to ``self._reduce``.
    **kwargs
        Extra keyword arguments forwarded unchanged to ``self._reduce``.

    Returns
    -------
    Whatever ``self._reduce("sum", ...)`` returns.
    """
    reduce_kwargs = {"skipna": skipna, **kwargs}
    return self._reduce("sum", **reduce_kwargs)

def min(self, *, skipna: bool = True, **kwargs):
    """
    Reduce the array with the "min" operation.

    Parameters
    ----------
    skipna : bool, default True
        Forwarded unchanged to ``self._reduce``.
    **kwargs
        Extra keyword arguments forwarded unchanged to ``self._reduce``.

    Returns
    -------
    Whatever ``self._reduce("min", ...)`` returns.
    """
    reduce_kwargs = {"skipna": skipna, **kwargs}
    return self._reduce("min", **reduce_kwargs)

def max(self, *, skipna: bool = True, **kwargs):
    """
    Reduce the array with the "max" operation.

    Parameters
    ----------
    skipna : bool, default True
        Forwarded unchanged to ``self._reduce``.
    **kwargs
        Extra keyword arguments forwarded unchanged to ``self._reduce``.

    Returns
    -------
    Whatever ``self._reduce("max", ...)`` returns.
    """
    reduce_kwargs = {"skipna": skipna, **kwargs}
    return self._reduce("max", **reduce_kwargs)

def mean(self, *, skipna: bool = True, **kwargs):
    """
    Reduce the array with the "mean" operation.

    Parameters
    ----------
    skipna : bool, default True
        Forwarded unchanged to ``self._reduce``.
    **kwargs
        Extra keyword arguments forwarded unchanged to ``self._reduce``.

    Returns
    -------
    Whatever ``self._reduce("mean", ...)`` returns.
    """
    reduce_kwargs = {"skipna": skipna, **kwargs}
    return self._reduce("mean", **reduce_kwargs)

def sem(self, *, skipna: bool = True, **kwargs):
    """
    Reduce the array with the "sem" operation.

    Parameters
    ----------
    skipna : bool, default True
        Forwarded unchanged to ``self._reduce``.
    **kwargs
        Extra keyword arguments forwarded unchanged to ``self._reduce``.

    Returns
    -------
    Whatever ``self._reduce("sem", ...)`` returns.
    """
    reduce_kwargs = {"skipna": skipna, **kwargs}
    return self._reduce("sem", **reduce_kwargs)

def skew(self, *, skipna: bool = True, **kwargs):
    """
    Reduce the array with the "skew" operation.

    Parameters
    ----------
    skipna : bool, default True
        Forwarded unchanged to ``self._reduce``.
    **kwargs
        Extra keyword arguments forwarded unchanged to ``self._reduce``.

    Returns
    -------
    Whatever ``self._reduce("skew", ...)`` returns.
    """
    reduce_kwargs = {"skipna": skipna, **kwargs}
    return self._reduce("skew", **reduce_kwargs)

def median(self, *, skipna: bool = True, **kwargs):
    """
    Reduce the array with the "median" operation.

    Parameters
    ----------
    skipna : bool, default True
        Forwarded unchanged to ``self._reduce``.
    **kwargs
        Extra keyword arguments forwarded unchanged to ``self._reduce``.

    Returns
    -------
    Whatever ``self._reduce("median", ...)`` returns.
    """
    reduce_kwargs = {"skipna": skipna, **kwargs}
    return self._reduce("median", **reduce_kwargs)

def var(self, *, skipna: bool = True, **kwargs):
    """
    Reduce the array with the "var" operation.

    Parameters
    ----------
    skipna : bool, default True
        Forwarded unchanged to ``self._reduce``.
    **kwargs
        Extra keyword arguments forwarded unchanged to ``self._reduce``.

    Returns
    -------
    Whatever ``self._reduce("var", ...)`` returns.
    """
    reduce_kwargs = {"skipna": skipna, **kwargs}
    return self._reduce("var", **reduce_kwargs)

def std(self, *, skipna: bool = True, **kwargs):
    """
    Reduce the array with the "std" operation.

    Parameters
    ----------
    skipna : bool, default True
        Forwarded unchanged to ``self._reduce``.
    **kwargs
        Extra keyword arguments forwarded unchanged to ``self._reduce``.

    Returns
    -------
    Whatever ``self._reduce("std", ...)`` returns.
    """
    reduce_kwargs = {"skipna": skipna, **kwargs}
    return self._reduce("std", **reduce_kwargs)

def prod(self, *, skipna: bool = True, **kwargs):
    """
    Reduce the array with the "prod" operation.

    Parameters
    ----------
    skipna : bool, default True
        Forwarded unchanged to ``self._reduce``.
    **kwargs
        Extra keyword arguments forwarded unchanged to ``self._reduce``.

    Returns
    -------
    Whatever ``self._reduce("prod", ...)`` returns.
    """
    reduce_kwargs = {"skipna": skipna, **kwargs}
    return self._reduce("prod", **reduce_kwargs)

def _explode(self):
"""
See Series.explode.__doc__.
Expand Down
7 changes: 6 additions & 1 deletion pandas/core/arrays/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2301,7 +2301,9 @@ def _reduce(
f"'{type(self).__name__}' with dtype {self.dtype} "
f"does not support operation '{name}'"
)
result = meth(skipna=skipna, **kwargs)
if name != "count":
kwargs["skipna"] = skipna
result = meth(**kwargs)
if keepdims:
if name in ["min", "max"]:
result = self._from_sequence([result], dtype=self.dtype)
Expand All @@ -2310,6 +2312,9 @@ def _reduce(

return result

def count(self):
    """
    Return the number of non-missing elements in the array.

    Returns
    -------
    The sum of the inverted ``self.isna()`` mask, i.e. how many entries
    are *not* flagged as missing.
    """
    # ``isna()`` flags the missing entries, so summing it directly would give
    # the number of NA values. Invert the mask to count the valid ones.
    return (~self.isna()).sum()

# https://github.com/python/typeshed/issues/2148#issuecomment-520783318
# Incompatible types in assignment (expression has type "None", base class
# "object" defined the type as "Callable[[object], int]")
Expand Down
28 changes: 28 additions & 0 deletions pandas/core/arrays/masked.py
Original file line number Diff line number Diff line change
Expand Up @@ -1538,6 +1538,11 @@ def _reduce(
op = getattr(nanops, f"nan{name}")
axis = kwargs.pop("axis", None)
result = op(data, axis=axis, skipna=skipna, mask=mask, **kwargs)
if self.ndim > 1:
# Remainder of method assumes scalar result; other ops above
# do not respect `axis` argument and so always return a scalar.
# TODO: What happens when ndim > 1 on main?
return result

if keepdims:
if isna(result):
Expand Down Expand Up @@ -1644,6 +1649,29 @@ def mean(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs):
)
return self._wrap_reduction_result("mean", result, skipna=skipna, axis=axis)

def median(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs):
    """
    Compute the "median" reduction and wrap the result.

    The extra keyword arguments are first checked with
    ``nv.validate_median``; the reduction itself is delegated to
    ``self._reduce`` and its result is passed through
    ``self._wrap_reduction_result``.
    """
    nv.validate_median((), kwargs)
    common = {"skipna": skipna, "axis": axis}
    reduced = self._reduce("median", **common, **kwargs)
    return self._wrap_reduction_result("median", reduced, **common)

def kurt(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs):
    """
    Compute the "kurt" reduction and wrap the result.

    The reduction is delegated to ``self._reduce`` and its result is passed
    through ``self._wrap_reduction_result``.
    """
    # TODO: a matching ``nv.validate_kurt`` may not exist, so ``kwargs`` are
    # forwarded without validation.
    common = {"skipna": skipna, "axis": axis}
    reduced = self._reduce("kurt", **common, **kwargs)
    return self._wrap_reduction_result("kurt", reduced, **common)

def sem(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs):
    """
    Compute the "sem" reduction and wrap the result.

    The reduction is delegated to ``self._reduce`` and its result is passed
    through ``self._wrap_reduction_result``.
    """
    # TODO: a matching ``nv.validate_sem`` may not exist, so ``kwargs`` are
    # forwarded without validation.
    common = {"skipna": skipna, "axis": axis}
    reduced = self._reduce("sem", **common, **kwargs)
    return self._wrap_reduction_result("sem", reduced, **common)

def skew(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs):
    """
    Compute the "skew" reduction and wrap the result.

    The reduction is delegated to ``self._reduce`` and its result is passed
    through ``self._wrap_reduction_result``.
    """
    # TODO: a matching ``nv.validate_skew`` may not exist, so ``kwargs`` are
    # forwarded without validation.
    common = {"skipna": skipna, "axis": axis}
    reduced = self._reduce("skew", **common, **kwargs)
    return self._wrap_reduction_result("skew", reduced, **common)

def var(
self, *, skipna: bool = True, axis: AxisInt | None = 0, ddof: int = 1, **kwargs
):
Expand Down
3 changes: 3 additions & 0 deletions pandas/core/arrays/string_.py
Original file line number Diff line number Diff line change
Expand Up @@ -935,6 +935,9 @@ def _reduce(
else:
return nanops.nanall(self._ndarray, skipna=skipna)

if name == "count":
return super().count()

if name in ["min", "max", "argmin", "argmax", "sum"]:
result = getattr(self, name)(skipna=skipna, axis=axis, **kwargs)
if keepdims:
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/arrays/string_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -522,7 +522,7 @@ def _reduce(
return result.astype(np.bool_)
return result

if name in ("min", "max", "sum", "argmin", "argmax"):
if name in ("count", "min", "max", "sum", "argmin", "argmax"):
result = self._reduce_calc(name, skipna=skipna, keepdims=keepdims, **kwargs)
else:
raise TypeError(f"Cannot perform reduction '{name}' with string dtype")
Expand Down
8 changes: 2 additions & 6 deletions pandas/tests/extension/base/dim2.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,14 +274,10 @@ def get_reduction_result_dtype(dtype):
expected = expected.fillna(fill_value)

tm.assert_extension_array_equal(result, expected)
elif method == "median":
# std and var are not dtype-preserving
expected = data
tm.assert_extension_array_equal(result, expected)
elif method in ["mean", "std", "var"]:
elif method in ["mean", "median", "std", "var"]:
if is_integer_dtype(data) or is_bool_dtype(data):
data = data.astype("Float64")
if method == "mean":
if method in ["mean", "median"]:
tm.assert_extension_array_equal(result, data)
else:
tm.assert_extension_array_equal(result, data - data)
Expand Down
57 changes: 56 additions & 1 deletion pandas/tests/extension/base/reduce.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ class BaseReduceTests:

def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool:
# Specify if we expect this reduction to succeed.
return False
return op_name == "count"

def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool):
# We perform the same operation on the np.float64 data and check
Expand Down Expand Up @@ -126,3 +126,58 @@ def test_reduce_frame(self, data, all_numeric_reductions, skipna):
pytest.skip(f"Reduction {op_name} not supported for this dtype")

self.check_reduce_frame(ser, op_name, skipna)

def test_reduce_array(self, data, all_numeric_reductions, skipna: bool):
    """
    Compare calling a reduction method on the array with ``array._reduce``.

    When the reduction is not expected to be supported, calling the method
    must raise. Otherwise the direct method call and ``_reduce`` must agree.
    """
    op_name = all_numeric_reductions
    ser = pd.Series(data)
    arr = ser.array

    # TODO: Missing skipna argument
    sparse_mean = op_name == "mean" and isinstance(arr, pd.arrays.SparseArray)
    kwargs = {}
    if not sparse_mean and op_name != "count":
        kwargs["skipna"] = skipna

    if op_name != "count" and "DecimalArray" in str(type(arr)):
        # DecimalArray does not implement sum et al. directly.
        with pytest.raises(
            AttributeError, match=f"object has no attribute '{op_name}'"
        ):
            getattr(arr, op_name)(**kwargs)
        return

    if not self._supports_reduction(ser, op_name):
        # TODO: the message being checked here isn't actually checking anything
        patterns = [
            f"object has no attribute '{op_name}'",
            "does not support operation",
            f"{op_name} is not implemented for",
            f"Cannot perform reduction '{op_name}'",
            "[Cc]ould not convert",
            "Cannot convert",
            f"Categorical is not ordered for operation {op_name}",
            "setting an array element with a sequence",
            "can't multiply sequence by non-int of type",
            r"complex\(\) first argument must be a string or a number",
        ]
        expected_errors = (TypeError, NotImplementedError, AttributeError)
        with pytest.raises(expected_errors, match="|".join(patterns)):
            getattr(arr, op_name)(**kwargs)
        return

    skip_sparse = (
        not skipna
        and op_name in ["sum", "min", "max"]
        and isinstance(arr.dtype, pd.SparseDtype)
    )
    if skip_sparse:
        # TODO: Bug - ._reduce doesn't properly handle not skipna
        return

    # Look the method up before the ``try`` so a missing attribute still
    # surfaces as a failure instead of being swallowed below.
    res_op = getattr(arr, op_name)
    try:
        expected = arr._reduce(op_name, **kwargs)
    except (NotImplementedError, AttributeError):
        return
    tm.assert_almost_equal(res_op(**kwargs), expected)
2 changes: 1 addition & 1 deletion pandas/tests/extension/test_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ def _supports_accumulation(self, ser, op_name: str) -> bool:
return op_name in ["cummin", "cummax"]

def _supports_reduction(self, obj, op_name: str) -> bool:
return op_name in ["min", "max", "median", "mean", "std", "any", "all"]
return op_name in ["min", "max", "median", "mean", "std", "any", "all", "count"]

@pytest.mark.parametrize("skipna", [True, False])
def test_reduce_series_boolean(self, data, all_boolean_reductions, skipna):
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/extension/test_interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ class TestIntervalArray(base.ExtensionTests):
divmod_exc = TypeError

def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool:
return op_name in ["min", "max"]
return op_name in ["min", "max", "count"]

def test_fillna_limit_frame(self, data_missing):
# GH#58001
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/extension/test_numpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -308,7 +308,7 @@ def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request):

def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool:
if ser.dtype.kind == "O":
return op_name in ["sum", "min", "max", "any", "all"]
return op_name in ["sum", "min", "max", "any", "all", "count"]
return True

def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool):
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/extension/test_period.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def _supports_accumulation(self, ser, op_name: str) -> bool:
return op_name in ["cummin", "cummax"]

def _supports_reduction(self, obj, op_name: str) -> bool:
return op_name in ["min", "max", "median"]
return op_name in ["count", "min", "max", "median"]

def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool):
if op_name == "median":
Expand Down
17 changes: 17 additions & 0 deletions pandas/tests/extension/test_sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,23 @@ def test_reduce_frame(self, data, all_numeric_reductions, skipna, request):

super().test_reduce_frame(data, all_numeric_reductions, skipna)

def test_reduce_array(self, data, all_numeric_reductions, skipna, request):
    """
    Run the inherited array-reduction test, marking the reductions listed
    below as expected failures first.
    """
    not_implemented = ["prod", "median", "var", "std", "sem", "skew", "kurt"]
    if all_numeric_reductions in not_implemented:
        request.node.add_marker(
            pytest.mark.xfail(reason="This should be viable but is not implemented")
        )

    super().test_reduce_array(data, all_numeric_reductions, skipna)

def _check_unsupported(self, data):
if data.dtype == SparseDtype(int, 0):
pytest.skip("Can't store nan in int array.")
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/extension/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,7 @@ def _get_expected_exception(
return None

def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool:
return op_name in ["min", "max", "sum"] or (
return op_name in ["min", "max", "sum", "count"] or (
ser.dtype.na_value is np.nan # type: ignore[union-attr]
and op_name in ("any", "all")
)
Expand Down
Loading