diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index 07c8ab6d4f2cb..109f674fb9043 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -392,6 +392,73 @@ def _from_sequence_of_strings(
         )
         return cls._from_sequence(scalars, dtype=pa_type, copy=copy)
 
+    def _cast_pointwise_result(self, values) -> ArrayLike:
+        if len(values) == 0:
+            # Retain our dtype
+            return self[:0].copy()
+
+        try:
+            arr = pa.array(values, from_pandas=True)
+        except (ValueError, TypeError):
+            # e.g. test_by_column_values_with_same_starting_value with nested
+            #  values, one entry of which is an ArrowStringArray,
+            #  or test_agg_lambda_complex128_dtype_conversion for complex values
+            return super()._cast_pointwise_result(values)
+
+        if pa.types.is_duration(arr.type):
+            # workaround for https://github.com/apache/arrow/issues/40620
+            result = ArrowExtensionArray._from_sequence(values)
+            if pa.types.is_duration(self._pa_array.type):
+                result = result.astype(self.dtype)  # type: ignore[assignment]
+            elif pa.types.is_timestamp(self._pa_array.type):
+                # Try to retain original unit
+                new_dtype = ArrowDtype(pa.duration(self._pa_array.type.unit))
+                try:
+                    result = result.astype(new_dtype)  # type: ignore[assignment]
+                except ValueError:
+                    pass
+            elif pa.types.is_date64(self._pa_array.type):
+                # Try to match unit we get on non-pointwise op
+                dtype = ArrowDtype(pa.duration("ms"))
+                result = result.astype(dtype)  # type: ignore[assignment]
+            elif pa.types.is_date(self._pa_array.type):
+                # Try to match unit we get on non-pointwise op
+                dtype = ArrowDtype(pa.duration("s"))
+                result = result.astype(dtype)  # type: ignore[assignment]
+            return result
+
+        elif pa.types.is_date(arr.type) and pa.types.is_date(self._pa_array.type):
+            arr = arr.cast(self._pa_array.type)
+        elif pa.types.is_time(arr.type) and pa.types.is_time(self._pa_array.type):
+            arr = arr.cast(self._pa_array.type)
+        elif pa.types.is_decimal(arr.type) and pa.types.is_decimal(self._pa_array.type):
+            arr = arr.cast(self._pa_array.type)
+        elif pa.types.is_integer(arr.type) and pa.types.is_integer(self._pa_array.type):
+            try:
+                arr = arr.cast(self._pa_array.type)
+            except pa.lib.ArrowInvalid:
+                # e.g. test_combine_add if we can't cast
+                pass
+        elif pa.types.is_floating(arr.type) and pa.types.is_floating(
+            self._pa_array.type
+        ):
+            try:
+                arr = arr.cast(self._pa_array.type)
+            except pa.lib.ArrowInvalid:
+                # e.g. test_combine_add if we can't cast
+                pass
+
+        if isinstance(self.dtype, StringDtype):
+            if pa.types.is_string(arr.type) or pa.types.is_large_string(arr.type):
+                # ArrowStringArrayNumpySemantics
+                return type(self)(arr).astype(self.dtype)
+            if self.dtype.na_value is np.nan:
+                # ArrowEA has different semantics, so we return numpy-based
+                #  result instead
+                return super()._cast_pointwise_result(values)
+            return ArrowExtensionArray(arr)
+        return type(self)(arr)
+
     @classmethod
     def _box_pa(
         cls, value, pa_type: pa.DataType | None = None
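An end-to-end illustration of the casting above: a sketch, assuming this branch plus pyarrow installed; the dtype expectations mirror the test_agg_lambda_pyarrow_dtype_conversion change further down:

    import pandas as pd

    # The UDF returns Python scalars, which pyarrow would infer as int64;
    # _cast_pointwise_result casts back to the column's original uint64[pyarrow].
    df = pd.DataFrame(
        {
            "A": ["c1", "c2", "c1", "c2"],
            "B": pd.array([1, 2, 3, 4], dtype="uint64[pyarrow]"),
        }
    )
    result = df.groupby("A")["B"].agg(lambda x: x.min())
    print(result.dtype)  # expected on this branch: uint64[pyarrow]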
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
index 90fb8c175ebf6..1cd10a9eef9d1 100644
--- a/pandas/core/arrays/base.py
+++ b/pandas/core/arrays/base.py
@@ -19,7 +19,6 @@
     cast,
     overload,
 )
-import warnings
 
 import numpy as np
 
@@ -35,13 +34,11 @@
     Substitution,
     cache_readonly,
 )
-from pandas.util._exceptions import find_stack_level
 from pandas.util._validators import (
     validate_bool_kwarg,
     validate_insert_loc,
 )
 
-from pandas.core.dtypes.cast import maybe_cast_pointwise_result
 from pandas.core.dtypes.common import (
     is_list_like,
     is_scalar,
@@ -89,7 +86,6 @@
     AstypeArg,
     AxisInt,
     Dtype,
-    DtypeObj,
     FillnaOptions,
     InterpolateOptions,
     NumpySorter,
@@ -311,38 +307,6 @@ def _from_sequence(
         """
         raise AbstractMethodError(cls)
 
-    @classmethod
-    def _from_scalars(cls, scalars, *, dtype: DtypeObj) -> Self:
-        """
-        Strict analogue to _from_sequence, allowing only sequences of scalars
-        that should be specifically inferred to the given dtype.
-
-        Parameters
-        ----------
-        scalars : sequence
-        dtype : ExtensionDtype
-
-        Raises
-        ------
-        TypeError or ValueError
-
-        Notes
-        -----
-        This is called in a try/except block when casting the result of a
-        pointwise operation.
-        """
-        try:
-            return cls._from_sequence(scalars, dtype=dtype, copy=False)
-        except (ValueError, TypeError):
-            raise
-        except Exception:
-            warnings.warn(
-                "_from_scalars should only raise ValueError or TypeError. "
-                "Consider overriding _from_scalars where appropriate.",
-                stacklevel=find_stack_level(),
-            )
-            raise
-
     @classmethod
     def _from_sequence_of_strings(
         cls, strings, *, dtype: ExtensionDtype, copy: bool = False
@@ -371,9 +335,6 @@ def _from_sequence_of_strings(
             from a sequence of scalars.
         api.extensions.ExtensionArray._from_factorized : Reconstruct an
             ExtensionArray after factorization.
-        api.extensions.ExtensionArray._from_scalars : Strict analogue to _from_sequence,
-            allowing only sequences of scalars that should be specifically inferred to
-            the given dtype.
 
         Examples
         --------
@@ -416,6 +377,14 @@ def _from_factorized(cls, values, original):
         """
         raise AbstractMethodError(cls)
 
+    def _cast_pointwise_result(self, values) -> ArrayLike:
+        """
+        Cast the result of a pointwise operation (e.g. Series.map) to an
+        array, preserving the dtype backend if possible.
+        """
+        values = np.asarray(values, dtype=object)
+        return lib.maybe_convert_objects(values, convert_non_numeric=True)
+
     # ------------------------------------------------------------------------
     # Must be a Sequence
     # ------------------------------------------------------------------------
@@ -2842,7 +2811,7 @@ def _maybe_convert(arr):
             #  https://github.com/pandas-dev/pandas/issues/22850
             #  We catch all regular exceptions here, and fall back
             #  to an ndarray.
-            res = maybe_cast_pointwise_result(arr, self.dtype, same_dtype=False)
+            res = self._cast_pointwise_result(arr)
             if not isinstance(res, type(self)):
                 # exception raised in _from_sequence; ensure we have ndarray
                 res = np.asarray(arr)
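The default implementation above simply defers to maybe_convert_objects; a minimal sketch of that fallback, using the private pandas._libs.lib module for illustration:

    import numpy as np

    from pandas._libs import lib

    # Plain Python scalars are inferred to a concrete numpy dtype...
    vals = np.array([1, 2, 3], dtype=object)
    print(lib.maybe_convert_objects(vals, convert_non_numeric=True).dtype)  # int64

    # ...while unrecognized objects are left as object dtype.
    vals = np.array([object(), object()], dtype=object)
    print(lib.maybe_convert_objects(vals, convert_non_numeric=True).dtype)  # object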
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index 86e5f93609d1c..78928713166f4 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -103,7 +103,6 @@
     AstypeArg,
     AxisInt,
     Dtype,
-    DtypeObj,
     NpDtype,
     Ordered,
     Shape,
@@ -529,20 +528,12 @@ def _from_sequence(
     ) -> Self:
         return cls(scalars, dtype=dtype, copy=copy)
 
-    @classmethod
-    def _from_scalars(cls, scalars, *, dtype: DtypeObj) -> Self:
-        if dtype is None:
-            # The _from_scalars strictness doesn't make much sense in this case.
-            raise NotImplementedError
-
-        res = cls._from_sequence(scalars, dtype=dtype)
-
-        # if there are any non-category elements in scalars, these will be
-        #  converted to NAs in res.
-        mask = isna(scalars)
-        if not (mask == res.isna()).all():
-            # Some non-category element in scalars got converted to NA in res.
-            raise ValueError
+    def _cast_pointwise_result(self, values) -> ArrayLike:
+        res = super()._cast_pointwise_result(values)
+        cat = type(self)._from_sequence(res, dtype=self.dtype)
+        if (cat.isna() == isna(res)).all():
+            # i.e. the conversion was non-lossy
+            return cat
         return res
 
     @overload
diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py
index 4103b7c337efe..67ee16fd3a34e 100644
--- a/pandas/core/arrays/datetimes.py
+++ b/pandas/core/arrays/datetimes.py
@@ -83,7 +83,6 @@
     from pandas._typing import (
         ArrayLike,
         DateTimeErrorChoices,
-        DtypeObj,
         IntervalClosedType,
         TimeAmbiguous,
         TimeNonexistent,
@@ -293,14 +292,6 @@ def _scalar_type(self) -> type[Timestamp]:
     _dtype: np.dtype[np.datetime64] | DatetimeTZDtype
     _freq: BaseOffset | None = None
 
-    @classmethod
-    def _from_scalars(cls, scalars, *, dtype: DtypeObj) -> Self:
-        if lib.infer_dtype(scalars, skipna=True) not in ["datetime", "datetime64"]:
-            # TODO: require any NAs be valid-for-DTA
-            # TODO: if dtype is passed, check for tzawareness compat?
-            raise ValueError
-        return cls._from_sequence(scalars, dtype=dtype)
-
     @classmethod
     def _validate_dtype(cls, values, dtype):
         # used in TimeLikeOps.__init__
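A sketch of the Categorical round-trip check above (_cast_pointwise_result is private, so this is illustrative only): results that stay within the existing categories convert losslessly and keep the categorical dtype, anything else falls back to the inferred result:

    import pandas as pd

    cat = pd.Categorical(["a", "b", "a"])

    # Every value is an existing category, so no new NAs appear in the
    # round-trip and the categorical dtype is retained.
    res = cat._cast_pointwise_result(["b", "a", "b"])
    print(res.dtype)  # category

    # "z" is not a category; it would become NA in the round-trip, so the
    # conversion is lossy and the uncast result is returned instead.
    res = cat._cast_pointwise_result(["a", "z", "a"])
    print(res.dtype)  # not category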
diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
index 0f619df14ee0c..0402452e484ea 100644
--- a/pandas/core/arrays/masked.py
+++ b/pandas/core/arrays/masked.py
@@ -26,6 +26,7 @@
 from pandas.util._decorators import doc
 
 from pandas.core.dtypes.base import ExtensionDtype
+from pandas.core.dtypes.cast import maybe_downcast_to_dtype
 from pandas.core.dtypes.common import (
     is_bool,
     is_integer_dtype,
@@ -147,6 +148,19 @@ def _from_sequence(cls, scalars, *, dtype=None, copy: bool = False) -> Self:
         values, mask = cls._coerce_to_array(scalars, dtype=dtype, copy=copy)
         return cls(values, mask)
 
+    def _cast_pointwise_result(self, values) -> ArrayLike:
+        values = np.asarray(values, dtype=object)
+        result = lib.maybe_convert_objects(values, convert_to_nullable_dtype=True)
+        lkind = self.dtype.kind
+        rkind = result.dtype.kind
+        if (lkind in "iu" and rkind in "iu") or (lkind == rkind == "f"):
+            result = cast(BaseMaskedArray, result)
+            new_data = maybe_downcast_to_dtype(
+                result._data, dtype=self.dtype.numpy_dtype
+            )
+            result = type(result)(new_data, result._mask)
+        return result
+
     @classmethod
     @doc(ExtensionArray._empty)
     def _empty(cls, shape: Shape, dtype: ExtensionDtype) -> Self:
diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py
index 73e509474b6e4..cef20da195f43 100644
--- a/pandas/core/arrays/numpy_.py
+++ b/pandas/core/arrays/numpy_.py
@@ -14,7 +14,10 @@
 from pandas.compat.numpy import function as nv
 
 from pandas.core.dtypes.astype import astype_array
-from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
+from pandas.core.dtypes.cast import (
+    construct_1d_object_array_from_listlike,
+    maybe_downcast_to_dtype,
+)
 from pandas.core.dtypes.common import pandas_dtype
 from pandas.core.dtypes.dtypes import NumpyEADtype
 from pandas.core.dtypes.missing import isna
@@ -34,6 +37,7 @@
     from collections.abc import Callable
 
     from pandas._typing import (
+        ArrayLike,
         AxisInt,
         Dtype,
         FillnaOptions,
@@ -145,6 +149,24 @@ def _from_sequence(
             result = result.copy()
         return cls(result)
 
+    def _cast_pointwise_result(self, values) -> ArrayLike:
+        result = super()._cast_pointwise_result(values)
+        lkind = self.dtype.kind
+        rkind = result.dtype.kind
+        if (
+            (lkind in "iu" and rkind in "iu")
+            or (lkind == "f" and rkind == "f")
+            or (lkind == rkind == "c")
+        ):
+            result = maybe_downcast_to_dtype(result, self.dtype.numpy_dtype)
+        elif rkind == "M":
+            # Ensure potential subsequent .astype(object) doesn't incorrectly
+            #  convert Timestamps to ints
+            from pandas import array as pd_array
+
+            result = pd_array(result, copy=False)
+        return result
+
     # ------------------------------------------------------------------------
     # Data
diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py
index 384a264ca690e..ab5569537dc55 100644
--- a/pandas/core/arrays/sparse/array.py
+++ b/pandas/core/arrays/sparse/array.py
@@ -607,6 +607,23 @@ def _from_sequence(
     def _from_factorized(cls, values, original) -> Self:
         return cls(values, dtype=original.dtype)
 
+    def _cast_pointwise_result(self, values):
+        result = super()._cast_pointwise_result(values)
+        if result.dtype.kind == self.dtype.kind:
+            try:
+                # e.g. test_groupby_agg_extension
+                res = type(self)._from_sequence(result, dtype=self.dtype)
+                if ((res == result) | (isna(result) & res.isna())).all():
+                    # This does not hold for e.g.
+                    #  test_arith_frame_with_scalar[0-__truediv__]
+                    return res
+                return type(self)._from_sequence(result)
+            except (ValueError, TypeError):
+                return type(self)._from_sequence(result)
+        else:
+            # e.g. test_combine_le: avoid casting bools to Sparse[float64, nan]
+            return type(self)._from_sequence(result)
+
     # ------------------------------------------------------------------------
     # Data
     # ------------------------------------------------------------------------
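A sketch of the masked-array downcasting above, assuming this branch: UDF results that infer to a wider nullable dtype are downcast back to the original width when the values fit:

    import pandas as pd

    df = pd.DataFrame(
        {
            "key": ["a", "a", "b"],
            "val": pd.array([1, 2, 3], dtype="Int32"),
        }
    )
    # The lambda yields scalars that infer as Int64; _cast_pointwise_result
    # downcasts the data back to the original Int32.
    result = df.groupby("key")["val"].agg(lambda x: x.min())
    print(result.dtype)  # expected on this branch: Int32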
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 983e7b246032c..4d91f33a8df87 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -412,13 +412,6 @@ def tolist(self) -> list:
             return [x.tolist() for x in self]
         return list(self.to_numpy())
 
-    @classmethod
-    def _from_scalars(cls, scalars, dtype: DtypeObj) -> Self:
-        if lib.infer_dtype(scalars, skipna=True) not in ["string", "empty"]:
-            # TODO: require any NAs be valid-for-string
-            raise ValueError
-        return cls._from_sequence(scalars, dtype=dtype)
-
     def _formatter(self, boxed: bool = False):
         formatter = partial(
             printing.pprint_thing,
@@ -732,6 +725,13 @@ def _from_sequence_of_strings(
     ) -> Self:
         return cls._from_sequence(strings, dtype=dtype, copy=copy)
 
+    def _cast_pointwise_result(self, values) -> ArrayLike:
+        result = super()._cast_pointwise_result(values)
+        if isinstance(result.dtype, StringDtype):
+            # Ensure we retain our same na_value/storage
+            result = result.astype(self.dtype)  # type: ignore[call-overload]
+        return result
+
     @classmethod
     def _empty(cls, shape, dtype) -> StringArray:
         values = np.empty(shape, dtype=object)
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
index 20fe9b92b4677..afe359b3faede 100644
--- a/pandas/core/dtypes/cast.py
+++ b/pandas/core/dtypes/cast.py
@@ -437,80 +437,6 @@ def maybe_upcast_numeric_to_64bit(arr: NumpyIndexT) -> NumpyIndexT:
     return arr
 
 
-def maybe_cast_pointwise_result(
-    result: ArrayLike,
-    dtype: DtypeObj,
-    numeric_only: bool = False,
-    same_dtype: bool = True,
-) -> ArrayLike:
-    """
-    Try casting result of a pointwise operation back to the original dtype if
-    appropriate.
-
-    Parameters
-    ----------
-    result : array-like
-        Result to cast.
-    dtype : np.dtype or ExtensionDtype
-        Input Series from which result was calculated.
-    numeric_only : bool, default False
-        Whether to cast only numerics or datetimes as well.
-    same_dtype : bool, default True
-        Specify dtype when calling _from_sequence
-
-    Returns
-    -------
-    result : array-like
-        result maybe casted to the dtype.
-    """
-
-    if isinstance(dtype, ExtensionDtype):
-        cls = dtype.construct_array_type()
-        if same_dtype:
-            result = _maybe_cast_to_extension_array(cls, result, dtype=dtype)
-        else:
-            result = _maybe_cast_to_extension_array(cls, result)
-
-    elif (numeric_only and dtype.kind in "iufcb") or not numeric_only:
-        result = maybe_downcast_to_dtype(result, dtype)
-
-    return result
-
-
-def _maybe_cast_to_extension_array(
-    cls: type[ExtensionArray], obj: ArrayLike, dtype: ExtensionDtype | None = None
-) -> ArrayLike:
-    """
-    Call to `_from_sequence` that returns the object unchanged on Exception.
-
-    Parameters
-    ----------
-    cls : class, subclass of ExtensionArray
-    obj : arraylike
-        Values to pass to cls._from_sequence
-    dtype : ExtensionDtype, optional
-
-    Returns
-    -------
-    ExtensionArray or obj
-    """
-    result: ArrayLike
-
-    if dtype is not None:
-        try:
-            result = cls._from_scalars(obj, dtype=dtype)
-        except (TypeError, ValueError):
-            return obj
-        return result
-
-    try:
-        result = cls._from_sequence(obj, dtype=dtype)
-    except Exception:
-        # We can't predict what downstream EA constructors may raise
-        result = obj
-    return result
-
-
 @overload
 def ensure_dtype_can_hold_na(dtype: np.dtype) -> np.dtype: ...
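A sketch of the StringDtype retention above: when the pointwise results are still strings, the result is cast back so the calling array's storage and na_value are kept (the exact intermediate inference depends on the string-dtype defaults):

    import pandas as pd

    idx = pd.Index(["a", "b"], dtype=pd.StringDtype("python", na_value=pd.NA))
    res = idx.map(lambda x: x.upper())
    # The mapped values may be inferred to a different storage/na_value;
    # _cast_pointwise_result casts back to the original StringDtype.
    print(res.dtype)  # expected on this branch: string (python storage, pd.NA)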
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 594358c0b4080..eab221e4df2a9 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -35,7 +35,6 @@
 from pandas.util._decorators import cache_readonly
 
 from pandas.core.dtypes.cast import (
-    maybe_cast_pointwise_result,
     maybe_downcast_to_dtype,
 )
 from pandas.core.dtypes.common import (
@@ -44,7 +43,6 @@
     ensure_platform_int,
     ensure_uint64,
     is_1d_only_ea_dtype,
-    is_string_dtype,
 )
 from pandas.core.dtypes.missing import (
     isna,
@@ -52,7 +50,6 @@
 )
 
 from pandas.core.arrays import Categorical
-from pandas.core.arrays.arrow.array import ArrowExtensionArray
 from pandas.core.frame import DataFrame
 from pandas.core.groupby import grouper
 from pandas.core.indexes.api import (
@@ -966,29 +963,7 @@ def agg_series(
         np.ndarray or ExtensionArray
         """
         result = self._aggregate_series_pure_python(obj, func)
-        npvalues = lib.maybe_convert_objects(result, try_float=False)
-
-        if isinstance(obj._values, ArrowExtensionArray):
-            # When obj.dtype is a string, any object can be cast. Only do so if the
-            #  UDF returned strings or NA values.
-            if not is_string_dtype(obj.dtype) or lib.is_string_array(
-                npvalues, skipna=True
-            ):
-                out = maybe_cast_pointwise_result(
-                    npvalues, obj.dtype, numeric_only=True, same_dtype=preserve_dtype
-                )
-            else:
-                out = npvalues
-
-        elif not isinstance(obj._values, np.ndarray):
-            # we can preserve a little bit more aggressively with EA dtype
-            #  because maybe_cast_pointwise_result will do a try/except
-            #  with _from_sequence.  NB we are assuming here that _from_sequence
-            #  is sufficiently strict that it casts appropriately.
-            out = maybe_cast_pointwise_result(npvalues, obj.dtype, numeric_only=True)
-        else:
-            out = npvalues
-        return out
+        return obj.array._cast_pointwise_result(result)
 
     @final
     def _aggregate_series_pure_python(
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index df4df2a4913c2..59ac122e4f9ea 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -90,7 +90,6 @@
     common_dtype_categorical_compat,
     find_result_type,
     infer_dtype_from,
-    maybe_cast_pointwise_result,
     np_can_hold_element,
 )
 from pandas.core.dtypes.common import (
@@ -6398,17 +6397,20 @@ def map(self, mapper, na_action: Literal["ignore"] | None = None):
         if not new_values.size:
             # empty
             dtype = self.dtype
-
-        # e.g. if we are floating and new_values is all ints, then we
-        #  don't want to cast back to floating.  But if we are UInt64
-        #  and new_values is all ints, we want to try.
-        same_dtype = lib.infer_dtype(new_values, skipna=False) == self.inferred_type
-        if same_dtype:
-            new_values = maybe_cast_pointwise_result(
-                new_values, self.dtype, same_dtype=same_dtype
-            )
-
-        return Index._with_infer(new_values, dtype=dtype, copy=False, name=self.name)
+        elif isinstance(new_values, Categorical):
+            # _cast_pointwise_result is unnecessary
+            dtype = new_values.dtype
+        else:
+            if isinstance(self, MultiIndex):
+                arr = self[:0].to_flat_index().array
+            else:
+                arr = self[:0].array
+            # e.g. if we are floating and new_values is all ints, then we
+            #  don't want to cast back to floating.  But if we are UInt64
+            #  and new_values is all ints, we want to try.
+            new_values = arr._cast_pointwise_result(new_values)
+            dtype = new_values.dtype
+        return Index(new_values, dtype=dtype, copy=False, name=self.name)
 
     # TODO: De-duplicate with map, xref GH#32349
     @final
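A sketch of the comment above ("if we are UInt64 and new_values is all ints, we want to try"), assuming this branch:

    import numpy as np
    import pandas as pd

    idx = pd.Index([1, 2, 3], dtype=np.uint64)
    # The mapped values infer as int64; since they round-trip losslessly,
    # the array-level _cast_pointwise_result downcasts back to uint64.
    res = idx.map(lambda x: x + 1)
    print(res.dtype)  # expected on this branch: uint64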
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 00cff09801f1a..6055e65c2786b 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -70,7 +70,6 @@
     find_common_type,
     infer_dtype_from,
     maybe_box_native,
-    maybe_cast_pointwise_result,
 )
 from pandas.core.dtypes.common import (
     is_dict_like,
@@ -84,7 +83,6 @@
     validate_all_hashable,
 )
 from pandas.core.dtypes.dtypes import (
-    CategoricalDtype,
     ExtensionDtype,
     SparseDtype,
 )
@@ -117,7 +115,6 @@
 )
 from pandas.core.arrays.categorical import CategoricalAccessor
 from pandas.core.arrays.sparse import SparseAccessor
-from pandas.core.arrays.string_ import StringDtype
 from pandas.core.construction import (
     array as pd_array,
     extract_array,
@@ -3185,15 +3182,14 @@ def combine(
             new_values[:] = [func(lv, other) for lv in self._values]
             new_name = self.name
 
-        # try_float=False is to match agg_series
-        npvalues = lib.maybe_convert_objects(new_values, try_float=False)
-        # same_dtype here is a kludge to avoid casting e.g. [True, False] to
-        #  ["True", "False"]
-        same_dtype = isinstance(self.dtype, (StringDtype, CategoricalDtype))
-        res_values = maybe_cast_pointwise_result(
-            npvalues, self.dtype, same_dtype=same_dtype
-        )
-        return self._constructor(res_values, index=new_index, name=new_name, copy=False)
+        res_values = self.array._cast_pointwise_result(new_values)
+        return self._constructor(
+            res_values,
+            dtype=res_values.dtype,
+            index=new_index,
+            name=new_name,
+            copy=False,
+        )
 
     def combine_first(self, other) -> Series:
         """
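With the kludge removed, the input's own array decides the cast; a sketch of the boolean case the old same_dtype flag was guarding against:

    import pandas as pd

    s1 = pd.Series([True, False, True])
    s2 = pd.Series([False, False, True])
    res = s1.combine(s2, lambda a, b: a and b)
    print(res.dtype)  # bool, not "True"/"False" strings or object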
diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py
index fd9fec0cb490c..90ec84a30a129 100644
--- a/pandas/tests/extension/base/methods.py
+++ b/pandas/tests/extension/base/methods.py
@@ -367,6 +367,18 @@ def test_combine_le(self, data_repeated):
         )
         tm.assert_series_equal(result, expected)
 
+    def _construct_for_combine_add(self, left, right):
+        if isinstance(right, type(left)):
+            return left._from_sequence(
+                [a + b for (a, b) in zip(list(left), list(right))],
+                dtype=left.dtype,
+            )
+        else:
+            return left._from_sequence(
+                [a + right for a in list(left)],
+                dtype=left.dtype,
+            )
+
     def test_combine_add(self, data_repeated):
         # GH 20825
         orig_data1, orig_data2 = data_repeated(2)
@@ -377,26 +389,22 @@ def test_combine_add(self, data_repeated):
         #  we will expect Series.combine to raise as well.
         try:
             with np.errstate(over="ignore"):
-                expected = pd.Series(
-                    orig_data1._from_sequence(
-                        [a + b for (a, b) in zip(list(orig_data1), list(orig_data2))]
-                    )
-                )
+                arr = self._construct_for_combine_add(orig_data1, orig_data2)
         except TypeError:
             # If the operation is not supported pointwise for our scalars,
             #  then Series.combine should also raise
             with pytest.raises(TypeError):
                 s1.combine(s2, lambda x1, x2: x1 + x2)
             return
+        expected = pd.Series(arr)
 
         result = s1.combine(s2, lambda x1, x2: x1 + x2)
         tm.assert_series_equal(result, expected)
 
         val = s1.iloc[0]
         result = s1.combine(val, lambda x1, x2: x1 + x2)
-        expected = pd.Series(
-            orig_data1._from_sequence([a + val for a in list(orig_data1)])
-        )
+        arr = self._construct_for_combine_add(orig_data1, val)
+        expected = pd.Series(arr)
         tm.assert_series_equal(result, expected)
 
     def test_combine_first(self, data):
diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py
index 45991e32726c6..65fb6f33b0ea3 100644
--- a/pandas/tests/extension/decimal/array.py
+++ b/pandas/tests/extension/decimal/array.py
@@ -109,6 +109,16 @@ def _from_sequence_of_strings(cls, strings, *, dtype: ExtensionDtype, copy=False
     def _from_factorized(cls, values, original):
         return cls(values)
 
+    def _cast_pointwise_result(self, values):
+        result = super()._cast_pointwise_result(values)
+        try:
+            # If this were ever made a non-test EA, special-casing could
+            #  be avoided by handling Decimal in maybe_convert_objects
+            res = type(self)._from_sequence(result, dtype=self.dtype)
+        except (ValueError, TypeError):
+            return result
+        return res
+
     _HANDLED_TYPES = (decimal.Decimal, numbers.Number, np.ndarray)
 
     def to_numpy(
diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py
index 5221cd402f53d..5247dfcbb275b 100644
--- a/pandas/tests/extension/decimal/test_decimal.py
+++ b/pandas/tests/extension/decimal/test_decimal.py
@@ -1,7 +1,6 @@
 from __future__ import annotations
 
 import decimal
-import operator
 
 import numpy as np
 import pytest
@@ -282,33 +281,10 @@ def _create_arithmetic_method(cls, op):
 DecimalArrayWithoutCoercion._add_arithmetic_ops()
 
 
-def test_combine_from_sequence_raises(monkeypatch):
-    # https://github.com/pandas-dev/pandas/issues/22850
-    cls = DecimalArrayWithoutFromSequence
-
-    def construct_array_type(self):
-        return DecimalArrayWithoutFromSequence
-
-    monkeypatch.setattr(DecimalDtype, "construct_array_type", construct_array_type)
-
-    arr = cls([decimal.Decimal("1.0"), decimal.Decimal("2.0")])
-    ser = pd.Series(arr)
-    result = ser.combine(ser, operator.add)
-
-    # note: object dtype
-    expected = pd.Series(
-        [decimal.Decimal("2.0"), decimal.Decimal("4.0")], dtype="object"
-    )
-    tm.assert_series_equal(result, expected)
-
-
-@pytest.mark.parametrize(
-    "class_", [DecimalArrayWithoutFromSequence, DecimalArrayWithoutCoercion]
-)
-def test_scalar_ops_from_sequence_raises(class_):
+def test_scalar_ops_from_sequence_raises():
     # op(EA, EA) should return an EA, or an ndarray if it's not possible
     #  to return an EA with the return values.
-    arr = class_([decimal.Decimal("1.0"), decimal.Decimal("2.0")])
+    arr = DecimalArrayWithoutCoercion([decimal.Decimal("1.0"), decimal.Decimal("2.0")])
     result = arr + arr
     expected = np.array(
         [decimal.Decimal("2.0"), decimal.Decimal("4.0")], dtype="object"
diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py
index a1799d0c113d1..bc30ba4ef7769 100644
--- a/pandas/tests/extension/json/array.py
+++ b/pandas/tests/extension/json/array.py
@@ -90,6 +90,13 @@ def _from_sequence(cls, scalars, *, dtype=None, copy=False):
     def _from_factorized(cls, values, original):
         return cls([UserDict(x) for x in values if x != ()])
 
+    def _cast_pointwise_result(self, values):
+        result = super()._cast_pointwise_result(values)
+        try:
+            return type(self)._from_sequence(result, dtype=self.dtype)
+        except (ValueError, TypeError):
+            return result
+
     def __getitem__(self, item):
         if isinstance(item, tuple):
             item = unpack_tuple_and_ellipses(item)
diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
index 857fdc7468cec..4c0ced8b56288 100644
--- a/pandas/tests/extension/test_arrow.py
+++ b/pandas/tests/extension/test_arrow.py
@@ -46,6 +46,7 @@
 )
 from pandas.errors import Pandas4Warning
 
+from pandas.core.dtypes.common import pandas_dtype
 from pandas.core.dtypes.dtypes import (
     ArrowDtype,
     CategoricalDtypeType,
@@ -271,6 +272,26 @@ def data_for_twos(data):
 
 
 class TestArrowArray(base.ExtensionTests):
+    def _construct_for_combine_add(self, left, right):
+        dtype = left.dtype
+
+        # in a couple of cases, addition is not dtype-preserving
+        if dtype == "bool[pyarrow]":
+            dtype = pandas_dtype("int64[pyarrow]")
+        elif dtype == "int8[pyarrow]" and isinstance(right, type(left)):
+            dtype = pandas_dtype("int64[pyarrow]")
+
+        if isinstance(right, type(left)):
+            return left._from_sequence(
+                [a + b for (a, b) in zip(list(left), list(right))],
+                dtype=dtype,
+            )
+        else:
+            return left._from_sequence(
+                [a + right for a in list(left)],
+                dtype=dtype,
+            )
+
     def test_compare_scalar(self, data, comparison_op):
         ser = pd.Series(data)
         self._compare_other(ser, data, comparison_op, data[0])
@@ -786,6 +807,8 @@ def rtruediv(x, y):
 
         return tm.get_op_from_name(op_name)
 
+    # TODO: use EA._cast_pointwise_result, same with other test files that
+    #  override this
     def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result):
         # BaseOpsUtil._combine can upcast expected dtype
         # (because it generates expected on python scalars)
@@ -795,16 +818,28 @@ def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result):
         if op_name in ["eq", "ne", "lt", "le", "gt", "ge"]:
             return pointwise_result.astype("boolean[pyarrow]")
 
+        original_dtype = tm.get_dtype(expected)
+
         was_frame = False
         if isinstance(expected, pd.DataFrame):
             was_frame = True
             expected_data = expected.iloc[:, 0]
-            original_dtype = obj.iloc[:, 0].dtype
         else:
             expected_data = expected
-            original_dtype = obj.dtype
 
-        orig_pa_type = original_dtype.pyarrow_dtype
+        # the pointwise method will have retained our original dtype, while
+        #  the op(ser, other) version will have cast to 64bit
+        if type(other) is int and op_name not in ["__floordiv__"]:
+            if original_dtype.kind == "f":
+                return expected.astype("float64[pyarrow]")
+            else:
+                return expected.astype("int64[pyarrow]")
+        elif type(other) is float:
+            return expected.astype("float64[pyarrow]")
+
+        # error: Item "ExtensionDtype" of "dtype[Any] | ExtensionDtype" has
+        #  no attribute "pyarrow_dtype"
+        orig_pa_type = original_dtype.pyarrow_dtype  # type: ignore[union-attr]
         if not was_frame and isinstance(other, pd.Series):
             # i.e. test_arith_series_with_array
             if not (
@@ -834,29 +869,7 @@ def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result):
 
         pa_expected = pa.array(expected_data._values)
 
-        if pa.types.is_duration(pa_expected.type):
-            if pa.types.is_date(orig_pa_type):
-                if pa.types.is_date64(orig_pa_type):
-                    # TODO: why is this different vs date32?
-                    unit = "ms"
-                else:
-                    unit = "s"
-            else:
-                # pyarrow sees sequence of datetime/timedelta objects and defaults
-                #  to "us" but the non-pointwise op retains unit
-                # timestamp or duration
-                unit = orig_pa_type.unit
-            if type(other) in [datetime, timedelta] and unit in ["s", "ms"]:
-                # pydatetime/pytimedelta objects have microsecond reso, so we
-                #  take the higher reso of the original and microsecond. Note
-                #  this matches what we would do with DatetimeArray/TimedeltaArray
-                unit = "us"
-
-            pa_expected = pa_expected.cast(f"duration[{unit}]")
-
-        elif pa.types.is_decimal(pa_expected.type) and pa.types.is_decimal(
-            orig_pa_type
-        ):
+        if pa.types.is_decimal(pa_expected.type) and pa.types.is_decimal(orig_pa_type):
             # decimal precision can resize in the result type depending on data
             #  just compare the float values
             alt = getattr(obj, op_name)(other)
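The helper above encodes an asymmetry worth spelling out: a sketch, assuming this branch and pyarrow:

    import pandas as pd

    ser = pd.Series([1, 2], dtype="int8[pyarrow]")

    # Arrow compute kernels promote when combined with a Python int scalar...
    print((ser + 1).dtype)  # int64[pyarrow]

    # ...while the pointwise path casts back to the original width.
    res = ser.combine(1, lambda a, b: a + b)
    print(res.dtype)  # expected on this branch: int8[pyarrow]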
diff --git a/pandas/tests/extension/test_masked.py b/pandas/tests/extension/test_masked.py
index 0e9ffce07bf98..034ddb351a7ab 100644
--- a/pandas/tests/extension/test_masked.py
+++ b/pandas/tests/extension/test_masked.py
@@ -168,6 +168,8 @@ def data_for_grouping(dtype):
 
 
 class TestMaskedArrays(base.ExtensionTests):
+    _combine_le_expected_dtype = "boolean"
+
     @pytest.fixture(autouse=True)
     def skip_if_doesnt_support_2d(self, dtype, request):
         # Override the fixture so that we run these tests.
@@ -215,42 +217,14 @@ def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result):
         sdtype = tm.get_dtype(obj)
         expected = pointwise_result
 
-        if op_name in ("eq", "ne", "le", "ge", "lt", "gt"):
-            return expected.astype("boolean")
-
-        if sdtype.kind in "iu":
-            if op_name in ("__rtruediv__", "__truediv__", "__div__"):
-                filled = expected.fillna(np.nan)
-                expected = filled.astype("Float64")
-            else:
-                # combine method result in 'biggest' (int64) dtype
-                expected = expected.astype(sdtype)
-        elif sdtype.kind == "b":
+        if sdtype.kind == "b":
             if op_name in (
-                "__floordiv__",
-                "__rfloordiv__",
-                "__pow__",
-                "__rpow__",
                 "__mod__",
                 "__rmod__",
             ):
                 # combine keeps boolean type
                 expected = expected.astype("Int8")
-            elif op_name in ("__truediv__", "__rtruediv__"):
-                # combine with bools does not generate the correct result
-                #  (numpy behaviour for div is to regard the bools as numeric)
-                op = self.get_op_from_name(op_name)
-                expected = self._combine(obj.astype(float), other, op)
-                expected = expected.astype("Float64")
-
-            if op_name == "__rpow__":
-                # for rpow, combine does not propagate NaN
-                result = getattr(obj, op_name)(other)
-                expected[result.isna()] = np.nan
-        else:
-            # combine method result in 'biggest' (float64) dtype
-            expected = expected.astype(sdtype)
         return expected
 
     def test_divmod_series_array(self, data, data_for_twos, request):
@@ -263,16 +237,6 @@ def test_divmod_series_array(self, data, data_for_twos, request):
             request.applymarker(mark)
         super().test_divmod_series_array(data, data_for_twos)
 
-    def test_combine_le(self, data_repeated):
-        # TODO: patching self is a bad pattern here
-        orig_data1, orig_data2 = data_repeated(2)
-        if orig_data1.dtype.kind == "b":
-            self._combine_le_expected_dtype = "boolean"
-        else:
-            # TODO: can we make this boolean?
-            self._combine_le_expected_dtype = object
-        super().test_combine_le(data_repeated)
-
     def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool:
         if op_name in ["any", "all"] and ser.dtype.kind != "b":
             pytest.skip(reason="Tested in tests/reductions/test_reductions.py")
diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
index 96c014f549056..d8203c2e2e350 100644
--- a/pandas/tests/extension/test_string.py
+++ b/pandas/tests/extension/test_string.py
@@ -101,6 +101,14 @@ def data_for_grouping(dtype, chunked):
 
 
 class TestStringArray(base.ExtensionTests):
+    def test_combine_le(self, data_repeated):
+        dtype = next(iter(data_repeated(2))).dtype
+        if dtype.storage == "pyarrow" and dtype.na_value is pd.NA:
+            self._combine_le_expected_dtype = "bool[pyarrow]"
+        else:
+            self._combine_le_expected_dtype = "bool"
+        return super().test_combine_le(data_repeated)
+
     def test_eq_with_str(self, dtype):
         super().test_eq_with_str(dtype)
 
@@ -223,9 +231,7 @@ def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op):
 
     def test_combine_add(self, data_repeated, using_infer_string, request):
         dtype = next(data_repeated(1)).dtype
-        if using_infer_string and (
-            (dtype.na_value is pd.NA) and dtype.storage == "python"
-        ):
+        if not using_infer_string and dtype.storage == "python":
             mark = pytest.mark.xfail(
                 reason="The pointwise operation result will be inferred to "
                 "string[nan, pyarrow], which does not match the input dtype"
- ("float[pyarrow]", "double[pyarrow]"), - ("int64[pyarrow]", "int64[pyarrow]"), - ("uint64[pyarrow]", "int64[pyarrow]"), - ("bool[pyarrow]", "bool[pyarrow]"), - ], + "dtype", + ["float[pyarrow]", "int64[pyarrow]", "uint64[pyarrow]", "bool[pyarrow]"], ) -def test_agg_lambda_pyarrow_dtype_conversion(input_dtype, output_dtype): +def test_agg_lambda_pyarrow_dtype_conversion(dtype): # GH#59601 # Test PyArrow dtype conversion back to PyArrow dtype df = DataFrame( { "A": ["c1", "c2", "c3", "c1", "c2", "c3"], - "B": pd.array([100, 200, 255, 0, 199, 40392], dtype=input_dtype), + "B": pd.array([100, 200, 255, 0, 199, 40392], dtype=dtype), } ) gb = df.groupby("A") result = gb.agg(lambda x: x.min()) expected = DataFrame( - {"B": pd.array([0, 199, 255], dtype=output_dtype)}, + {"B": pd.array([0, 199, 255], dtype=dtype)}, index=Index(["c1", "c2", "c3"], name="A"), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexes/test_any_index.py b/pandas/tests/indexes/test_any_index.py index a4c18732ef258..be15bce8bb82f 100644 --- a/pandas/tests/indexes/test_any_index.py +++ b/pandas/tests/indexes/test_any_index.py @@ -9,6 +9,7 @@ from pandas.errors import InvalidIndexError +from pandas import StringDtype import pandas._testing as tm @@ -36,8 +37,15 @@ def test_mutability(index): @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") -def test_map_identity_mapping(index, request): +def test_map_identity_mapping(index, request, using_infer_string): # GH#12766 + if ( + not using_infer_string + and isinstance(index.dtype, StringDtype) + and index.dtype.storage == "python" + ): + mark = pytest.mark.xfail(reason="Does not preserve dtype") + request.applymarker(mark) result = index.map(lambda x: x) if index.dtype == object and (result.dtype == bool or result.dtype == "string"): diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 985ed880e7998..26eb33195ccbc 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -581,12 +581,19 @@ def test_map_dictlike_simple(self, mapper): ], ) @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") - def test_map_dictlike(self, index, mapper, request): + def test_map_dictlike(self, index, mapper, request, using_infer_string): # GH 12756 if isinstance(index, CategoricalIndex): pytest.skip("Tested in test_categorical") elif not index.is_unique: pytest.skip("Cannot map duplicated index") + if ( + not using_infer_string + and isinstance(index.dtype, pd.StringDtype) + and index.dtype.storage == "python" + ): + mark = pytest.mark.xfail(reason="map does not retain dtype") + request.applymarker(mark) rng = np.arange(len(index), 0, -1, dtype=np.int64) diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 042c8ae186804..262f032f20187 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -1246,12 +1246,7 @@ def test_resample_not_monotonic(unit): "int64", "int32", "float64", - pytest.param( - "float32", - marks=pytest.mark.xfail( - reason="Empty groups cause x.mean() to return float64" - ), - ), + "float32", ], ) def test_resample_median_bug_1688(dtype, unit): diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index 286625b8ce470..f3c52a674cf66 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -264,8 +264,6 @@ 
diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py
index 286625b8ce470..f3c52a674cf66 100644
--- a/pandas/tests/resample/test_resampler_grouper.py
+++ b/pandas/tests/resample/test_resampler_grouper.py
@@ -264,8 +264,6 @@ def f_1(x):
         return x.resample("2s").apply(lambda y: y.sum())
 
     result = g.apply(f_1)
-    # y.sum() results in int64 instead of int32 on 32-bit architectures
-    expected = expected.astype("int64")
     tm.assert_frame_equal(result, expected)
 
 
@@ -295,7 +293,9 @@ def test_apply_columns_multilevel():
     # GH 16231
     cols = pd.MultiIndex.from_tuples([("A", "a", "", "one"), ("B", "b", "i", "two")])
     ind = date_range(start="2017-01-01", freq="15Min", periods=8)
-    df = DataFrame(np.array([0] * 16).reshape(8, 2), index=ind, columns=cols)
+    df = DataFrame(
+        np.array([0] * 16, dtype=np.int64).reshape(8, 2), index=ind, columns=cols
+    )
     agg_dict = {col: (np.sum if col[3] == "one" else np.mean) for col in df.columns}
     result = df.resample("h").apply(lambda x: agg_dict[x.name](x))
     expected = DataFrame(