diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index e74bd2f745b94..2a002b87ccf4c 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -647,6 +647,7 @@ Numeric
 Conversion
 ^^^^^^^^^^
+- Bug in :meth:`BaseMaskedArray.map` was casting ``pd.NA`` to ``np.nan`` (:issue:`57390`)
 - Bug in :meth:`DataFrame.astype` not casting ``values`` for Arrow-based dictionary dtype correctly (:issue:`58479`)
 - Bug in :meth:`DataFrame.update` bool dtype being converted to object (:issue:`55509`)
 - Bug in :meth:`Series.astype` might modify read-only array inplace when casting to a string dtype (:issue:`57212`)
diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi
index daaaacee3487d..54dd539eff881 100644
--- a/pandas/_libs/lib.pyi
+++ b/pandas/_libs/lib.pyi
@@ -74,6 +74,10 @@ def map_infer(
     *,
     convert: Literal[False],
     ignore_na: bool = ...,
+    mask: npt.NDArray[np.bool_] | None = ...,
+    na_value: Any = ...,
+    convert_to_nullable_dtype: bool = ...,
+    storage: str | None = ...,
 ) -> np.ndarray: ...
 @overload
 def map_infer(
@@ -82,6 +86,10 @@
     *,
     convert: bool = ...,
     ignore_na: bool = ...,
+    mask: npt.NDArray[np.bool_] | None = ...,
+    na_value: Any = ...,
+    convert_to_nullable_dtype: bool = ...,
+    storage: str | None = ...,
 ) -> ArrayLike: ...
 @overload
 def maybe_convert_objects(
@@ -93,6 +101,8 @@
     convert_non_numeric: Literal[False] = ...,
     convert_to_nullable_dtype: Literal[False] = ...,
     dtype_if_all_nat: DtypeObj | None = ...,
+    storage: str | None = ...,
+    na_value: Any = ...,
 ) -> npt.NDArray[np.object_ | np.number]: ...
 @overload
 def maybe_convert_objects(
@@ -104,6 +114,8 @@
     convert_non_numeric: bool = ...,
     convert_to_nullable_dtype: Literal[True] = ...,
     dtype_if_all_nat: DtypeObj | None = ...,
+    storage: str | None = ...,
+    na_value: Any = ...,
 ) -> ArrayLike: ...
 @overload
 def maybe_convert_objects(
@@ -115,6 +127,8 @@
     convert_non_numeric: bool = ...,
     convert_to_nullable_dtype: bool = ...,
     dtype_if_all_nat: DtypeObj | None = ...,
+    storage: str | None = ...,
+    na_value: Any = ...,
 ) -> ArrayLike: ...
 @overload
 def maybe_convert_numeric(
@@ -178,6 +192,9 @@ def map_infer_mask(
     convert: Literal[False],
     na_value: Any = ...,
     dtype: np.dtype = ...,
+    convert_to_nullable_dtype: bool = ...,
+    convert_non_numeric: bool = ...,
+    storage: str | None = ...,
 ) -> np.ndarray: ...
 @overload
 def map_infer_mask(
@@ -188,6 +205,9 @@
     convert: bool = ...,
     na_value: Any = ...,
     dtype: np.dtype = ...,
+    convert_to_nullable_dtype: bool = ...,
+    convert_non_numeric: bool = ...,
+    storage: str | None = ...,
 ) -> ArrayLike: ...
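For context, a minimal sketch of the user-facing behavior the whatsnew entry describes, assuming a build with this patch applied:

    import pandas as pd

    ser = pd.Series([1, 2, None], dtype="Int64")

    # Before this change, map() round-tripped through a float64 ndarray,
    # so pd.NA came back as np.nan and the nullable dtype was lost.
    # Afterwards, the masked dtype and pd.NA survive the round trip.
    result = ser.map(lambda x: x)
    print(result.dtype)  # Int64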
 def indices_fast(
     index: npt.NDArray[np.intp],
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index de603beff7836..7deea573f6a1c 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -2525,6 +2525,82 @@ def maybe_convert_numeric(
     return (ints, None)
 
 
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def _convert_to_pyarrow(
+    ndarray[object] objects,
+    ndarray mask,
+    object na_value=None) -> "ArrayLike":
+    import pyarrow as pa
+
+    from pandas.core.dtypes.dtypes import ArrowDtype
+
+    from pandas.core.arrays.string_ import StringDtype
+
+    # pa.array does not accept pd.NA, so replace masked entries with
+    # None on a copy (the caller's array must not be mutated)
+    objects = objects.copy()
+    objects[mask.view(np.bool_)] = None
+    pa_array = pa.array(objects)
+
+    # PyArrow large strings map to StringDtype (not ArrowDtype)
+    if pa.types.is_large_string(pa_array.type):
+        dtype = StringDtype(storage="pyarrow", na_value=na_value)
+    else:
+        dtype = ArrowDtype(pa_array.type)
+    return dtype.construct_array_type()._from_sequence(pa_array, dtype=dtype)
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def _convert_to_based_masked(
+    ndarray[object] objects,
+    object numpy_dtype) -> "ArrayLike":
+    from pandas.core.dtypes.dtypes import BaseMaskedDtype
+
+    from pandas.core.construction import array as pd_array
+
+    dtype = BaseMaskedDtype.from_numpy_dtype(numpy_dtype)
+    return pd_array(objects, dtype=dtype)
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def _maybe_get_numpy_dtype(Seen seen, object scalar_type):
+    # NumPy scalar type
+    if issubclass(scalar_type, np.generic):
+        return np.dtype(scalar_type)
+    # Native Python type
+    elif seen.bool_:
+        return np.dtype(bool)
+    elif seen.uint_:
+        return np.dtype(np.uint)
+    elif seen.int_ or seen.sint_:
+        return np.dtype(int)
+    elif seen.float_:
+        return np.dtype(float)
+    else:
+        return None
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def _maybe_get_based_masked_scalar_numpy_dtype(
+    val_types,
+    seen,
+    convert_to_nullable_dtype):
+    # With no observed scalar type, or more than one,
+    # we cannot build a masked array
+    if len(val_types) != 1:
+        return None
+
+    numpy_dtype = _maybe_get_numpy_dtype(seen, val_types.pop())
+    if (
+        numpy_dtype is not None and numpy_dtype.kind in "biuf"
+        and convert_to_nullable_dtype):
+        return numpy_dtype
+    else:
+        return None
+
+
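The masked-array path above leans on BaseMaskedDtype.from_numpy_dtype — the classmethod the helper imports — to translate the single inferred NumPy dtype into its nullable counterpart. A rough illustration of that mapping, assuming a pandas build where this internal API is importable as in the patch:

    import numpy as np
    from pandas.core.dtypes.dtypes import BaseMaskedDtype

    # _convert_to_based_masked wraps the object values in the nullable
    # extension dtype that matches the inferred NumPy dtype
    for np_dtype in (np.dtype(bool), np.dtype(int), np.dtype(float)):
        print(np_dtype, "->", BaseMaskedDtype.from_numpy_dtype(np_dtype))
    # bool -> boolean, int64 -> Int64, float64 -> Float64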
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def maybe_convert_objects(ndarray[object] objects,
@@ -2534,7 +2608,9 @@
                           bint convert_numeric=True,  # NB: different default!
                           bint convert_to_nullable_dtype=False,
                           bint convert_non_numeric=False,
-                          object dtype_if_all_nat=None) -> "ArrayLike":
+                          object dtype_if_all_nat=None,
+                          str storage=None,
+                          object na_value=None) -> "ArrayLike":
     """
     Type inference function-- convert object array to proper dtype
@@ -2557,6 +2633,8 @@
         Whether to convert datetime, timedelta, period, interval types.
     dtype_if_all_nat : np.dtype, ExtensionDtype, or None, default None
         Dtype to cast to if we have all-NaT.
+    storage : {None, "python", "pyarrow", "pyarrow_numpy"}, default None
+        Backend storage to use for the returned array.
+    na_value : Any, default None
+        NA value to use when constructing a nullable or string result array.
 
     Returns
     -------
@@ -2592,9 +2670,14 @@
     uints = cnp.PyArray_EMPTY(1, objects.shape, cnp.NPY_UINT64, 0)
     bools = cnp.PyArray_EMPTY(1, objects.shape, cnp.NPY_UINT8, 0)
     mask = np.full(n, False)
+    val = None
+    val_types = set()
 
     for i in range(n):
         val = objects[i]
+        if not checknull(val):
+            val_types.add(type(val))
+
         if itemsize_max != -1:
             itemsize = get_itemsize(val)
             if itemsize > itemsize_max or itemsize == -1:
@@ -2728,6 +2811,17 @@
             seen.object_ = True
             break
 
+    if storage == "pyarrow":
+        return _convert_to_pyarrow(objects, mask, na_value)
+
+    based_masked_scalar_numpy_dtype = _maybe_get_based_masked_scalar_numpy_dtype(
+        val_types,
+        seen,
+        convert_to_nullable_dtype)
+
+    if based_masked_scalar_numpy_dtype is not None:
+        return _convert_to_based_masked(objects, based_masked_scalar_numpy_dtype)
+
     # we try to coerce datetime w/tz but must all have the same tz
     if seen.datetimetz_:
         if is_datetime_with_singletz_array(objects):
@@ -2791,6 +2885,12 @@
             dtype = StringDtype(na_value=np.nan)
             return dtype.construct_array_type()._from_sequence(objects, dtype=dtype)
 
+        elif storage == "python":
+            from pandas.core.arrays.string_ import StringDtype
+
+            dtype = StringDtype(storage=storage, na_value=na_value)
+            return dtype.construct_array_type()._from_sequence(objects, dtype=dtype)
+
         seen.object_ = True
     elif seen.interval_:
         if is_interval_array(objects):
@@ -2944,7 +3044,10 @@
     *,
     bint convert=True,
     object na_value=no_default,
-    cnp.dtype dtype=np.dtype(object)
+    bint convert_to_nullable_dtype=False,
+    bint convert_non_numeric=False,
+    cnp.dtype dtype=np.dtype(object),
+    str storage=None,
 ) -> "ArrayLike":
     """
     Substitute for np.vectorize with pandas-friendly dtype inference.
@@ -2960,8 +3063,12 @@
     na_value : Any, optional
         The result value to use for masked values. By default, the input
         value is used.
+    convert_to_nullable_dtype : bool, default False
+        Whether to convert numeric or boolean results to a nullable
+        masked extension array.
+    convert_non_numeric : bool, default False
+        Whether to convert datetime, timedelta, period, interval types.
     dtype : numpy.dtype
         The numpy dtype to use for the result ndarray.
+    storage : {None, "python", "pyarrow", "pyarrow_numpy"}, default None
+        Backend storage to use for the returned array.
 
     Returns
     -------
@@ -2997,7 +3104,13 @@
             PyArray_ITER_NEXT(result_it)
 
     if convert:
-        return maybe_convert_objects(result)
+        return maybe_convert_objects(
+            result,
+            convert_to_nullable_dtype=convert_to_nullable_dtype,
+            convert_non_numeric=convert_non_numeric,
+            storage=storage,
+            na_value=na_value,
+        )
     else:
         return result
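Series.map with na_action="ignore" funnels into the reworked map_infer_mask, and the test change further down (test_map_numeric_na_action) pins the resulting behavior. A short usage sketch, assuming pyarrow is installed and this patch is applied:

    import pandas as pd

    ser = pd.Series([32, 40, None], dtype="int64[pyarrow]")
    result = ser.map(lambda x: 42, na_action="ignore")

    # Previously the result came back as plain float64 with np.nan;
    # with storage="pyarrow" threaded through maybe_convert_objects,
    # the Arrow-backed dtype and pd.NA are preserved.
    print(result.dtype)  # int64[pyarrow]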
@@ -3005,7 +3118,15 @@
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def map_infer(
-    ndarray arr, object f, *, bint convert=True, bint ignore_na=False
+    ndarray arr,
+    object f,
+    *,
+    bint convert=True,
+    bint ignore_na=False,
+    const uint8_t[:] mask=None,
+    object na_value=None,
+    bint convert_to_nullable_dtype=False,
+    str storage=None,
 ) -> "ArrayLike":
     """
     Substitute for np.vectorize with pandas-friendly dtype inference.
@@ -3017,6 +3138,15 @@
     convert : bint
     ignore_na : bint
         If True, NA values will not have f applied
+    mask : ndarray, optional
+        uint8 ndarray flagging the positions that hold missing values;
+        for those positions `f` is applied to `na_value` instead of the
+        stored value.
+    na_value : Any, optional
+        The input value to use for masked values.
+    convert_to_nullable_dtype : bool, default False
+        Whether to convert to and return a nullable Boolean/Integer array
+        when the values contain only one such scalar type (plus NaN).
+    storage : {None, "python", "pyarrow", "pyarrow_numpy"}, default None
+        Backend storage to use for the returned array.
 
     Returns
     -------
@@ -3033,7 +3163,10 @@
         if ignore_na and checknull(arr[i]):
             result[i] = arr[i]
             continue
-        val = f(arr[i])
+        elif mask is not None and na_value is not None and mask[i]:
+            val = f(na_value)
+        else:
+            val = f(arr[i])
 
         if cnp.PyArray_IsZeroDim(val):
             # unbox 0-dim arrays, GH#690
@@ -3042,7 +3175,13 @@
         result[i] = val
 
     if convert:
-        return maybe_convert_objects(result)
+        return maybe_convert_objects(
+            result,
+            convert_to_nullable_dtype=convert_to_nullable_dtype,
+            convert_non_numeric=True,
+            storage=storage,
+            na_value=na_value,
+        )
     else:
         return result
diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx
index 390a527c22bbb..271712867baea 100644
--- a/pandas/_libs/missing.pyx
+++ b/pandas/_libs/missing.pyx
@@ -1,3 +1,4 @@
+from collections import UserDict
 from decimal import Decimal
 import numbers
 from sys import maxsize
@@ -148,6 +149,7 @@
     - np.timedelta64 representation of NaT
     - NA
     - Decimal("NaN")
+    - empty dict or UserDict
 
     Parameters
     ----------
@@ -157,7 +159,12 @@
     -------
     bool
     """
-    if val is None or val is NaT or val is C_NA:
+    if (
+        val is None
+        or val is NaT
+        or val is C_NA
+        or (isinstance(val, (dict, UserDict)) and not val)
+    ):
         return True
     elif util.is_float_object(val) or util.is_complex_object(val):
         if val != val:
@@ -191,6 +198,7 @@
     - np.timedelta64 representation of NaT
     - NA
     - Decimal("NaN")
+    - empty dict or UserDict
 
     Parameters
    ----------
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 56f8adda93251..07c67c048dd54 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -23,6 +23,7 @@
     iNaT,
     lib,
 )
+from pandas._libs.missing import NA
 from pandas._typing import (
     AnyArrayLike,
     ArrayLike,
@@ -98,6 +99,7 @@
         Series,
     )
     from pandas.core.arrays import (
+        ArrowExtensionArray,
         BaseMaskedArray,
         ExtensionArray,
     )
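A quick illustration of the widened checknull contract (an internal API; sketch assumes a build with this patch). Treating empty dict-likes as null is what lets extension arrays whose scalars are dict-like — such as the JSONArray used in the extension test suite — participate in NA handling during map():

    from collections import UserDict

    from pandas._libs import missing as libmissing

    print(libmissing.checknull({}))          # True
    print(libmissing.checknull(UserDict()))  # True
    print(libmissing.checknull({"a": 1}))    # False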
@@ -1663,15 +1665,19 @@
         # possibility that they are tuples
 
     # The return value of mapping with an empty mapper is
-    # expected to be pd.Series(np.nan, ...). As np.nan is
-    # of dtype float64 the return value of this method should
-    # be float64 as well
+    # expected to be pd.Series(np.nan, ...) or pd.Series(NA, ...).
+    # As np.nan is of dtype float64, the return value should be
+    # float64 in that case; in the NA case it should keep the
+    # dtype of the original data.
     from pandas import Series
 
+    dtype = None
     if len(mapper) == 0:
-        mapper = Series(mapper, dtype=np.float64)
-    else:
-        mapper = Series(mapper)
+        if hasattr(arr.dtype, "na_value") and arr.dtype.na_value is NA:
+            dtype = arr.dtype
+        else:
+            dtype = np.float64
+    mapper = Series(mapper, dtype=dtype)
 
     if isinstance(mapper, ABCSeries):
         if na_action == "ignore":
@@ -1687,9 +1693,73 @@
     if not len(arr):
         return arr.copy()
 
-    # we must convert to python types
-    values = arr.astype(object, copy=False)
+    mask, na_value, storage, values = _build_map_infer_methods_params(arr)
+
     if na_action is None:
-        return lib.map_infer(values, mapper)
+        return lib.map_infer(
+            values,
+            mapper,
+            mask=mask,
+            na_value=na_value,
+            convert_to_nullable_dtype=na_value is NA,
+            storage=storage,
+        )
+    else:
+        return lib.map_infer_mask(
+            values,
+            mapper,
+            mask=mask,
+            na_value=na_value,
+            convert_to_nullable_dtype=na_value is NA,
+            convert_non_numeric=True,
+            storage=storage,
+        )
+
+
+def _build_map_infer_methods_params(arr: ArrayLike):
+    """
+    Build the lib.map_infer / lib.map_infer_mask parameters for an array `arr`.
+
+    Parameters
+    ----------
+    arr : np.ndarray or ExtensionArray
+        The array whose values are about to be mapped.
+
+    Returns
+    -------
+    mask : np.ndarray[bool]
+        Boolean mask of the missing values in `arr`.
+    na_value : object
+        A value in `values` to consider missing.
+    storage : {None, "python", "pyarrow", "pyarrow_numpy"}, default None
+        Backend storage of `arr`, if any.
+    values : np.ndarray
+        Values to be processed by lib.map_infer and lib.map_infer_mask.
+    """
+    na_value = None
+    mask = isna(arr)
+    storage = None
+    if isinstance(arr.dtype, (BaseMaskedDtype, ExtensionDtype)) and hasattr(
+        arr, "_hasna"
+    ):
+        na_value = arr.dtype.na_value
+
+        if isinstance(arr.dtype, BaseMaskedDtype):
+            arr = cast("BaseMaskedArray", arr)
+            values = np.fromiter(arr._data, dtype="O")
+
+        elif isinstance(arr.dtype, ExtensionDtype):
+            arr = cast("ExtensionArray", arr)
+            if hasattr(arr.dtype, "storage"):
+                storage = arr.dtype.storage
+
+            if storage == "pyarrow":
+                arr = cast("ArrowExtensionArray", arr)
+                values = np.fromiter(arr._pa_array, dtype="O")
+            else:
+                values = np.asarray(arr)
     else:
-        return lib.map_infer_mask(values, mapper, mask=isna(values).view(np.uint8))
+        # we must convert to python types
+        values = arr.astype(object, copy=False)
+        na_value = np.nan
+    return mask, na_value, storage, values
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index e0c93db0afb07..e8af4e34b65bb 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -52,7 +52,6 @@
     ops,
     roperator,
 )
-from pandas.core.algorithms import map_array
 from pandas.core.arraylike import OpsMixin
 from pandas.core.arrays._arrow_string_mixins import ArrowStringArrayMixin
 from pandas.core.arrays._utils import to_numpy_dtype_inference
@@ -342,7 +341,13 @@
         elif pa.types.is_date(pa_type):
             from pandas.core.tools.datetimes import to_datetime
 
-            scalars = to_datetime(strings, errors="raise").date
+            if isinstance(strings, ExtensionArray) and isinstance(
+                strings.dtype, ArrowDtype
+            ):
+                strings = cast("ArrowExtensionArray", strings)
+                scalars = to_datetime(strings._pa_array, errors="raise").date
+            else:
+                scalars = to_datetime(strings, errors="raise").date
         elif pa.types.is_duration(pa_type):
             from pandas.core.tools.timedeltas import to_timedelta
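The empty-mapper branch in map_array above now keys off the array's na_value; this is exactly what the updated test_map_empty further down asserts. Sketched behavior, assuming this patch:

    import pandas as pd

    masked = pd.Series([1, 2], dtype="Int64")
    plain = pd.Series([1, 2], dtype="int64")

    # Mapping with an empty dict finds no matches, so every value becomes
    # missing: NA-native dtypes keep their dtype, while NumPy dtypes still
    # fall back to float64/np.nan.
    print(masked.map({}).dtype)  # Int64
    print(plain.map({}).dtype)   # float64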
Literal["ignore"] | None = None): - if is_numeric_dtype(self.dtype): - return map_array(self.to_numpy(), mapper, na_action=na_action) - else: - return super().map(mapper, na_action) + return super().map(mapper, na_action) @doc(ExtensionArray.duplicated) def duplicated( diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 4835d808f2433..8b27990cb1346 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -69,6 +69,7 @@ unique, ) from pandas.core.array_algos.quantile import quantile_with_mask +from pandas.core.construction import array as pd_array from pandas.core.missing import _fill_limit_area_1d from pandas.core.sorting import ( nargminmax, @@ -2558,7 +2559,21 @@ def map(self, mapper, na_action: Literal["ignore"] | None = None): If the function returns a tuple with more than one element a MultiIndex will be returned. """ - return map_array(self, mapper, na_action=na_action) + result = map_array(self, mapper, na_action=na_action) + if isinstance(result, np.ndarray): + # Get the scalar types + scalar_types = set(np.array([type(x) for x in result])) + + # if scalar values types are compatible with self dtype + # we use the self dtype + # For example if scalar types are dict and UserDict and self is a JSONArray, + # we use self.dtype + if all(issubclass(t, self.dtype.type) for t in scalar_types): + return pd_array(result, self.dtype) + else: + return pd_array(result, result.dtype) + else: + return result # ------------------------------------------------------------------------ # GroupBy Methods diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index f3a0cc0dccdb3..ff8a5a9fdff09 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1329,7 +1329,7 @@ def max(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): return self._wrap_reduction_result("max", result, skipna=skipna, axis=axis) def map(self, mapper, na_action: Literal["ignore"] | None = None): - return map_array(self.to_numpy(), mapper, na_action=na_action) + return map_array(self, mapper, na_action=na_action) @overload def any( diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index 0268194e64d50..6d0b8fc5cfd99 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -277,8 +277,15 @@ def _str_fullmatch( return self._str_map(f, na_value=na, dtype=np.dtype(bool)) def _str_encode(self, encoding, errors: str = "strict"): - f = lambda x: x.encode(encoding, errors=errors) - return self._str_map(f, dtype=object) + def encode_func(x): + if x is str: + return x.encode(encoding=encoding, errors=errors) + else: + # If x is a 'pyarrow.lib.LargeStringScalar' it has + # no attribute 'encode' so we cast it + return str(x).encode(encoding=encoding, errors=errors) + + return self._str_map(encode_func, dtype=object) def _str_find(self, sub, start: int = 0, end=None): return self._str_find_(sub, start, end, side="left") diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index da444b55490f0..76e5d33d90038 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -944,10 +944,11 @@ def test_maybe_convert_objects_nullable_boolean(self): out = lib.maybe_convert_objects(arr, convert_to_nullable_dtype=True) tm.assert_extension_array_equal(out, exp) - arr = np.array([True, False, pd.NaT], dtype=object) - exp = np.array([True, False, pd.NaT], dtype=object) + # pd.NaT are not supported in BooleanArray, but 
diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py
index da444b55490f0..76e5d33d90038 100644
--- a/pandas/tests/dtypes/test_inference.py
+++ b/pandas/tests/dtypes/test_inference.py
@@ -944,10 +944,11 @@
         out = lib.maybe_convert_objects(arr, convert_to_nullable_dtype=True)
         tm.assert_extension_array_equal(out, exp)
 
-        arr = np.array([True, False, pd.NaT], dtype=object)
-        exp = np.array([True, False, pd.NaT], dtype=object)
+        # pd.NaT is not supported in BooleanArray, but pd.NA is
+        arr = np.array([True, False, pd.NA], dtype=object)
+        exp = BooleanArray._from_sequence([True, False, pd.NA], dtype="boolean")
         out = lib.maybe_convert_objects(arr, convert_to_nullable_dtype=True)
-        tm.assert_numpy_array_equal(out, exp)
+        tm.assert_extension_array_equal(out, exp)
 
     @pytest.mark.parametrize("val", [None, np.nan])
     def test_maybe_convert_objects_nullable_boolean_na(self, val):
diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py
index fd9fec0cb490c..b76c24ba75be4 100644
--- a/pandas/tests/extension/base/methods.py
+++ b/pandas/tests/extension/base/methods.py
@@ -100,8 +100,8 @@
     @pytest.mark.parametrize("na_action", [None, "ignore"])
     def test_map(self, data_missing, na_action):
         result = data_missing.map(lambda x: x, na_action=na_action)
-        expected = data_missing.to_numpy()
-        tm.assert_numpy_array_equal(result, expected)
+        expected = data_missing
+        tm.assert_extension_array_equal(result, expected)
 
     def test_argsort(self, data_for_sorting):
         result = pd.Series(data_for_sorting).argsort()
diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
index 9defb97394635..daf00a545fb87 100644
--- a/pandas/tests/extension/test_arrow.py
+++ b/pandas/tests/extension/test_arrow.py
@@ -280,16 +280,12 @@
     def test_map(self, data_missing, na_action):
         if data_missing.dtype.kind in "mM":
             result = data_missing.map(lambda x: x, na_action=na_action)
-            expected = data_missing.to_numpy(dtype=object)
-            tm.assert_numpy_array_equal(result, expected)
+            expected = data_missing
+            tm.assert_extension_array_equal(result, expected)
         else:
             result = data_missing.map(lambda x: x, na_action=na_action)
-            if data_missing.dtype == "float32[pyarrow]":
-                # map roundtrips through objects, which converts to float64
-                expected = data_missing.to_numpy(dtype="float64", na_value=np.nan)
-            else:
-                expected = data_missing.to_numpy()
-            tm.assert_numpy_array_equal(result, expected)
+            expected = data_missing
+            tm.assert_extension_array_equal(result, expected)
 
     def test_astype_str(self, data, request, using_infer_string):
         pa_dtype = data.dtype.pyarrow_dtype
@@ -3505,5 +3501,5 @@
 def test_map_numeric_na_action():
     ser = pd.Series([32, 40, None], dtype="int64[pyarrow]")
     result = ser.map(lambda x: 42, na_action="ignore")
-    expected = pd.Series([42.0, 42.0, np.nan], dtype="float64")
+    expected = pd.Series([42.0, 42.0, np.nan], dtype=ser.dtype)
     tm.assert_series_equal(result, expected)
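The updated inference test corresponds to this observable behavior of the internal converter (sketch, assuming this patch):

    import numpy as np
    import pandas as pd
    from pandas._libs import lib

    arr = np.array([True, False, pd.NA], dtype=object)
    out = lib.maybe_convert_objects(arr, convert_to_nullable_dtype=True)

    # pd.NA no longer forces a fallback to an object ndarray: the values
    # come back as a BooleanArray with a real mask.
    print(out.dtype)  # boolean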
diff --git a/pandas/tests/extension/test_masked.py b/pandas/tests/extension/test_masked.py
index 3b9079d06e231..672c1ec7a194d 100644
--- a/pandas/tests/extension/test_masked.py
+++ b/pandas/tests/extension/test_masked.py
@@ -171,21 +171,17 @@
     @pytest.mark.parametrize("na_action", [None, "ignore"])
     def test_map(self, data_missing, na_action):
         result = data_missing.map(lambda x: x, na_action=na_action)
-        if data_missing.dtype == Float32Dtype():
-            # map roundtrips through objects, which converts to float64
-            expected = data_missing.to_numpy(dtype="float64", na_value=np.nan)
-        else:
-            expected = data_missing.to_numpy()
-        tm.assert_numpy_array_equal(result, expected)
+        expected = data_missing
+        tm.assert_extension_array_equal(result, expected)
 
     def test_map_na_action_ignore(self, data_missing_for_sorting):
         zero = data_missing_for_sorting[2]
+        na_value = data_missing_for_sorting.dtype.na_value
         result = data_missing_for_sorting.map(lambda x: zero, na_action="ignore")
-        if data_missing_for_sorting.dtype.kind == "b":
-            expected = np.array([False, pd.NA, False], dtype=object)
-        else:
-            expected = np.array([zero, np.nan, zero])
-        tm.assert_numpy_array_equal(result, expected)
+        expected = type(data_missing_for_sorting)._from_sequence(
+            [zero, na_value, zero], dtype=data_missing_for_sorting.dtype
+        )
+        tm.assert_extension_array_equal(result, expected)
 
     def _get_expected_exception(self, op_name, obj, other):
         try:
diff --git a/pandas/tests/reshape/merge/test_multi.py b/pandas/tests/reshape/merge/test_multi.py
index 7ae2fffa04205..45d36ae886d96 100644
--- a/pandas/tests/reshape/merge/test_multi.py
+++ b/pandas/tests/reshape/merge/test_multi.py
@@ -98,8 +98,14 @@
         with option_context("future.infer_string", infer_string):
             icols = ["1st", "2nd", "3rd"]
 
+            def ord_func(x):
+                if infer_string:
+                    # ord(x) raises a TypeError if x is a
+                    # pyarrow.lib.LargeStringScalar, so cast to str first
+                    return ord(str(x))
+                return ord(x)
+
             def bind_cols(df):
-                iord = lambda a: 0 if a != a else ord(a)
+                iord = lambda a: 0 if a != a else ord_func(a)
                 f = lambda ts: ts.map(iord) - ord("a")
                 return f(df["1st"]) + f(df["3rd"]) * 1e2 + df["2nd"].fillna(0) * 10
 
diff --git a/pandas/tests/series/methods/test_map.py b/pandas/tests/series/methods/test_map.py
index 84b60a2afe6eb..cff34714e5921 100644
--- a/pandas/tests/series/methods/test_map.py
+++ b/pandas/tests/series/methods/test_map.py
@@ -231,8 +231,16 @@
     s = Series(index)
     result = s.map({})
-
-    expected = Series(np.nan, index=s.index)
+    # If the dtype's na_value is pd.NA, the original dtype is kept
+    if hasattr(s.dtype, "na_value") and s.dtype.na_value is pd.NA:
+        na_value = s.dtype.na_value
+        dtype = s.dtype
+    # Otherwise the dtype is always float64
+    else:
+        na_value = np.nan
+        dtype = "float64"
+
+    expected = Series(na_value, index=s.index, dtype=dtype)
     tm.assert_series_equal(result, expected)
 
@@ -257,6 +265,45 @@
     assert not isna(merged["c"])
 
 
+@pytest.mark.parametrize(
+    "ser",
+    [
+        Series([pd.NA, 11], dtype="Int64"),
+        Series([pd.NA, 11.0], dtype="Float64"),
+        Series([pd.NA, True], dtype="boolean"),
+    ],
+)
+def test_map_with_pd_na_input(ser):
+    func_return_values_only = (
+        lambda x: ser.dtype.type(1) if x is pd.NA else ser.dtype.type(2 * x)
+    )
+    result = ser.map(func_return_values_only)
+    expected = Series(
+        [func_return_values_only(ser[0]), func_return_values_only(ser[1])],
+        dtype=ser.dtype,
+    )
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "ser",
+    [
+        Series([pd.NA, 11], dtype="Int64"),
+        Series([pd.NA, 11.0], dtype="Float64"),
+        Series([pd.NA, True], dtype="boolean"),
+        Series([pd.NA, "AAA"], dtype="string"),
+    ],
+)
+def test_map_with_pd_na_output(ser):
+    func_return_value_and_na = lambda x: x if x is pd.NA else ser.dtype.type(2 * x)
+    result = ser.map(func_return_value_and_na)
+    expected = Series(
+        [func_return_value_and_na(ser[0]), func_return_value_and_na(ser[1])],
+        dtype=ser.dtype,
+    )
+    tm.assert_series_equal(result, expected)
+
+
 def test_map_type_inference():
     s = Series(range(3))
     s2 = s.map(lambda x: np.where(x == 0, 0, 1))
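The new parametrized tests reduce to this round-trip guarantee (sketch, assuming this patch):

    import pandas as pd

    ser = pd.Series([pd.NA, "AAA"], dtype="string")
    result = ser.map(lambda x: x if x is pd.NA else 2 * x)

    # Mapped values stay in the nullable string dtype, and pd.NA is
    # passed through instead of collapsing to np.nan.
    print(result.tolist())  # [<NA>, 'AAAAAA']
    print(result.dtype)     # string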
diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py
index 75a2007b61640..3eff952caffa1 100644
--- a/pandas/tests/strings/test_strings.py
+++ b/pandas/tests/strings/test_strings.py
@@ -579,7 +579,16 @@ def test_encode_errors_kwarg(any_string_dtype):
         ser.str.encode("cp1252")
 
     result = ser.str.encode("cp1252", "ignore")
-    expected = ser.map(lambda x: x.encode("cp1252", "ignore"))
+
+    def encode_func(x):
+        if isinstance(x, str):
+            return x.encode("cp1252", "ignore")
+        else:
+            # If x is e.g. a pyarrow.lib.LargeStringScalar it has no
+            # 'encode' attribute, so we cast it to str first
+            return str(x).encode("cp1252", "ignore")
+
+    expected = ser.map(encode_func).astype("object")
     tm.assert_series_equal(result, expected)
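For completeness, the behavior the adjusted encoding test pins down (sketch, assuming this patch; "\x9d" has no cp1252 mapping, so errors="ignore" simply drops it):

    import pandas as pd

    ser = pd.Series(["a", "b", "a\x9d"])
    result = ser.str.encode("cp1252", "ignore")
    print(result[2])  # b'a'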