diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 99a6be03c84d3..1758305a29ee4 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1217,10 +1217,11 @@ Other - Bug in printing a :class:`DataFrame` with a :class:`DataFrame` stored in :attr:`DataFrame.attrs` raised a ``ValueError`` (:issue:`60455`) - Bug in printing a :class:`Series` with a :class:`DataFrame` stored in :attr:`Series.attrs` raised a ``ValueError`` (:issue:`60568`) - Deprecated the keyword ``check_datetimelike_compat`` in :meth:`testing.assert_frame_equal` and :meth:`testing.assert_series_equal` (:issue:`55638`) +- Fixed bug in :meth:`Series.replace` and :meth:`DataFrame.replace` when trying to replace :class:`NA` values in a :class:`Float64Dtype` object with ``np.nan``; this now works with ``pd.set_option("mode.nan_is_na", False)`` and is irrelevant otherwise (:issue:`55127`) +- Fixed bug in :meth:`Series.replace` and :meth:`DataFrame.replace` when trying to replace ``np.nan`` values in an :class:`Int64Dtype` object with :class:`NA`; this is now a no-op with ``pd.set_option("mode.nan_is_na", False)`` and is irrelevant otherwise (:issue:`51237`) - Fixed bug in the :meth:`Series.rank` with object dtype and extremely small float values (:issue:`62036`) - Fixed bug where the :class:`DataFrame` constructor misclassified array-like objects with a ``.name`` attribute as :class:`Series` or :class:`Index` (:issue:`61443`) - Fixed regression in :meth:`DataFrame.from_records` not initializing subclasses properly (:issue:`57008`) -- .. 
***DO NOT USE THIS SECTION*** diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 25d7c792e5810..d20dc87259a37 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -312,7 +312,9 @@ def __setitem__(self, key, value) -> None: key = check_array_indexer(self, key) if is_scalar(value): - if is_valid_na_for_dtype(value, self.dtype): + if is_valid_na_for_dtype(value, self.dtype) and not ( + lib.is_float(value) and not is_nan_na() + ): self._mask[key] = True else: value = self._validate_setitem_value(value) diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 141ec5df76792..338dc4a0b8a08 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -15,6 +15,8 @@ import numpy as np +from pandas._config import is_nan_na + from pandas._libs import ( NaT, algos, @@ -37,7 +39,11 @@ is_object_dtype, needs_i8_conversion, ) -from pandas.core.dtypes.dtypes import DatetimeTZDtype +from pandas.core.dtypes.dtypes import ( + ArrowDtype, + BaseMaskedDtype, + DatetimeTZDtype, +) from pandas.core.dtypes.missing import ( is_valid_na_for_dtype, isna, @@ -86,6 +92,31 @@ def mask_missing(arr: ArrayLike, value) -> npt.NDArray[np.bool_]: """ dtype, value = infer_dtype_from(value) + if ( + isinstance(arr.dtype, (BaseMaskedDtype, ArrowDtype)) + and lib.is_float(value) + and np.isnan(value) + and not is_nan_na() + ): + # TODO: this should be done in an EA method? 
+ if arr.dtype.kind == "f": + # GH#55127 + if isinstance(arr.dtype, BaseMaskedDtype): + # error: "ExtensionArray" has no attribute "_data" [attr-defined] + mask = np.isnan(arr._data) & ~arr.isna() # type: ignore[attr-defined,operator] + return mask + else: + # error: "ExtensionArray" has no attribute "_pa_array" [attr-defined] + import pyarrow.compute as pc + + mask = pc.is_nan(arr._pa_array).fill_null(False).to_numpy() # type: ignore[attr-defined] + return mask + + elif arr.dtype.kind in "iu": + # GH#51237 + mask = np.zeros(arr.shape, dtype=bool) + return mask + if isna(value): return isna(arr) diff --git a/pandas/tests/arrays/masked/test_indexing.py b/pandas/tests/arrays/masked/test_indexing.py index 753d562c87ffa..81a3cef0fe716 100644 --- a/pandas/tests/arrays/masked/test_indexing.py +++ b/pandas/tests/arrays/masked/test_indexing.py @@ -3,6 +3,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd @@ -58,3 +60,47 @@ def test_setitem_validation_scalar_int(self, invalid, any_int_ea_dtype): def test_setitem_validation_scalar_float(self, invalid, float_ea_dtype): arr = pd.array([1, 2, None], dtype=float_ea_dtype) self._check_setitem_invalid(arr, invalid) + + +@pytest.mark.parametrize( + "dtype", + [ + "Float64", + pytest.param("float64[pyarrow]", marks=td.skip_if_no("pyarrow")), + ], +) +@pytest.mark.parametrize("indexer", [1, [1], [False, True, False]]) +def test_setitem_nan_in_float64_array(dtype, indexer, using_nan_is_na): + arr = pd.array([0, pd.NA, 1], dtype=dtype) + + arr[indexer] = np.nan + if not using_nan_is_na: + assert np.isnan(arr[1]) + else: + assert arr[1] is pd.NA + + +@pytest.mark.parametrize( + "dtype", + [ + "Int64", + pytest.param("int64[pyarrow]", marks=td.skip_if_no("pyarrow")), + ], +) +@pytest.mark.parametrize("indexer", [1, [1], [False, True, False]]) +def test_setitem_nan_in_int64_array(dtype, indexer, using_nan_is_na): + arr = pd.array([0, 1, 2], dtype=dtype) + if not using_nan_is_na: + err = 
TypeError + msg = "Invalid value 'nan' for dtype 'Int64'" + if dtype == "int64[pyarrow]": + import pyarrow as pa + + err = pa.lib.ArrowInvalid + msg = "Could not convert nan with type float" + with pytest.raises(err, match=msg): + arr[indexer] = np.nan + assert arr[1] == 1 + else: + arr[indexer] = np.nan + assert arr[1] is pd.NA diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 41f72d17ebef7..ff1113e4d5f95 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -6,6 +6,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( DataFrame, @@ -1430,6 +1432,49 @@ def test_replace_with_nil_na(self): result = ser.replace("nil", "anything else") tm.assert_frame_equal(expected, result) + @pytest.mark.parametrize( + "dtype", + [ + "Float64", + pytest.param("float64[pyarrow]", marks=td.skip_if_no("pyarrow")), + ], + ) + def test_replace_na_to_nan_nullable_floats(self, dtype, using_nan_is_na): + # GH#55127 + df = DataFrame({0: [1, np.nan, 1], 1: Series([0, pd.NA, 1], dtype=dtype)}) + + result = df.replace(pd.NA, np.nan) + + if using_nan_is_na: + expected = result + else: + expected = DataFrame( + {0: [1, np.nan, 1], 1: Series([0, np.nan, 1], dtype=dtype)} + ) + assert np.isnan(expected.loc[1, 1]) + + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "dtype", + [ + "Int64", + pytest.param("int64[pyarrow]", marks=td.skip_if_no("pyarrow")), + ], + ) + def test_replace_nan_nullable_ints(self, dtype, using_nan_is_na): + # GH#51237 with nan_is_na=False, replacing NaN should be a no-op here + ser = Series([1, 2, None], dtype=dtype) + + result = ser.replace(np.nan, -1) + + if using_nan_is_na: + # np.nan is equivalent to pd.NA here + expected = Series([1, 2, -1], dtype=dtype) + else: + expected = ser + tm.assert_series_equal(result, expected) + class TestDataFrameReplaceRegex: 
@pytest.mark.parametrize( diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index e36baba5e0108..ef034e62bb764 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -246,7 +246,7 @@ def test_convert_dtypes( with pytest.raises(TypeError, match="Invalid value"): result[result.notna()] = np.nan else: - result[result.notna()] = np.nan + result[result.notna()] = pd.NA # Make sure original not changed tm.assert_series_equal(series, copy)