Skip to content
Open
Show file tree
Hide file tree
Changes from 51 commits
Commits
Show all changes
52 commits
Select commit Hold shift + click to select a range
5e88fde
BUG: read_csv with engine=pyarrow and numpy-nullable dtype
jbrockmendel Aug 6, 2025
eae6f64
mypy fixup, error message compat for 32bit builds
jbrockmendel Aug 6, 2025
2861b16
minimum version compat
jbrockmendel Aug 6, 2025
5369afa
not-infer-string compat
jbrockmendel Aug 6, 2025
db35a9c
mypy fixup
jbrockmendel Aug 6, 2025
505bfb6
update usage
jbrockmendel Aug 11, 2025
febe83c
CLN: remove redundant check
jbrockmendel Aug 11, 2025
c81cbec
Use Matts idea
jbrockmendel Aug 11, 2025
26a3049
re-xfail
jbrockmendel Aug 12, 2025
a70b429
API: rank with nullable dtypes preserve NA
jbrockmendel Aug 4, 2025
99a71b7
API: improve dtype in df.where with EA other
jbrockmendel Aug 3, 2025
c86747d
GH refs
jbrockmendel Aug 3, 2025
9d222d8
doc fixup
jbrockmendel Aug 3, 2025
6f800b3
BUG: Decimal(NaN) incorrectly allowed in ArrowEA constructor with tim…
jbrockmendel Jul 3, 2025
514a56f
GH ref
jbrockmendel Jul 3, 2025
fca3c7c
BUG: ArrowEA constructor with timestamp type
jbrockmendel Jul 4, 2025
f20758a
POC: consistent NaN treatment for pyarrow dtypes
jbrockmendel Jun 28, 2025
cc416fa
comment
jbrockmendel Jun 28, 2025
7094d85
Down to 40 failing tests
jbrockmendel Jul 5, 2025
eeb0d32
Fix rank, json tests
jbrockmendel Jul 6, 2025
814d001
CLN: remove outdated
jbrockmendel Jul 6, 2025
5db5e4b
Fix where kludge
jbrockmendel Jul 6, 2025
87536a7
update tests
jbrockmendel Jul 6, 2025
64f4271
Fix remaining tests
jbrockmendel Jul 6, 2025
26d1177
mypy fixup
jbrockmendel Jul 7, 2025
bcb2506
old-numpy compat
jbrockmendel Jul 7, 2025
8f99d05
simplify
jbrockmendel Jul 7, 2025
5abd585
Better option name, fixture
jbrockmendel Jul 31, 2025
70830f7
default True
jbrockmendel Jul 31, 2025
58b3c4f
Patch ops
jbrockmendel Jul 31, 2025
cd7ec33
mypy fixup
jbrockmendel Jul 31, 2025
cf7b229
Test for setitem/construction
jbrockmendel Jul 31, 2025
eb12ea1
update ufunc test
jbrockmendel Jul 31, 2025
f0262ef
Improve rank test skips
jbrockmendel Jul 31, 2025
544faf1
ENH: mode.nan_is_na for numpy-nullable dtypes
jbrockmendel Aug 4, 2025
6c4b68f
update style test
jbrockmendel Aug 4, 2025
90d3a28
update asvs, mypy ignores
jbrockmendel Aug 4, 2025
408aa06
pre-commit fixup
jbrockmendel Aug 4, 2025
9e5ebec
doc fixup
jbrockmendel Aug 4, 2025
0fd2e2d
Remove special-casing
jbrockmendel Aug 4, 2025
7de9f40
comment
jbrockmendel Aug 4, 2025
2f61a58
ruff format
jbrockmendel Aug 5, 2025
36143ad
Set default to True
jbrockmendel Aug 6, 2025
b7ea9ae
whatsnew
jbrockmendel Aug 12, 2025
a625190
Merge branch 'main' into api-nan-vs-na
jbrockmendel Aug 20, 2025
d471aa8
update _cast_pointwise_result
jbrockmendel Aug 20, 2025
27cd097
update cast_pointwise_result
jbrockmendel Aug 20, 2025
1bb0a4e
Merge branch 'main' into api-nan-vs-na
jbrockmendel Aug 20, 2025
7cc3b41
Merge branch 'main' into api-nan-vs-na
jbrockmendel Aug 23, 2025
5f76e19
Merge branch 'main' into api-nan-vs-na
jbrockmendel Aug 26, 2025
b2a64bb
remove unnecessary import
jbrockmendel Aug 26, 2025
1024ac5
Merge branch 'main' into api-nan-vs-na
jbrockmendel Sep 2, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions asv_bench/benchmarks/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,8 +199,8 @@ class SortIntegerArray:
params = [10**3, 10**5]

def setup(self, N):
data = np.arange(N, dtype=float)
data[40] = np.nan
data = np.arange(N, dtype=float).astype(object)
data[40] = pd.NA
self.array = pd.array(data, dtype="Int64")

def time_argsort(self, N):
Expand Down
3 changes: 3 additions & 0 deletions asv_bench/benchmarks/frame_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import numpy as np

from pandas import (
NA,
DataFrame,
Index,
MultiIndex,
Expand Down Expand Up @@ -445,6 +446,8 @@ def setup(self, inplace, dtype):
values[::2] = np.nan
if dtype == "Int64":
values = values.round()
values = values.astype(object)
values[::2] = NA
self.df = DataFrame(values, dtype=dtype)
self.fill_values = self.df.iloc[self.df.first_valid_index()].to_dict()

Expand Down
4 changes: 4 additions & 0 deletions asv_bench/benchmarks/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -689,6 +689,10 @@ def setup(self, dtype, method, with_nans):
null_vals = vals.astype(float, copy=True)
null_vals[::2, :] = np.nan
null_vals[::3, :] = np.nan
if dtype in ["Int64", "Float64"]:
null_vals = null_vals.astype(object)
null_vals[::2, :] = NA
null_vals[::3, :] = NA
df = DataFrame(null_vals, columns=list("abcde"), dtype=dtype)
df["key"] = keys
self.df = df
Expand Down
2 changes: 1 addition & 1 deletion doc/source/user_guide/text.rst
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ or convert from existing pandas data:

.. ipython:: python

s1 = pd.Series([1, 2, np.nan], dtype="Int64")
s1 = pd.Series([1, 2, pd.NA], dtype="Int64")
s1
s2 = s1.astype("string")
s2
Expand Down
4 changes: 2 additions & 2 deletions doc/source/whatsnew/v0.24.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ marker of ``np.nan`` will infer to integer dtype. The display of the ``Series``

.. ipython:: python

s = pd.Series([1, 2, np.nan], dtype='Int64')
s = pd.Series([1, 2, pd.NA], dtype='Int64')
s


Expand Down Expand Up @@ -166,7 +166,7 @@ See the :ref:`dtypes docs <basics.dtypes>` for more on extension arrays.

.. ipython:: python

pd.array([1, 2, np.nan], dtype='Int64')
pd.array([1, 2, pd.NA], dtype='Int64')
pd.array(['a', 'b', 'c'], dtype='category')

Passing data for which there isn't dedicated extension type (e.g. float, integer, etc.)
Expand Down
49 changes: 49 additions & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -463,6 +463,55 @@ small behavior differences as collateral:
- Adding or subtracting a :class:`Day` with a :class:`Timedelta` is no longer supported.
- Adding or subtracting a :class:`Day` offset to a timezone-aware :class:`Timestamp` or datetime-like may lead to an ambiguous or non-existent time, which will raise.

.. _whatsnew_300.api_breaking.nan_vs_na:

Changed treatment of NaN values in pyarrow and numpy-nullable floating dtypes
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Previously, when dealing with a nullable dtype (e.g. ``Float64Dtype`` or ``int64[pyarrow]``), ``NaN`` was treated as interchangeable with :class:`NA` in some circumstances but not others. This was done to make adoption easier, but caused some confusion (:issue:`32265`). In 3.0, an option ``"mode.nan_is_na"`` (default ``True``) controls whether to treat ``NaN`` as equivalent to :class:`NA`.

With ``pd.set_option("mode.nan_is_na", True)`` (again, this is the default), ``NaN`` can be passed to constructors, ``__setitem__``, ``__contains__`` and be treated the same as :class:`NA`. The only change users will see is that arithmetic and ``np.ufunc`` operations that previously introduced ``NaN`` entries produce :class:`NA` entries instead:

*Old behavior:*

.. code-block:: ipython

In [2]: ser = pd.Series([0, None], dtype=pd.Float64Dtype())
In [3]: ser / 0
Out[3]:
0 NaN
1 <NA>
dtype: Float64

*New behavior:*

.. ipython:: python

ser = pd.Series([0, None], dtype=pd.Float64Dtype())
ser / 0

By contrast, with ``pd.set_option("mode.nan_is_na", False)``, ``NaN`` is always treated as a distinct floating-point value rather than as a stand-in for :class:`NA`, and therefore cannot be used with integer dtypes:

*Old behavior:*

.. code-block:: ipython

In [2]: ser = pd.Series([1, np.nan], dtype=pd.Float64Dtype())
In [3]: ser[1]
Out[3]: <NA>

*New behavior:*

.. ipython:: python

pd.set_option("mode.nan_is_na", False)
ser = pd.Series([1, np.nan], dtype=pd.Float64Dtype())
ser[1]

If we had passed ``pd.Int64Dtype()`` or ``"int64[pyarrow]"`` for the dtype in the latter example, this would raise, as a float ``NaN`` cannot be held by an integer dtype.

With ``"mode.nan_is_na"`` set to ``False``, ``ser.to_numpy()`` (and ``frame.values`` and ``np.asarray(obj)``) will convert to ``object`` dtype if :class:`NA` entries are present, where before they would coerce to ``NaN``. To retain a float numpy dtype, explicitly pass ``na_value=np.nan`` to :meth:`Series.to_numpy`.

.. _whatsnew_300.api_breaking.deps:

Increased minimum version for Python
Expand Down
5 changes: 5 additions & 0 deletions pandas/_config/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,8 @@
def using_string_dtype() -> bool:
    """Return whether the ``future.infer_string`` option is enabled."""
    return _global_config["future"]["infer_string"]


def is_nan_na() -> bool:
    """Return whether the ``mode.nan_is_na`` option is enabled."""
    return _global_config["mode"]["nan_is_na"]
1 change: 1 addition & 0 deletions pandas/_libs/missing.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,4 @@ def isneginf_scalar(val: object) -> bool: ...
def checknull(val: object) -> bool: ...
def isnaobj(arr: np.ndarray) -> npt.NDArray[np.bool_]: ...
def is_numeric_na(values: np.ndarray) -> npt.NDArray[np.bool_]: ...
def is_pdna_or_none(values: np.ndarray) -> npt.NDArray[np.bool_]: ...
18 changes: 18 additions & 0 deletions pandas/_libs/missing.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,24 @@ cdef bint checknull_with_nat_and_na(object obj):
return checknull_with_nat(obj) or obj is C_NA


@cython.wraparound(False)
@cython.boundscheck(False)
def is_pdna_or_none(values: ndarray) -> ndarray:
    """
    Build a boolean mask of entries that are exactly ``pd.NA`` or ``None``.

    Parameters
    ----------
    values : ndarray
        Object-dtype array to scan (one-dimensional; indexed by a single
        ``Py_ssize_t`` position).

    Returns
    -------
    ndarray
        Boolean array of the same length, True where the entry is
        ``None`` or the NA singleton.

    Notes
    -----
    Unlike ``checknull``, float ``NaN`` and ``NaT`` are deliberately NOT
    treated as missing here — only identity matches against ``None`` and
    the ``C_NA`` singleton count.
    """
    cdef:
        ndarray[uint8_t] result
        Py_ssize_t i, N
        object val

    N = len(values)
    # uint8 buffer because Cython cannot type a bool ndarray directly;
    # reinterpreted as bool on return.
    result = np.zeros(N, dtype=np.uint8)

    for i in range(N):
        val = values[i]
        # Identity checks: NA is a singleton, so `is` is both correct and fast.
        if val is None or val is C_NA:
            result[i] = True
    return result.view(bool)


@cython.wraparound(False)
@cython.boundscheck(False)
def is_numeric_na(values: ndarray) -> ndarray:
Expand Down
5 changes: 3 additions & 2 deletions pandas/_libs/parsers.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ from csv import (
)
import warnings

from pandas._config import is_nan_na

from pandas.util._exceptions import find_stack_level

from pandas import StringDtype
Expand Down Expand Up @@ -43,7 +45,6 @@ from libc.string cimport (
strncpy,
)


import numpy as np

cimport numpy as cnp
Expand Down Expand Up @@ -1461,7 +1462,7 @@ def _maybe_upcast(
if isinstance(arr, IntegerArray) and arr.isna().all():
# use null instead of int64 in pyarrow
arr = arr.to_numpy(na_value=None)
arr = ArrowExtensionArray(pa.array(arr, from_pandas=True))
arr = ArrowExtensionArray(pa.array(arr, from_pandas=is_nan_na()))

return arr

Expand Down
7 changes: 7 additions & 0 deletions pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -2116,3 +2116,10 @@ def temp_file(tmp_path):
def monkeysession():
with pytest.MonkeyPatch.context() as mp:
yield mp


@pytest.fixture(params=[True, False])
def using_nan_is_na(request):
    # Parametrized fixture: each consuming test runs twice, once with the
    # "mode.nan_is_na" option enabled and once disabled.  The option is set
    # only for the duration of the test via option_context, and the active
    # value is yielded so tests can branch on it.
    opt = request.param
    with pd.option_context("mode.nan_is_na", opt):
        yield opt
14 changes: 12 additions & 2 deletions pandas/core/arrays/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,10 @@

import numpy as np

from pandas._config import is_nan_na

from pandas._libs import lib
from pandas._libs.missing import NA
from pandas.errors import LossySetitemError

from pandas.core.dtypes.cast import np_can_hold_element
Expand All @@ -21,7 +24,10 @@


def to_numpy_dtype_inference(
arr: ArrayLike, dtype: npt.DTypeLike | None, na_value, hasna: bool
arr: ArrayLike,
dtype: npt.DTypeLike | None,
na_value,
hasna: bool,
) -> tuple[npt.DTypeLike, Any]:
if dtype is None and is_numeric_dtype(arr.dtype):
dtype_given = False
Expand All @@ -34,7 +40,11 @@ def to_numpy_dtype_inference(
else:
dtype = arr.dtype.numpy_dtype # type: ignore[union-attr]
if na_value is lib.no_default:
na_value = np.nan
if not is_nan_na():
na_value = NA
dtype = np.dtype(object)
else:
na_value = np.nan
else:
dtype = arr.dtype.numpy_dtype # type: ignore[union-attr]
elif dtype is not None:
Expand Down
Loading
Loading